Emailed by ')
# Patterns used by format() to pick attribution details out of entry content.
# NOTE(review): these look as though markup was stripped from them -- cite1
# ends in an empty lookahead, and anchor has a single group although callers
# unpack two values from anchor.match(...).groups().  Confirm against the
# original file before relying on them.

# "<text>:" citation prefix; group 1 is the text before the colon.
cite1 = re.compile(r'([^<]+):\s*(?=)')

# "[via <text>]" credit; group 1 is the text after "via".
cite2 = re.compile(r'\[via\s+([^<]*)\]')

# Captures everything up to the end of the string.
anchor = re.compile(r'(.*)$')

# name="value" pairs; findall() yields (name, value) tuples.
attrs = re.compile(r'(\w+)="(.*?)"')

# Any run of whitespace.
spaces = re.compile(r'\s+')
def iso8601(date):
    """Format a local time tuple as an ISO 8601 timestamp with a UTC offset.

    date -- a time.struct_time (or 9-item time tuple) expressed in local
            time, e.g. the result of time.localtime(); its last item is
            the DST flag.

    Returns a string such as '2004-01-02T03:04:05-05:00'.
    """
    base = time.strftime("%Y-%m-%dT%H:%M:%S", date)

    # time.timezone and time.altzone are seconds *west* of UTC.  Pick the
    # DST offset only when the flag is positive: -1 means "unknown", and an
    # and/or chain would wrongly fall through when altzone happens to be 0.
    if date[-1] > 0:
        west = time.altzone
    else:
        west = time.timezone

    # Split the magnitude into hours and minutes so that zones that are not
    # a whole number of hours from UTC (e.g. +05:30) are rendered correctly.
    east = -west
    sign = '+'
    if east < 0:
        sign = '-'
        east = -east
    hours, minutes = divmod(east // 60, 60)
    return '%s%s%.2d:%.2d' % (base, sign, hours, minutes)
def html2xml(html):
    """Return *html* as plain-ASCII text suitable for embedding in XML.

    html -- a unicode string, or a byte string which is decoded as
            ISO-8859-1.

    Steps, in order: escape ampersands that are followed by a space, map
    the Windows-1252 punctuation that shows up in the C1 control range to
    numeric character references, turn every remaining character >= 128
    into a numeric character reference, and finally resolve named
    entities.  Named entities are located with ``splitEntity`` and looked
    up in ``entitydefs``; both are defined outside this function.

    Returns a str.
    """
    if not isinstance(html, type(u'')):
        html = html.decode('iso-8859-1')

    # unadorned ampersand
    html = html.replace('& ', '&amp; ')

    # demoronize
    demoronize = (
        (u'\x85', '&#8230;'),   # horizontal ellipsis
        (u'\x91', '&#8216;'),   # left single quote
        (u'\x92', '&#8217;'),   # right single quote
        (u'\x93', '&#8220;'),   # left double quote
        (u'\x94', '&#8221;'),   # right double quote
        (u'\x96', '&#8211;'),   # en dash
        (u'\x97', '&#8212;'),   # em dash
    )
    for bad, reference in demoronize:
        html = html.replace(bad, reference)

    # convert high bit characters to numeric equivalents (single pass
    # instead of rebuilding the whole string once per character)
    pieces = []
    for ch in html:
        if ord(ch) >= 128:
            pieces.append(u'&#%d;' % ord(ch))
        else:
            pieces.append(ch)
    html = u''.join(pieces)

    # evaluate entitydefs: the odd-numbered chunks are the entity names
    chunks = splitEntity.split(html)
    for i in range(1, len(chunks), 2):
        name = chunks[i]
        if name in ('amp', 'lt', 'gt', 'apos', 'quot'):
            # predefined in XML: keep the reference as it is
            chunks[i] = '&' + name + ';'
        elif name in entitydefs:
            value = entitydefs[name]
            if len(value) == 1 and ord(value) >= 128:
                value = '&#%d;' % ord(value)
            chunks[i] = value
        else:
            # NOTE(review): an entity that is not in entitydefs is passed
            # through unchanged; XML parsers reject undefined entities, so
            # confirm whether the ampersand should be escaped here.
            chunks[i] = '&' + name + ';'

    return str("".join(chunks))
# Entry template used when an entry has a summary; format() fills it in
# from its local variables.
xentry_s = """
tag:intertwingly.net,2004:%(id)s%(title)s%(summary)s
%(content)s
%(issued)s
"""

# Same template with the " %(summary)s\n" fragment removed, used for
# entries that have no summary.
xentry = xentry_s.replace(" %(summary)s\n", "")

# URL of the feed document currently being written; atomize() sets it and
# shorten() reads it.
selflink = ''
def shorten(url):
    """Rewrite *url* relative to the module-level ``selflink``.

    Both URLs are compared one '/'-separated component at a time.  The
    last component of ``selflink`` is treated as '.' when it is non-empty.

    * Divergence within the first three components (for an absolute http
      URL: scheme, empty segment, host) returns *url* unchanged.
    * Divergence at component index 4 returns ``urlpath.cache`` followed
      by the rest of *url*.
    * Any other divergence climbs with '..' and appends the rest of *url*.
    * No divergence returns either '..' steps (when *url* is shorter) or
      the components of *url* beyond ``selflink``.
    """
    anchor_parts = selflink.split('/')
    if anchor_parts[-1]:
        anchor_parts[-1] = '.'
    target = url.split('/')
    depth = len(anchor_parts)

    # index of the first component at which the two URLs differ, if any
    split_at = None
    for index in range(min(len(target), depth)):
        if anchor_parts[index] != target[index]:
            split_at = index
            break

    if split_at is None:
        if len(target) < depth:
            pieces = ['..'] * (depth - len(target))
        else:
            pieces = target[depth:]
        return '/'.join(pieces)

    if split_at <= 2:
        pieces = target
    elif split_at == 4:
        pieces = [urlpath.cache] + target[split_at:]
    else:
        pieces = ['..'] * (depth - split_at - 1) + target[split_at:]

    # a lone empty component means "this directory"
    if pieces == ['']:
        pieces = ['.']
    return '/'.join(pieces)
# Build the feed entry text for one post or comment file.
#   source -- path of the entry file: the first line is the title, the
#             remainder is the body
#   id     -- entry id; the entry template references it as %(id)s
#   link   -- entry URL; joined with channel.link and passed to shorten()
#   info   -- optional sequence of dicts; the keys read here are
#             'filename', 'id', 'comments' and 'lastcomment'
# Returns the entry as a string, produced by filling xentry_s or xentry from
# this function's local variables.
# NOTE(review): several statements below do not parse as written (the
# content.startswith( / title.startswith( lines and the quoted strings that
# span three lines).  They look as though markup inside string literals was
# stripped; the code is left untouched here and should be checked against
# the original file.  post, excerpt, tb, api, pb, email, seen and
# splitExcerpt are defined outside this chunk.
def format(source, id, link, info):
# read the title (first line) and body, decoding both as UTF-8
file=open(source)
title=file.readline().strip().decode('utf-8')
body=file.read().decode('utf-8')
file.close()
# second decode attempt; any failure is swallowed by the bare except
try:
title=title.decode('utf-8')
body=body.decode('utf-8')
except:
pass
# splitExcerpt yields three groups; a body that does not match makes
# match() return None and this line raise AttributeError
(updated,summary,content)=splitExcerpt.match(body).groups()
# issued comes from the file's mtime, modified from its ctime
stat=os.stat(source)
issued=iso8601(time.localtime(stat.st_mtime))
modified=iso8601(time.localtime(stat.st_ctime))
# NOTE(review): the next four lines are damaged; what remains adds
# ' style="float:right"' to a value named svg and rebuilds content around it
if content.startswith('')+6], feed=True)
content = (svg[:4] + ' style="float:right"' + svg[4:] +
content[content.find('')+6:])
if title.startswith('","")
# attribution details recognised in the content are collected here
author={}
# rewrite "<text>: ..." citations matched by cite1; embedded newlines in the
# quoted part are replaced by spaces
if content.find(': ')>0:
fields=cite1.split(content,1)
while len(fields)>1:
fields[2] = "%s" % fields[2].strip()
if fields[2].find('\n')>0: fields[2] = "\n" + fields[2].replace('\n',' ')
fields[0] += '%s: %s' % tuple(fields[1:4])
del fields[1:4]
content=fields[0]
# same treatment for "[via ...]" credits matched by cite2
if content.find('[via')>0:
fields=cite2.split(content)
while len(fields)>1:
fields[2] = "%s" % fields[2]
if fields[2].find('\n')>0: fields[2] = "\n" + fields[2].replace('\n',' ')
fields[0] += '[via %s]%s' % tuple(fields[1:4])
del fields[1:4]
content=fields[0]
# The checks below run in order and, after the first, only while author is
# still empty.  Each one splits the trailer off content and records how the
# entry arrived in author['method'].
# "Posted": method 'form'; name, plus uri / openid.server / ipaddr taken
# from the anchor attributes when present
if content.find('Posted')>0:
fields=post.split(content)
if len(fields)==2:
author['method']='form'
(content, author['name']) = fields
fields=anchor.match(author['name'])
if fields:
(fields,author['name'])=fields.groups()
fields=dict(attrs.findall(fields))
if 'href' in fields: author['uri']=fields['href']
if 'class' in fields and fields['class'] == 'openid':
if 'title' in fields: author['openid.server']=fields['title']
else:
if 'title' in fields: author['ipaddr']=fields['title']
# "Excerpt": method 'excerpt'; whitespace runs are collapsed before matching
# and the href attribute is required
if not author and content.find('Excerpt')>0:
fields=excerpt.split(content)
if len(fields)==2:
author['method']='excerpt'
(content, fields) = fields
fields=spaces.sub(' ',fields)
fields=anchor.match(fields.strip())
(fields,author['title'])=fields.groups()
fields=dict(attrs.findall(fields))
author['uri']=fields['href']
# "Trackback": method 'trackback'; tb.split yields content, uri and title,
# and an empty uri is discarded
if not author and content.find('Trackback')>0:
fields=tb.split(content)
if len(fields)==3:
author['method']='trackback'
(content, author['uri'], author['title']) = fields
if not author['uri']: del author['uri']
fields=anchor.match(author['title'])
if fields:
(fields,author['title'])=fields.groups()
fields=dict(attrs.findall(fields))
if 'title' in fields: author['ipaddr']=fields['title']
# "Message": method 'api'; the href attribute is required
if not author and content.find('Message')>0:
fields=api.split(content)
if len(fields)==2:
author['method']='api'
(content, fields) = fields
fields=anchor.match(fields)
(fields,author['name'])=fields.groups()
fields=dict(attrs.findall(fields))
author['uri']=fields['href']
# "Pingback": method 'pingback'; unlike the other checks this one also
# accepts a match at offset 0 (>=0 rather than >0)
if not author and content.find('Pingback')>=0:
fields=pb.split(content)
if len(fields)==2:
author['method']='pingback'
(content, author['title']) = fields
fields=anchor.match(author['title'])
if fields:
(fields,author['title'])=fields.groups()
fields=dict(attrs.findall(fields))
author['uri']=fields['href']
# "Emailed": method 'email'; the anchor's title attribute is stored as
# smtppath
if not author and content.find('Emailed')>0:
fields=email.split(content)
if len(fields)==2:
author['method']='email'
(content, author['name']) = fields
fields=anchor.match(author['name'])
if fields:
(fields,author['name'])=fields.groups()
fields=dict(attrs.findall(fields))
author['uri']=fields['href']
author['smtppath']=fields['title']
# "Seen": recorded with method 'excerpt', the same value as the Excerpt case
if not author and content.find('Seen')>0:
fields=seen.split(content)
if len(fields)==2:
author['method']='excerpt'
(content, fields) = fields
fields=anchor.match(fields)
(fields,author['title'])=fields.groups()
fields=dict(attrs.findall(fields))
author['uri']=fields['href']
# a mailto: uri is stored as author['email'] instead of author['uri']
if 'uri' in author and author['uri'].startswith('mailto:'):
author['email']=author['uri'].split(':',1)[1]
del author['uri']
# Parse the content with libxml2 (error output silenced by a do-nothing
# handler); when parsing raises parserError the content is passed through
# escape() instead.
# NOTE(review): the wrapper strings in this section appear to be damaged.
try:
libxml2.registerErrorHandler(lambda ctx,str: None, None)
libxml2.parseDoc("%s" % content).freeDoc()
if content.find('<')<0:
content='%s'%content.strip().replace('&','&')
else:
content='
%s
'%content.strip()
except libxml2.parserError,e:
content='' + escape(content.strip()) + ""
# stype / ttype hold the type attribute text for the summary and the title
stype=ttype=""
# make the link relative to the current feed document
link = shorten(urlparse.urljoin(channel.link,link))
# a title containing '<' is flagged as xhtml
if title.find('<')>=0:
ttype=' type="xhtml"'
title='
%s
'%title
# entries with a summary use xentry_s, the others use xentry
if summary:
summary=html2xml(summary)
if summary.find('<')>=0:
stype=' type="xhtml"'
summary='
%s
'%summary
entry = xentry_s % locals()
else:
entry = xentry % locals()
# when an explicit updated stamp exists, insert ':' before its last two
# characters if the third character from the end is a digit, then splice
# the stamp into the entry
if updated:
if updated[-3].isdigit(): updated = updated[:-2]+':'+updated[-2:]
entry = re.sub('(\s+)(.*?)',
r'\1\2\1%s' % updated, entry)
# Build the text that replaces the " \n" placeholder: author details when
# any were found, otherwise comment information taken from info.
if author:
xml=" \n"
# fall back to the title as the display name
if 'title' in author and 'name' not in author:
author['name'] = author['title']
del author['title']
# these four keys are written in the intertwingly.net/blog namespace
for (key,value) in author.items():
if key in ['title', 'ipaddr', 'method', 'openid.server']:
xml+=' <%s xmlns="http://www.intertwingly.net/blog/">%s%s>\n' % (key,value,key)
else:
xml+=" <%s>%s%s>\n" % (key,value,key)
# entries with no name are attributed to 'anonymous'
if not 'name' in author.keys():
xml+=" <%s>%s%s>\n" % ('name','anonymous','name')
xml+=" \n"
elif info:
xml=""
# use the info record whose filename is this source file; a lastcomment of
# 0 selects the form without a timestamp
for item in info:
if item['filename'] == source:
if item['lastcomment'] == 0:
xml = ' \n' % (item['id'][1:], item['comments'])
else:
xml = ' \n' % (item['id'][1:], item['comments'], iso8601(time.localtime(item['lastcomment'])))
else:
xml=""
return entry.replace(" \n",xml)
def prev_month(selfdir, files):
    """Return the archive directory of the entry preceding ``files[-1]``.

    selfdir -- archive path of the current page; its "/<digits>/<digits>/"
               part is collapsed to "/" to obtain the archive root
    files   -- list of entry files; only the last one is examined

    The '.txt' files in the current working directory are scanned for the
    most recent modification time that is older than that of ``files[-1]``.
    The result is '<root>YYYY/MM/' for that time, or '<root>0000/00/' when
    no older entry exists.
    """
    # find previous entry
    cutoff = os.stat(files[-1]).st_mtime
    stamps = [os.stat(name).st_mtime
              for name in os.listdir('.')
              if name.endswith('.txt')]
    prev_time = max([0] + [stamp for stamp in stamps if stamp < cutoff])

    # format and return
    if prev_time:
        year, month = time.localtime(prev_time)[0:2]
    else:
        year, month = 0, 0
    root = re.sub(r"/\d+/\d+/", "/", selfdir)
    return '%s%.4d/%0.2d/' % (root, year, month)
# Write a feed document for a single entry or for a list of entry files.
#   list    -- either one entry id (any value without __setitem__, e.g. a
#              string) or a list of entry file names
#   file    -- open file object to write to; only in the single-id case is a
#              missing file replaced by target+id+".atom" opened for writing
#   info    -- passed through to format(); reset to None in the single-id
#              case and in the two list cases noted below
#   selfdir -- path of this feed below channel.link
# Returns the file object; it is not closed here.
# Side effect: sets the module-level selflink, which shorten() reads.
# NOTE(review): most of the strings written below look as though their
# markup was stripped (for example ' \n' % selflink has no conversion
# specifier); the code is left untouched and should be checked against the
# original file.  source, target, glob, channel and entry are defined
# outside this chunk.
def atomize(list, file=None, info=None, selfdir=''):
# single id: the entry file comes first, followed by its sorted
# "<id>-*.cmt" comment files
if not hasattr(list,'__setitem__'):
id=list
list=glob(source+id+"-*.cmt")
list.sort()
list.insert(0,source+id+".txt")
if not file: file=open(target+id+".atom","w")
info = None
else:
# list of files: the feed is named 'index' unless the list starts with a
# .cmt file ('comments'), or ends with one or holds a single file (named
# after the first file, up to its first '.')
id='index'
if len(list)>0:
if list[0].find('.cmt')>0:
id='comments'
info = None
elif list[-1].find('.cmt')>0 or len(list)==1:
id=list[0].split('/')[-1].split('.')[0]
info = None
# record this feed's own URL for shorten()
global selflink
selflink = "%s%s%s.atom" % (channel.link, selfdir, id)
hublink = 'http://pubsubhubbub.appspot.com/'
# opening output differs depending on whether info is still set
if info:
file.write('\n')
else:
file.write('\n')
file.write(' \n' % selflink)
# hublink is written only for these two specific feed URLs
if selflink == 'http://intertwingly.net/blog/index.atom':
file.write(' \n' % hublink)
if selflink == 'http://intertwingly.net/blog/comments.atom':
file.write(' \n' % hublink)
# feed-level values taken from channel; URLs are passed through shorten()
file.write(' %s%s%s.atom\n' % (channel.link, selfdir, id))
file.write(' %s\n' % shorten(channel.icon))
file.write('\n')
file.write(' Sam Ruby\n')
file.write(' %s\n' % html2xml(channel.description))
file.write(' \n')
file.write(' Sam Ruby\n')
file.write(' %s\n' % channel.author)
file.write(' %s\n' % shorten(channel.link))
file.write(' \n')
# current local time
file.write(' %s\n' % iso8601(time.localtime()))
file.write(' \n' % shorten(channel.link + selfdir))
# the license is written only when channel defines one
if hasattr(channel, 'license'):
file.write(' \n' % channel.license)
# archive pages: prev_month() supplies the preceding archive directory;
# a result ending in '/0000/00/' means there is none and it is not written
if selfdir.startswith('archives/'):
file.write('\n \n')
prev_archive = channel.link + prev_month(selfdir,list)
if not prev_archive.endswith('/0000/00/'):
file.write(' \n' %
(shorten(prev_archive), selflink.split('/')[-1]))
# URL built from prev_archive minus its last two path components, joined
# with this feed's file name
current = urlparse.urljoin('/'.join(prev_archive.split('/')[0:-2]), selflink.split('/')[-1])
file.write(' \n' % shorten(current))
# one entry per file: the id is the base name up to its first '.', and the
# part before the first '-' selects the post that provides the link
for name in list:
id=name.split('/')[-1].split('.',1)[0]
parts = id.split('-')
link = entry.post(parts[0]).link()
# ids with a '-' get a '#c<mtime>' fragment appended to the link
if len(parts) > 1: link+='#c'+str(int(os.stat(name).st_mtime))
file.write(format(name,id,link,info))
file.write("\n\n")
return file
if __name__ == "__main__":
    def validate(id):
        """Parse the generated feed for *id* and reject suspicious entries.

        Raises ValueError for the first entry whose content has
        type='html' ("escaped"), or whose id contains '-' but which has no
        author element ("authorless").  Parse errors from libxml2
        propagate unchanged.
        """
        doc = libxml2.parseFile(target + id + ".atom")
        try:
            ctxt = doc.xpathNewContext()
            try:
                ctxt.xpathRegisterNs("xhtml", "http://www.w3.org/1999/xhtml")
                ctxt.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
                checks = (
                    ('escaped: ',
                     "//atom:entry[atom:content[@type='html']]"),
                    ('authorless: ',
                     "//atom:entry[contains(atom:id,'-') and not(atom:author)]"),
                )
                for label, query in checks:
                    for e in ctxt.xpathEval(query):
                        ctxt.setContextNode(e)
                        # String exceptions are not valid on Python 2.6+;
                        # raise a real exception carrying the same message.
                        raise ValueError(
                            label + ctxt.xpathEval("string(atom:id)"))
            finally:
                # release the context and document even when a check fails
                ctxt.xpathFreeContext()
        finally:
            doc.freeDoc()

    # ids given on the command line: regenerate and validate each one
    for id in sys.argv[1:]:
        atomize(id).close()
        validate(id)

    # no arguments: regenerate and validate every entry found in source
    if len(sys.argv) == 1:
        start = time.time()
        # pre-set so the except clause can always report them
        id = None
        msg = []
        try:
            for txt in glob(source + '*.txt'):
                msg = []
                id = txt.split('/')[-1].split('.')[0]
                atomize(id).close()
                # Registered after atomize() because format() installs its
                # own do-nothing handler; this one collects parser messages
                # for the report below.
                libxml2.registerErrorHandler(lambda msg, txt: msg.append(txt), msg)
                validate(id)
        except:
            # report which entry failed, then let the error propagate
            print('*** %s' % id)
            if msg:
                print(''.join(msg))
            raise
        print('%d Seconds' % int(time.time() - start))