# Atom feed generator (Python 2: uses print statements, `except X,e`,
# htmlentitydefs and urlparse).
#
# Reads entries from `directory.data` (`<id>.txt` plus `<id>-*.cmt` files,
# located with glob) and writes `<id>.atom` files under `directory.atom`.
#
# Overview of the definitions below:
#   iso8601(date)     - formats a time tuple as "%Y-%m-%dT%H:%M:%S" followed
#                       by a whole-hour UTC offset; date[-1] selects between
#                       time.altzone and time.timezone.
#   html2xml(html)    - decodes a byte string as iso-8859-1, replaces the
#                       \x85/\x91-\x97 characters, rewrites every character
#                       with ord >= 128 as a numeric character reference and
#                       resolves named entities through `entitydefs`, leaving
#                       amp/lt/gt/apos/quot as entity references.
#   shorten(url)      - compares `url` segment by segment with the
#                       module-global `selflink` (both split on '/') and
#                       returns a '/'-joined path built from '..' segments
#                       and the remaining part of `url`.
#   format(source, id, link, info)
#                     - reads the title (first line) and body of `source`,
#                       splits the body with `splitExcerpt`, takes the
#                       issued/modified stamps from st_mtime/st_ctime, looks
#                       for author information after markers such as
#                       "Posted by", "Trackback from", "Pingback from",
#                       "Excerpt from", "Seen on", "Emailed by" and
#                       "Message from", test-parses the content with
#                       libxml2.parseDoc (escaping it on parserError) and
#                       fills the `xentry_s` / `xentry` template from
#                       locals().
#   prev_month(selfdir, files)
#                     - scans '.' for *.txt files older than files[-1] and
#                       returns an archive path ending in "%.4d/%0.2d/" for
#                       the newest of them ("0000/00/" when there is none).
#   atomize(list, file=None, info=None, selfdir='')
#                     - accepts either an entry id or a list of file names,
#                       sets the global `selflink`, writes the feed header
#                       and one format() result per file, and returns the
#                       file object (the caller closes it).
#   __main__          - atomizes and validates each id given on the command
#                       line, or every *.txt under `source` when no argument
#                       is given, then prints the elapsed seconds.
#
# NOTE(review): this copy of the file looks damaged. Statements are run
# together on a few physical lines, and many string literals seem to have
# lost markup: e.g. `file.write(' \n' % selflink)` passes a format argument
# to a string without a placeholder, `content.startswith('')>0` tests an
# empty prefix, and several regex literals span physical lines. As written
# the text does not parse; presumably tags were stripped by an extraction
# step. The nesting of some `else:` branches also cannot be determined from
# this flattened text. TODO: restore from the original source and confirm
# before relying on any literal below.
import os, re, time, sys from glob import glob from htmlentitydefs import entitydefs from xml.sax.saxutils import escape import urlparse from config import directory, channel, urlpath import entry source=directory.data target=directory.atom import libxml2 foaf = {'name':'name', 'url':'homepage', 'email':'mbox'} splitExcerpt=re.compile('(?:
(?:(.*?)
)?\s*)?(.*)',re.S) splitEntity=re.compile('&(\w+);') post=re.compile(r'(?:
\s
\s)?Posted by ') api=re.compile(r'(?:

\s)?Message from ') tb=re.compile(r'(?:\[more\])?

Trackback from ') pb=re.compile(r'(?:

)?Pingback from ') excerpt=re.compile(r'\n

Excerpt from ') seen=re.compile(r'\n

Seen on ') email=re.compile(r'\n+

Emailed by ') cite1=re.compile(r'([^<]+):\s*(?=)') cite2=re.compile(r'\[via\s+([^<]*)\]') anchor=re.compile(r'(.*)$') attrs=re.compile(r'(\w+)="(.*?)"') spaces=re.compile(r'\s+') def iso8601(date): base=time.strftime("%Y-%m-%dT%H:%M:%S",date) return base + '%+.2d:00'%(-(date[-1] and time.altzone or time.timezone)/3600) def html2xml(html): if type(html) == str: html=html.decode('iso-8859-1') # unadorned ampersand html=html.replace('& ','& ') # demoronize html=html.replace(u'\x85','…') html=html.replace(u'\x91','‘') html=html.replace(u'\x92','’') html=html.replace(u'\x93','“') html=html.replace(u'\x94','”') html=html.replace(u'\x96','–') html=html.replace(u'\x97','—') # convert high bit characters to numeric equivalents for i in range(len(html)-1,-1,-1): if ord(html[i])>=128: html = '%s&#%d;%s' % (html[:i], ord(html[i]), html[i+1:]) # evaluate entitydefs chunks=splitEntity.split(html) for i in range(1,len(chunks),2): if chunks[i] in ['amp', 'lt', 'gt', 'apos', 'quot']: chunks[i] ='&' + chunks[i] +';' elif chunks[i] in entitydefs: chunks[i]=entitydefs[chunks[i]] if len(chunks[i])==1 and ord(chunks[i])>128: chunks[i]='&#%d;' % ord(chunks[i]) else: chunks[i] ='&' + chunks[i] +';' data=str("".join(chunks)) return data xentry_s=""" tag:intertwingly.net,2004:%(id)s %(title)s %(summary)s %(content)s %(issued)s """ xentry=''.join(xentry_s.split(" %(summary)s\n")) selflink = '' def shorten(url): base = selflink.split('/') if base[-1]: base[-1] = '.' 
url = url.split('/') for i in range(0,min(len(url),len(base))): if base[i] != url[i]: if i == len(url)-1 and not url[-1]: url[-1] = '' result = ['..'] * (len(base)-i-1) + url[i:] if i<=2: result = url if i==4: result = [urlpath.cache] + url[i:] if result == ['']: result = ['.'] break else: if len(url) < len(base): result = ['..'] * (len(base)-len(url)) else: result = url[len(base):] return '/'.join(result) def format(source, id, link, info): file=open(source) title=file.readline().strip().decode('utf-8') body=file.read().decode('utf-8') file.close() try: title=title.decode('utf-8') body=body.decode('utf-8') except: pass (updated,summary,content)=splitExcerpt.match(body).groups() stat=os.stat(source) issued=iso8601(time.localtime(stat.st_mtime)) modified=iso8601(time.localtime(stat.st_ctime)) if content.startswith('')>0: from get import resize svg=resize(content[:content.find('')+6], feed=True) content = (svg[:4] + ' style="float:right"' + svg[4:] + content[content.find('')+6:]) if title.startswith('","") author={} if content.find(': ')>0: fields=cite1.split(content,1) while len(fields)>1: fields[2] = "%s" % fields[2].strip() if fields[2].find('\n')>0: fields[2] = "\n" + fields[2].replace('\n',' ') fields[0] += '%s: %s' % tuple(fields[1:4]) del fields[1:4] content=fields[0] if content.find('[via')>0: fields=cite2.split(content) while len(fields)>1: fields[2] = "%s" % fields[2] if fields[2].find('\n')>0: fields[2] = "\n" + fields[2].replace('\n',' ') fields[0] += '[via %s]%s' % tuple(fields[1:4]) del fields[1:4] content=fields[0] if content.find('Posted')>0: fields=post.split(content) if len(fields)==2: author['method']='form' (content, author['name']) = fields fields=anchor.match(author['name']) if fields: (fields,author['name'])=fields.groups() fields=dict(attrs.findall(fields)) if 'href' in fields: author['uri']=fields['href'] if 'class' in fields and fields['class'] == 'openid': if 'title' in fields: author['openid.server']=fields['title'] else: if 'title' in 
fields: author['ipaddr']=fields['title'] if not author and content.find('Excerpt')>0: fields=excerpt.split(content) if len(fields)==2: author['method']='excerpt' (content, fields) = fields fields=spaces.sub(' ',fields) fields=anchor.match(fields.strip()) (fields,author['title'])=fields.groups() fields=dict(attrs.findall(fields)) author['uri']=fields['href'] if not author and content.find('Trackback')>0: fields=tb.split(content) if len(fields)==3: author['method']='trackback' (content, author['uri'], author['title']) = fields if not author['uri']: del author['uri'] fields=anchor.match(author['title']) if fields: (fields,author['title'])=fields.groups() fields=dict(attrs.findall(fields)) if 'title' in fields: author['ipaddr']=fields['title'] if not author and content.find('Message')>0: fields=api.split(content) if len(fields)==2: author['method']='api' (content, fields) = fields fields=anchor.match(fields) (fields,author['name'])=fields.groups() fields=dict(attrs.findall(fields)) author['uri']=fields['href'] if not author and content.find('Pingback')>=0: fields=pb.split(content) if len(fields)==2: author['method']='pingback' (content, author['title']) = fields fields=anchor.match(author['title']) if fields: (fields,author['title'])=fields.groups() fields=dict(attrs.findall(fields)) author['uri']=fields['href'] if not author and content.find('Emailed')>0: fields=email.split(content) if len(fields)==2: author['method']='email' (content, author['name']) = fields fields=anchor.match(author['name']) if fields: (fields,author['name'])=fields.groups() fields=dict(attrs.findall(fields)) author['uri']=fields['href'] author['smtppath']=fields['title'] if not author and content.find('Seen')>0: fields=seen.split(content) if len(fields)==2: author['method']='excerpt' (content, fields) = fields fields=anchor.match(fields) (fields,author['title'])=fields.groups() fields=dict(attrs.findall(fields)) author['uri']=fields['href'] if 'uri' in author and 
author['uri'].startswith('mailto:'): author['email']=author['uri'].split(':',1)[1] del author['uri'] try: libxml2.registerErrorHandler(lambda ctx,str: None, None) libxml2.parseDoc("%s" % content).freeDoc() if content.find('<')<0: content='%s'%content.strip().replace('&','&') else: content='
%s
'%content.strip() except libxml2.parserError,e: content='' + escape(content.strip()) + "" stype=ttype="" link = shorten(urlparse.urljoin(channel.link,link)) if title.find('<')>=0: ttype=' type="xhtml"' title='
%s
'%title if summary: summary=html2xml(summary) if summary.find('<')>=0: stype=' type="xhtml"' summary='
%s
'%summary entry = xentry_s % locals() else: entry = xentry % locals() if updated: if updated[-3].isdigit(): updated = updated[:-2]+':'+updated[-2:] entry = re.sub('(\s+)(.*?)', r'\1\2\1%s' % updated, entry) if author: xml=" \n" if 'title' in author and 'name' not in author: author['name'] = author['title'] del author['title'] for (key,value) in author.items(): if key in ['title', 'ipaddr', 'method', 'openid.server']: xml+=' <%s xmlns="http://www.intertwingly.net/blog/">%s\n' % (key,value,key) else: xml+=" <%s>%s\n" % (key,value,key) if not 'name' in author.keys(): xml+=" <%s>%s\n" % ('name','anonymous','name') xml+=" \n" elif info: xml="" for item in info: if item['filename'] == source: if item['lastcomment'] == 0: xml = ' \n' % (item['id'][1:], item['comments']) else: xml = ' \n' % (item['id'][1:], item['comments'], iso8601(time.localtime(item['lastcomment']))) else: xml="" return entry.replace(" \n",xml) def prev_month(selfdir, files): # find previous entry curr_time = os.stat(files[-1]).st_mtime prev_time = 0 for file in os.listdir('.'): if not file.endswith('.txt'): continue mtime = os.stat(file).st_mtime if mtime < curr_time: prev_time = max(prev_time, mtime) # format and return month = list(time.localtime(prev_time)[0:2]) if not prev_time: month = [0,0] archives = re.sub(r"/\d+/\d+/","/",selfdir) return '%s%.4d/%0.2d/' % (archives,month[0],month[1]) def atomize(list, file=None, info=None, selfdir=''): if not hasattr(list,'__setitem__'): id=list list=glob(source+id+"-*.cmt") list.sort() list.insert(0,source+id+".txt") if not file: file=open(target+id+".atom","w") info = None else: id='index' if len(list)>0: if list[0].find('.cmt')>0: id='comments' info = None elif list[-1].find('.cmt')>0 or len(list)==1: id=list[0].split('/')[-1].split('.')[0] info = None global selflink selflink = "%s%s%s.atom" % (channel.link, selfdir, id) hublink = 'http://pubsubhubbub.appspot.com/' if info: file.write('\n') else: file.write('\n') file.write(' \n' % selflink) if selflink == 
'http://intertwingly.net/blog/index.atom': file.write(' \n' % hublink) if selflink == 'http://intertwingly.net/blog/comments.atom': file.write(' \n' % hublink) file.write(' %s%s%s.atom\n' % (channel.link, selfdir, id)) file.write(' %s\n' % shorten(channel.icon)) file.write('\n') file.write(' Sam Ruby\n') file.write(' %s\n' % html2xml(channel.description)) file.write(' \n') file.write(' Sam Ruby\n') file.write(' %s\n' % channel.author) file.write(' %s\n' % shorten(channel.link)) file.write(' \n') file.write(' %s\n' % iso8601(time.localtime())) file.write(' \n' % shorten(channel.link + selfdir)) if hasattr(channel, 'license'): file.write(' \n' % channel.license) if selfdir.startswith('archives/'): file.write('\n \n') prev_archive = channel.link + prev_month(selfdir,list) if not prev_archive.endswith('/0000/00/'): file.write(' \n' % (shorten(prev_archive), selflink.split('/')[-1])) current = urlparse.urljoin('/'.join(prev_archive.split('/')[0:-2]), selflink.split('/')[-1]) file.write(' \n' % shorten(current)) for name in list: id=name.split('/')[-1].split('.',1)[0] parts = id.split('-') link = entry.post(parts[0]).link() if len(parts) > 1: link+='#c'+str(int(os.stat(name).st_mtime)) file.write(format(name,id,link,info)) file.write("\n\n") return file if __name__ == "__main__": def validate(id): doc=libxml2.parseFile(target+id+".atom") ctxt = doc.xpathNewContext() ctxt.xpathRegisterNs("xhtml","http://www.w3.org/1999/xhtml") ctxt.xpathRegisterNs("atom","http://www.w3.org/2005/Atom") for e in ctxt.xpathEval("//atom:entry[atom:content[@type='html']]"): ctxt.setContextNode(e) raise 'escaped: ' + ctxt.xpathEval("string(atom:id)") for e in ctxt.xpathEval("//atom:entry[contains(atom:id,'-') and not(atom:author)]"): ctxt.setContextNode(e) raise 'authorless: ' + ctxt.xpathEval("string(atom:id)") doc.freeDoc() for id in sys.argv[1:]: atomize(id).close() validate(id) if len(sys.argv) == 1: start=time.time() try: for txt in glob(source+'*.txt'): msg=[] 
id=txt.split('/')[-1].split('.')[0] atomize(id).close() libxml2.registerErrorHandler(lambda msg,txt: msg.append(txt), msg) validate(id) except: print '*** ' + id if msg: print ''.join(msg) raise print int(time.time())-start, "Seconds"