#!/usr/bin/python2.4
#
# Backlink harvester (Python 2 script).  From the code visible here it:
#   1. scans web-server access logs for referers to blog entries,
#   2. looks for a feed on each referring page,
#   3. adds feeds of inbound links reported by technorati,
#   4. fetches every feed, finds entries that link to intertwingly.net
#      blog/slides/stories URLs, and records an excerpt via writeComment(),
#   5. writes a bookmark file so the next run can resume.
#
# NOTE(review): this copy of the file lost its original line breaks and
# indentation, and at least two spans of text are missing (flagged below).
# The block nesting shown here is reconstructed from syntax alone -- confirm
# it against a pristine copy before relying on it.
import sys
trace=False
# passing "trace" on the command line turns on progress output; the argument
# is removed so that the remaining argv entries can be used as referers below
if 'trace' in sys.argv:
    trace = True
    sys.argv.remove('trace')
    print "+++ begin trace"
    sys.stdout.flush()

# import patch22
import patch25
from xml.dom import minidom
import os, re, sys, sgmllib, time, urllib, urlparse
from xml.sax.saxutils import escape
from atomef import unescape
atomns = 'http://www.w3.org/2005/Atom'

import technorati
from config import directory
from post import writeComment, existingBacklink, sanitize
from entry import post

# timeoutsocket is optional: when it is importable, sockets get a default
# timeout of 10
try:
    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    pass

if os.getcwd() not in sys.path: sys.path.insert(0, os.getcwd())
# bookmark file written at the very end of the script
newrefs=os.path.join(os.path.expanduser('~'),'.mltfo')
newlastupdatetechno=os.path.join(os.path.expanduser('~'),'.techno')
os.chdir(directory.log)

# final path segments that start_a() accepts as "probably a feed"
# NOTE(review): start_a() lower-cases the href before comparing, so the
# mixed-case entry 'GetRss?' can never match
common_feed_names = ['atom.xml', 'rss.xml', 'index.xml', 'index.rdf', '?flav=rss', 'backend.php', 'GetRss?', 'blogger_rss.xml', 'rss']

# parse a html page, looking for feeds
#
# After construction:
#   feedurl - href of the first feed link found, or None
#   title   - text collected from the page's title element, or ""
class html(sgmllib.SGMLParser):
    # Fetch url and run its contents through the parser.  Any exception
    # raised while fetching or parsing is swallowed, leaving whatever was
    # collected so far.
    def __init__(self, url):
        self.feedurl = None
        # url up to and including the last '/' found at index 8 or later
        # (empty string when there is none), lower-cased
        self.urlbase = url[:url.rfind('/',8)+1].lower()
        self.intitle = False
        self.title = ""
        sgmllib.SGMLParser.__init__(self)
        try:
            if url: self.feed(urllib.URLopener().open(url).read())
        except:
            pass

    # link elements: the first one with "alternate" among its rel values, a
    # type ending in "xml" and an href supplies feedurl
    # (attribute names are lower-cased, values are left alone)
    def start_link(self, attrs):
        attrs = dict(map(lambda (k,v): (k.lower(),v), attrs))
        if not 'rel' in attrs: return
        rels = attrs['rel'].split(' ')
        if 'alternate' not in rels: return
        if not 'type' in attrs.keys() or not attrs['type'].endswith('xml'): return
        if 'href' in attrs:
            if not self.feedurl: self.feedurl = attrs['href']

    # a elements: fallback used only while no feedurl is known; accepts an
    # href that shares this page's urlbase and whose last segment is one of
    # common_feed_names (names and values are both lower-cased here)
    def start_a(self, attrs):
        if self.feedurl: return
        attrs = dict(map(lambda (k,v): (k.lower(),v.lower()), attrs))
        if 'href' in attrs:
            href = attrs['href']
            if self.urlbase == href[:href.rfind('/',8)+1]:
                if href[href.rfind('/',8)+1:] in common_feed_names:
                    self.feedurl = href

    # title: start collecting text, but only while no title text has been
    # collected yet
    def do_title(self, attrs):
        if self.title=="": self.intitle=1

    # any tag without a handler of its own stops title collection
    def unknown_starttag(self, tag, attrs):
        self.intitle=0

    def unknown_endtag(self,tag):
        self.intitle=0

    # numeric character references inside the title are kept in "&#...;" form
    def handle_charref(self, ref):
        if self.intitle:
            self.title = self.title + ("&#%s;" % ref)

    def handle_data(self,text):
        if self.intitle:
            self.title = self.title + text

# return the text associated with a given DOM node
#
# Looks up the first descendant of element named tag (un-namespaced lookup
# first, then the Atom namespace) and returns "" when there is none.
# Depending on the node's mode/type attributes the result is either the
# node's inner XML, its character data passed through unescape(), or its raw
# character data.
def text(element, tag):
    nodes = element.getElementsByTagName(tag)
    if not nodes: nodes=element.getElementsByTagNameNS(atomns, tag)
    if not nodes: return ""
    attrs=dict(nodes[0].attributes)
    if 'mode' in attrs and attrs['mode'].value=='xml':
        return innerxml(element, nodes[0].namespaceURI, tag)
    elif 'type' in attrs.keys() and attrs['type'].value.find('xhtml')>=0:
        return innerxml(element, nodes[0].namespaceURI, tag)
    elif 'type' in attrs.keys() and attrs['type'].value.find('html')>=0:
        # escaped markup: join the child nodes' data and unescape it
        return unescape("".join([getattr(child,'data','') for child in nodes[0].childNodes]))
    elif tag in ['content','summary'] and ('type' not in attrs or attrs['type'].value in ['xhtml','plain']):
        return innerxml(element, nodes[0].namespaceURI, tag)
    else:
        # children without a data attribute (i.e. elements) contribute ''
        return "".join([getattr(child,'data','') for child in nodes[0].childNodes])

# return the innerxml associated with a given DOM node
#
# Serialises the first descendant of element matching (ns, tag) and strips
# its outermost start and end tags.  When that node's only child is a div,
# the div's contents are returned instead.  Returns "" when nothing matches.
def innerxml(element, ns, tag):
    nodes = element.getElementsByTagNameNS(ns, tag)
    if not nodes: return ""
    if len(nodes[0].childNodes)==1 and nodes[0].childNodes[0].nodeName=='div':
        nodes = nodes[0].childNodes
    value=nodes[0].toxml()
    # everything between the first '>' and the last '<'
    return value[value.find('>')+1:value.rfind('<')]

# Pull the interesting fields out of one feed entry/item DOM node.
#
#   entry - the entry/item element
#   base  - URL against which relative links are resolved
#
# The caller below unpacks the result as (title, alternate, summary, content);
# the return statement itself is in the part of this function that is missing
# from this copy (see the NOTE at the end of the function).
def extract(entry, base=''):
    attrs=dict(entry.attributes)
    if 'xml:base' in attrs:
        if base:
            base=urlparse.urljoin(base,attrs['xml:base'].value)
        else:
            # NOTE(review): this stores the attribute node itself, not its
            # .value as the branch above does; base.startswith() is called on
            # it further down -- looks like a missing ".value", confirm
            base = attrs['xml:base']

    title=text(entry,'title')
    ref=''

    # candidate permalink, in order of preference: rdf:about attribute, then
    # a guid whose isPermaLink is absent or "true", then a suitable link
    alternate = None
    if 'rdf:about' in dict(entry.attributes):
        alternate=dict(entry.attributes)['rdf:about'].value
    if not alternate:
        guid=entry.getElementsByTagName('guid')[:1]
        if guid and guid[0].getAttribute('isPermaLink') in ('','true'):
            alternate=text(entry,'guid')

    for link in entry.getElementsByTagNameNS(entry.namespaceURI, 'link'):
        attrs=dict(link.attributes)
        # print [(key, attrs[key].value) for key in attrs.keys()]
        # accept links with no rel or rel="alternate", and no type or a type
        # containing "html"
        if (not 'rel' in attrs) or attrs['rel'].value=='alternate':
            if (not 'type' in attrs) or attrs['type'].value.find('html')>=0:
                if 'href' in attrs:
                    alternate=alternate or attrs['href'].value
        # NOTE(review): the nesting level of the next line is a guess
        ref = ref or text(entry,'link')
    alternate = alternate or ref
    if alternate and base: alternate=urlparse.urljoin(base,alternate)

    # site-specific rewrites of the permalink
    if alternate and alternate.startswith("http://blogdex.net/route.asp?"):
        alternate=alternate.replace("/route.asp?","/track.asp?")

    if base.startswith("http://del.icio.us/") and base.find("/inbox/")<0:
        if base<>'http://del.icio.us/rss' and alternate.find("intertwingly")>=0:
            # keep the bookmarked URL as ref and point alternate at the
            # del.icio.us page for that URL (keyed by the URL's md5 hex digest)
            ref=alternate
            from md5 import md5
            alternate="http://del.icio.us/url/%s" % md5(alternate).hexdigest()

    if alternate and alternate.startswith("http://programming.reddit.com/goto?rss=true&id="):
        # the third '='-separated field is whatever follows "id="
        id = alternate.split('=',2)[2]
        alternate = 'http://programming.reddit.com/info/%s/comments' % id

    if alternate and alternate.startswith("http://www.reddit.com/"):
        names=alternate.split('/')
        if len(names)>6 and names[5] == 'comments':
            alternate = 'http://%s.reddit.com/info/%s/comments' % (names[4], names[6])

    summary=text(entry,'summary') or text(entry,'description')
    content=(text(entry,'content') or
        innerxml(entry,'http://www.w3.org/1999/xhtml','body') or
        text(entry,'content:encoded') or summary)

    # append ref so that it is also seen when content is searched for links
    if ref: content = content + " " + ref

    if base and base.startswith("http://archipelago.phrasewise.com"):
        # NOTE(review): text is MISSING from this copy on the next line.  The
        # regular expression is cut off after '.*', and everything up to a
        # comparison against time.strftime('%Y%m%d') is gone.  That includes
        # the rest of extract() and the code that assigns `scan`, `start`
        # and `lastTR`, all of which are used below but never assigned in the
        # visible text.  The line is reproduced exactly as found and is not
        # valid Python; restore it from a pristine copy.
        match=re.compile('\[(.*)\].* time.strftime('%Y%m%d'): scan.append((time.strftime('%Y%m%d'),'00:00:00'))

# logfile pattern
pattern = re.compile(r'(.*?) .*? \[(.*?)\] "(.*?)" (.*?) (\S+) "(.*?)" "(.*?)"')
# ip timestamp url status size referer browser

# scan referers for references to specific blog entries
if trace: print "+++ collect referers"
# any remaining command line arguments seed the list of referers
referers = sys.argv[1:]
for (logfile,bookmark) in scan:
    # prefer the gzipped log when present; skip names with no log at all
    if os.path.exists(logfile + ".log.gz"):
        import gzip
        file = gzip.open(logfile + ".log.gz")
    else:
        if not os.path.exists(logfile + ".log"): continue
        file = open(logfile + ".log")
    if trace: print "... " + logfile

    # request path -> entry id, so each path is resolved once per logfile
    entrycache = {}
    for line in file.readlines():
        # skip lines whose 8 characters after the first ':' sort before the
        # bookmark (presumably the HH:MM:SS part of the timestamp)
        cursor = line.find(':')
        if bookmark > line[cursor+1:cursor+9]: continue
        # cheap pre-filter before running the regular expression
        if line.find(' 200 ')<0: continue
        (ip,ts,url,status,size,refer,browser) = pattern.search(line).groups()
        if status<>'200': continue
        if refer=='-': continue
        if refer.startswith('.'): continue
        # referers to ignore; note that find(...)>0 does not reject a match
        # at position 0
        if refer.find('/search?')>0: continue
        if refer.find('intertwingly.net')>0: continue
        if refer.find('bloglines.com/myblogs_display')>0: continue
        if refer.find('thauvin.net')>0: continue
        if refer.find('diveintomark.blogspot.com')>0: continue
        if refer.find('bolli.homeip.net')>0: continue
        if refer.find('fozbaca.org/blagg')>0: continue
        if refer.find('feeds.diveintomark.org')>0: continue
        if refer.find('/aggsome.cgi/')>0: continue
        if refer.find('20six.co.uk')>0: continue
        if refer.find('/mediajunkie.com/')>0: continue
        if refer.find('/treesalive.com/')>0: continue
        if refer.find('automated.adsensemoney.net')>0: continue
        if refer.find('dcostanet.net/rss')>0: continue

        # the request field is split on spaces and the second piece (the
        # path) is kept, minus any fragment and query string
        url = url.split(' ')[1]
        url = url.split('#')[0]
        url = url.split('?')[0]
        try:
            entry=entrycache[url]
        except:
            # /blog/<digits>. paths carry the entry id directly; anything
            # else goes through the project's post() helper
            entry=re.match('/blog\/(\d+)\.',url)
            if entry:
                entry = entry.group(1)
            else:
                entry=post(url).id()
            if not entry or not entry.isdigit(): continue
            entrycache[url]=entry

        refer = refer.split('#')[0]
        refer = refer.split('?')[0]
        if refer[0:2]=='//': refer='http:'+refer
        if trace: print "??? %s %s" % (entry, refer)
        # keep each http:// referer once, unless existingBacklink() is
        # already true for this entry/referer pair
        if refer[0:7]=='http://':
            if not refer in referers:
                if not existingBacklink(entry, refer):
                    referers.append(refer)

# scan each unique referer for feeds
if trace: print "+++ fetch referers"
os.chdir(directory.data)
feeds = []
for refer in referers:
    if refer.startswith('http://www.google.'): continue
    if refer.startswith('http://search.msn.com'): continue
    if refer.startswith('http://search.live.com'): continue
    if refer.startswith('http://xmlns.com/foaf'): continue
    if refer.find('8z21-7pie.blogspot.com')>0: continue
    if refer.find('correctserver.com')>0: continue
    if refer.find('getfirefoxbrowsers.com')>0: continue
    if refer.find('javablogs.com/Jump')>0: continue
    if trace: print refer
    try:
        feedurl = html(refer).feedurl
        # NOTE(review): feedurl is None when the page advertises no feed, so
        # the next line raises and the bare except below skips the referer;
        # the "if not feedurl" test further down only ever sees ''
        if feedurl.startswith('feed://'): feedurl="http" + feedurl[4:]
        if feedurl.find('ken.coar.org/blog/index.rss')>0: feedurl+='?words=all&sanitise=false'
        if not feedurl: continue
        if feedurl.find('rr.bloghackers.net')>0: continue
        if feedurl.find('intertwingly.net/blog/index.rss')>0: print refer
        # resolve relative urls
        feedurl = urlparse.urljoin(refer,feedurl)
        if not feedurl in feeds: feeds.append(feedurl)
        # for this one site the main feed is queued as well as the comments feed
        if feedurl=='http://weblog.philringnalda.com/comments/feed/':
            feedurl='http://weblog.philringnalda.com/feed/'
            if not feedurl in feeds: feeds.append(feedurl)
    except:
        pass

# add in technorati found links since last scan
if trace: print "+++ fetch technorati"
try:
    items=technorati.getCosmos('http://www.intertwingly.net')['inbound']
except:
    # best effort: carry on without technorati data on any failure
    items=[]
for item in items:
    if item['linkcreated']>lastTR and item['weblog'].has_key('rssurl'):
        rssurl=item['weblog']['rssurl']
        if rssurl.find('thauvin.net')>0: continue
        # an empty rssurl falls back to feed discovery on the weblog's page
        if not rssurl: rssurl=html(item['weblog']['url']+'/').feedurl
        if rssurl and rssurl.find('/mediajunkie.com/')>0: continue
        if trace and rssurl: print rssurl
        if rssurl and not rssurl in feeds: feeds.append(rssurl)
# remember the newest linkcreated value seen (unchanged when items is empty)
lastTR=max([item['linkcreated'] for item in items]+[lastTR])

# scan the unique rss feeds encountered
if trace: print "+++ fetch feeds"
# each pattern captures the path that follows blog/, slides/ or stories/
it_href = re.compile('intertwingly.net/blog/([-a-zA-Z0-9/]+)')
slides_href = re.compile('intertwingly.net/slides/([0-9]+/[-a-zA-Z0-9]+)')
stories_href = re.compile('intertwingly.net/stories/([0-9]+/[0-9]+/[0-9]+/[-.a-zA-Z0-9]+)')
for url in feeds:
    # NOTE(review): as written this replaces '&' with '&' and does nothing;
    # the first argument may have been an entity such as '&amp;' that was
    # decoded when this copy was made -- confirm
    url=url.replace('&','&')
    if trace: print url
    try:
        data = urllib.urlopen(url).read()
        if not data: continue
        feed = minidom.parseString(data)
        # feed title, falling back to the title of the page the feed links to
        blog=text(feed,'title') or html(text(feed,'link')).title
        attrs=dict(feed.documentElement.attributes)
        if 'xml:base' in attrs:
            base=urlparse.urljoin(url,attrs['xml:base'].value)
        else:
            base=url
        # Atom entries first, otherwise un-namespaced item elements
        entries = feed.getElementsByTagNameNS(atomns, 'entry')
        entries = entries or feed.getElementsByTagName('item')
        for entry in entries:
            try:
                (title, alternate, summary, content) = extract(entry, base)
                if not alternate: continue
                if text(entry, 'id').startswith('tag:planet.intertwingly.net'): continue
                for target in it_href.findall(content)+slides_href.findall(content)+stories_href.findall(content):
                    if target.endswith('/'): target=target[:-1]
                    if not target.isdigit():
                        # hard-wired path -> entry id table; any other
                        # non-numeric path goes through post()
                        if target == '2005/xmlconf': target="2117"
                        elif target == '2004/devcon': target="1868"
                        elif target == '2005/08/09/rails_example.rb': target="2046"
                        elif target == '2005/etcon': target="1926"
                        elif target == '2005/fosssl/keynote.html': target="2060"
                        elif target == '2005/rs': target="1945"
                        elif target == '2006/07/30/expatparser.rb': target="2361"
                        elif target == '2006/AtomInASeaOfRSS2': target="2381"
                        elif target == '2006/AtomInASeaOfRSS': target="2230"
                        elif target == '2006/etcon': target="2183"
                        elif target == '2006/npuc': target="2355"
                        elif target == '2007/05/02/msft.html4': target="2558"
                        elif target == '2007/05/02/msft.html': target="2558"
                        elif target == '2007/09/11/toucan.html': target="2662"
                        elif target == '2008/oscon': target="2874"
                        else: target=post(target).id()
                    if not target: continue
                    if trace: print target
                    if target.startswith('index'): continue
                    if target.startswith('comments'): continue
                    # NOTE(review): if this line really sits inside the
                    # per-target loop, alternate is escaped again on every
                    # iteration -- confirm the nesting
                    alternate=escape(alternate)
                    if alternate.find('intertwingly.net/blog')>0: continue
                    if alternate.find('javablogs.com/Jump')>0: continue
                    if not existingBacklink(target,alternate):
                        # remove html and truncate excerpt description to 250 chars
                        edesc=re.compile(u'<.*?>',re.S).sub(' ',summary or content)
                        edesc=re.sub('\s+',' ',edesc)
                        # cut at the last space before position 250; the
                        # second slice is what bounds the length when there
                        # is no such space
                        if len(edesc)>250: edesc=edesc[:edesc.rfind(' ',0,250)][:250]
                        # an Atom source element's title, when present, is
                        # used in preference to the feed title
                        source = entry.getElementsByTagNameNS(atomns, 'source')
                        if source: source = text(source[0],'title')
                        # only entries that already have a <target>.txt file
                        # under directory.data receive comments
                        if not os.path.exists(directory.data + target + ".txt"): continue
                        # write out the excerpt
                        # NOTE(review): text is MISSING from the format string
                        # below: it has two %s for three values and is broken
                        # across raw lines (markup around "Excerpt from" was
                        # probably stripped from this copy).  Reproduced
                        # exactly as found; not valid Python as it stands.
                        if alternate.startswith("http://") or alternate.startswith("https://"):
                            writeComment(target, escape(title.strip()), '%s...\n

Excerpt from
%s\n' % (sanitize(unescape(edesc)), alternate, escape(source or blog)))
            except:
                # report the failure and move on to the next entry
                import traceback, sys
                print "".join(apply(traceback.format_exception, sys.exc_info()))
                print url
                # these names may be unset if extract() itself failed
                try:
                    print title
                    print alternate
                    print content
                except:
                    pass
                print
    except:
        # report the failure and move on to the next feed
        import traceback, sys
        print "".join(apply(traceback.format_exception, sys.exc_info()))
        print url
        print

# mark the point at which the next scan is to start
if trace: print "+++ write bookmark"
file=open(newrefs,'w')
file.write(start+lastTR+'\n')
file.close()