"""Survey which elements RSS feeds use in practice (Python 2 script).

The script downloads a listing of feed URLs, parses a random selection of
them with ``RSSParser`` and writes an HTML table of element names together
with the number of feeds in which each element appeared.

Usage: pass the output file name as the first argument; it defaults to
``rsstags.html``.

NOTE(review): the HTML markup inside the string literals of ``dump`` and the
element pattern in ``main`` had been stripped from this file (the row format
string had no ``%s`` placeholders left inside it).  They are reconstructed
here from the remaining text ("Tag", "Count", "%s%s") -- confirm against a
pristine copy if one is available.
"""

import random
import re
import sgmllib
import string
import sys
import urllib

# Replace sgmllib's tag-name pattern so that names may contain '-', '_', '.'
# and ':'; a namespaced element (prefix:name) is then reported as one tag.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
random.seed()

import timeoutsocket  # http://www.timo-tasi.org/python/timeoutsocket.py
# Default socket timeout of 10 (presumably seconds -- see timeoutsocket docs).
timeoutsocket.setDefaultSocketTimeout(10)

# Stop once this many feeds containing a <channel> have been tallied.
MAX_FEEDS = 1000
# Rewrite the report every time this many more feeds have been tallied.
DUMP_EVERY = 10


class RSSParser(sgmllib.SGMLParser):
    """Record which element names occur in a single feed.

    After ``feed()`` has been called, ``self.tags`` maps every element name
    that was seen to 1.  Namespace prefixes are dropped, and direct children
    of image / item / textinput elements are recorded as ``parent.child``.
    """

    # Parents whose direct children get a "parent." prefix.  Compared
    # case-insensitively: sgmllib reports tag names lower-cased, so the
    # mixed-case spelling "textInput" on its own would never match.
    CONTAINER_TAGS = ("image", "item", "textinput")

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.tags = {}
        # Stack of currently open elements; '' is a sentinel for "no parent".
        self.estack = ['']

    def unknown_starttag(self, tag, attrs):
        """Record ``tag`` (minus any namespace prefix) and push it."""
        colonpos = tag.find(':')
        if colonpos != -1:
            tag = tag[colonpos + 1:]
        parent = self.estack[-1]
        if parent.lower() in self.CONTAINER_TAGS:
            tag = parent + "." + tag
        self.tags[tag] = 1
        self.estack.append(tag)

    def unknown_endtag(self, tag):
        """Pop the innermost open element, never removing the sentinel."""
        # A feed with more end tags than start tags must not empty the stack;
        # that would raise IndexError here or on the next start tag.
        if len(self.estack) > 1:
            del self.estack[-1]


def dump(fname, tags):
    """Write the tally ``tags`` to ``fname`` as an HTML table.

    ``tags`` maps element name -> count.  Rows are ordered by count
    descending, then by element name ascending.  Raises ``IOError`` if the
    file cannot be opened or written.
    """
    print("dumping...")
    # Negating the count lets one ascending sort give "count descending,
    # name ascending".
    rows = [(-count, name) for name, count in tags.items()]
    rows.sort()
    out = open(fname, "w")
    try:
        out.write("<html><body>\n")
        out.write("<table><tr><th>Tag</th><th>Count</th></tr>\n")
        for negcount, name in rows:
            out.write("<tr><td>%s</td><td>%s</td></tr>\n" % (name, -negcount))
        out.write("</table></body></html>\n")
    finally:
        out.close()


def main():
    """Tally element usage over randomly chosen feeds and write the report."""
    tags = {"channel": 0}
    if sys.argv[1:]:
        out = sys.argv[1]
    else:
        out = 'rsstags.html'

    # NOTE(review): reconstructed pattern.  A bare "(.*?)" only ever matches
    # empty strings, so no feed would be visited; confirm the element name.
    link_elements = re.compile("<link>(.*?)</link>")
    url = 'http://www.syndic8.com/genfeed.php?Format=rss'
    listing = urllib.urlopen(url)
    try:
        links = link_elements.findall(listing.read())
    finally:
        listing.close()

    # Visit the links in random order, each one at most once.
    random.shuffle(links)
    for link in links:
        if tags["channel"] == MAX_FEEDS:
            break
        if link == "":
            continue
        link = link.replace("&amp;", "&")
        print(link)

        # Read the feed; a feed that cannot be fetched or parsed is skipped.
        parser = RSSParser()
        try:
            stream = urllib.urlopen(link)
            try:
                parser.feed(stream.read())
            finally:
                stream.close()
        except (sgmllib.SGMLParseError, timeoutsocket.Timeout, IOError):
            continue

        # Documents without a <channel> element are not counted as feeds.
        if "channel" not in parser.tags:
            continue

        # Tally the elements: each feed counts once per element name.
        for tag in parser.tags:
            tags[tag] = tags.get(tag, 0) + 1

        # Kept outside the try block above so that a failure to write the
        # report is reported instead of being mistaken for a bad feed.
        if tags["channel"] % DUMP_EVERY == 0:
            dump(out, tags)

    dump(out, tags)


if __name__ == '__main__':
    main()