import random, re, sgmllib, string, urllib
# Replace sgmllib's module-level tag-name pattern with one that accepts
# '-', '_', '.' and ':' after the first letter -- presumably so that
# namespaced element names ("prefix:name") are read as a single tag;
# RSSParser strips the prefix itself.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
# Seed the random generator; feeds are later picked with random.choice().
random.seed()
import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
# Apply a default timeout of 10 (presumably seconds -- confirm against the
# timeoutsocket module) so an unresponsive server cannot stall the run;
# the main loop catches timeoutsocket.Timeout.
timeoutsocket.setDefaultSocketTimeout(10)
class RSSParser(sgmllib.SGMLParser):
def __init__(self):
sgmllib.SGMLParser.__init__(self)
self.tags = {}
self.estack = ['']
def unknown_starttag(self, tag, attrs):
colonpos = string.find(tag,':')
if colonpos <> -1: tag = tag[colonpos+1:]
if self.estack[-1] in ["image", "item", "textInput"]:
tag = self.estack[-1] + "." + tag
self.tags[tag] = 1
self.estack.append(tag)
def unknown_endtag(self, tag):
del self.estack[-1]
def dump(fname, tags):
    """Write the tag tally to *fname* as a two-column HTML table.

    Args:
        fname: path of the file to create or overwrite.
        tags:  dict mapping element name -> count.

    Rows are ordered by count descending, then element name ascending.

    NOTE(review): the HTML markup in the string literals had been stripped
    from this file, leaving literals broken across lines (a syntax error).
    The <table>/<tr>/<th>/<td> tags below are a reconstruction that matches
    the surviving "Tag" / "Count" / "%s" / "%s" fragments -- confirm
    against the original script if exact output matters.
    """
    print("dumping...")
    # Negating the count lets one plain ascending sort give
    # "count descending, element name ascending".
    rows = [(-count, name) for name, count in tags.items()]
    rows.sort()
    out = open(fname, "w")
    try:
        out.write("<table>\n")
        out.write("<tr><th>Tag</th><th>Count</th></tr>\n")
        for neg_count, name in rows:
            out.write("<tr><td>%s</td><td>%s</td></tr>\n" %
                      (name, -neg_count))
        out.write("</table>\n")
    finally:
        # Close the file even if a write fails part-way through.
        out.close()
if __name__ == '__main__':
    # Survey driver: fetch an index of feed URLs, parse the feeds in random
    # order, and tally in how many feeds each element name occurs.  The
    # report is written to the path given as the first command-line
    # argument (default 'rsstags.html'), refreshed after every 10th
    # successfully tallied feed and once more at the end.
    import sys

    tags = {"channel": 0}
    if sys.argv[1:]:
        out = sys.argv[1]
    else:
        out = 'rsstags.html'

    # NOTE(review): the markup in this pattern had been stripped from the
    # file, leaving "(.*?)", which only ever matches empty strings.
    # "<link>...</link>" is reconstructed from the variable name and from
    # how the matches are used as URLs -- confirm against the index format.
    link_elements = re.compile("<link>(.*?)</link>")
    url = 'http://www.syndic8.com/genfeed.php?Format=rss'
    index = urllib.urlopen(url)
    try:
        links = link_elements.findall(index.read())
    finally:
        index.close()

    while links:
        # Stop once 1000 feeds containing a channel element are tallied.
        if tags["channel"] == 1000:
            break
        link = random.choice(links)
        links.remove(link)
        if link == "":
            continue
        # The URLs sit inside XML, so ampersands arrive entity-escaped.
        # (The original literal had been decoded to a no-op "&" -> "&".)
        link = link.replace("&amp;", "&")
        print(link)

        # read the feed; feeds that cannot be fetched or parsed are skipped
        r = RSSParser()
        try:
            s = urllib.urlopen(link)
            try:
                r.feed(s.read())
            finally:
                # Release the connection even when read/parse fails.
                s.close()
        except (sgmllib.SGMLParseError, timeoutsocket.Timeout, IOError):
            continue

        # Documents without a channel element are not counted.
        if "channel" not in r.tags:
            continue

        # tally the elements: each feed adds at most 1 per element name
        for tag in r.tags.keys():
            tags[tag] = tags.get(tag, 0) + 1

        # Periodic checkpoint.  This now runs outside the fetch try-block,
        # so a failure to write the report is no longer silently swallowed.
        if tags["channel"] % 10 == 0:
            dump(out, tags)

    dump(out, tags)