# Categorizes the sites listed in the Feedster top500 OPML file by comparing
# the feed URL recorded there with the feeds each site advertises through
# autodiscovery <link> tags on its HTML page, then writes a per-category
# summary to top500.html.

require 'open-uri'
require 'rexml/document'
require 'timeout'
require 'uri'

TOP500_URL = 'http://top500.feedster.com/top500.opml'

# Applied to every line of the OPML file. The main loop reads captures 2..5
# as title, description, xmlUrl and htmlUrl.
# NOTE(review): the pattern is empty in this copy of the file (its markup
# appears to have been stripped), so no captures are produced and the loop
# fails on its first line. Restore the original pattern before running.
OUTLINE = //

# Captures the attribute text of each <link ...> tag; /m lets a tag span lines.
LINK_TAG = /<\s*link\s+(.*?)>/m

# MIME types that count as a feed in a <link rel="alternate"> tag.
FEED_TYPES = %w(application/atom+xml application/rss+xml application/rdf+xml).freeze

# Category key => heading text, used both for binning and for the report.
CATEGORIES = {
  :none      => "Do not have autodiscovery",
  :match     => "top500 references the sites indicated prefered feed",
  :found     => "top500 references a feed mentioned in autodiscovery other than the preferred one",
  :different => "top500 references a feed not mentioned in autodiscovery",
  :defunct   => "unable to fetch this page"
}.freeze

# Errors that mark a page fetch as failed (and therefore worth retrying).
FETCH_ERRORS = [
  Timeout::Error, OpenURI::HTTPError, Errno::ECONNRESET, SocketError,
  Errno::ECONNREFUSED
].freeze

# Seconds to wait before the second and third fetch attempts.
RETRY_DELAYS = [5, 60].freeze

class String
  # Returns this string passed through REXML::Text.normalize (entity escaping).
  def escape
    REXML::Text.normalize self
  end
end

class URI::HTTP
  # True when this URI and the String +uri+ are equal after a leading
  # "http://www." in either one is reduced to "http://".
  def match(uri)
    to_s.sub('http://www.', 'http://') == uri.sub('http://www.', 'http://')
  end
end

# Fetches +page_url+ and returns its body as a String. After a failure it
# prints "retrying...", sleeps for the next RETRY_DELAYS entry and tries
# again; returns nil once every attempt has failed.
# The block form closes the handle even when a later step raises.
# NOTE(review): page_url is taken verbatim from the downloaded OPML;
# validating that it is an http(s) URL before opening would be prudent.
def fetch_page(page_url)
  delays = RETRY_DELAYS.dup
  begin
    URI.open(page_url, &:read)
  rescue *FETCH_ERRORS
    delay = delays.shift
    return nil unless delay

    puts "retrying..."
    sleep delay
    retry
  end
end

# Returns the feed URIs advertised in +html+ by <link> tags whose rel contains
# "alternate", whose type is one of FEED_TYPES and which carry an href.
# Relative hrefs are resolved against +base_url+.
def discover_feeds(html, base_url)
  feeds = []
  html.scan(LINK_TAG).flatten.each do |tag_body|
    # Accept both double- and single-quoted attribute values.
    pairs = tag_body.scan(/(\w+)="(.*?)"/) + tag_body.scan(/(\w+)='(.*?)'/)
    attrs = pairs.to_h
    next unless attrs['rel'] && attrs['rel'].downcase.split.include?('alternate')
    next unless attrs['type'] && FEED_TYPES.include?(attrs['type'])
    next unless attrs['href']

    feeds.push URI.join(base_url, attrs['href'].strip)
  end
  feeds
end

# Picks the category for a site from its discovered +feeds+ and the feed URL
# (+xml_url+) that the top500 list records for it.
def classify(feeds, xml_url)
  if feeds.empty?
    :none
  elsif feeds.first.match(xml_url)
    :match
  elsif feeds.any? { |feed| feed.match(xml_url) }
    :found
  else
    :different
  end
end

bin = {}
CATEGORIES.keys.each { |cat| bin[cat] = [] }

count = 0
URI.open(TOP500_URL) do |opml|
  opml.each_line do |line|
    attrs = OUTLINE.match(line)
    next unless attrs

    title, description, xml_url, html_url = attrs.to_a[2..-1]
    count += 1
    puts "#{count}: #{title}"
    $stdout.flush

    # NOTE(review): both branches are identical in this copy of the file;
    # presumably they differed in markup that has been stripped. Confirm
    # against the original.
    if description.empty?
      entry = "#{title.escape}"
    else
      entry = "#{title.escape}"
    end
    entry += " (top500)"

    html = fetch_page(html_url)
    unless html
      bin[:defunct].push entry
      next # give up
    end

    # NOTE(review): relative hrefs are resolved against the feed URL
    # (xml_url), as in the original, not against the page URL. Confirm that
    # this is intended.
    feeds = discover_feeds(html, xml_url)

    case classify(feeds, xml_url)
    when :none      then bin[:none].push entry
    when :match     then bin[:match].push entry
    when :found     then bin[:found].push entry + " (autodiscovery)"
    when :different then bin[:different].push entry + " (autodiscovery)"
    end

    # Sanity check: every counted site must be in exactly one bin.
    raise "hell" unless count == bin.values.sum(&:length)
  end
end

# NOTE(review): the empty strings below look like markup lines whose tags were
# stripped. Also, only the per-category counts are written; the entries
# collected in bin never reach the file. Confirm against the original.
File.open("top500.html", "w") do |output|
  output.puts ""
  output.puts ""
  output.puts "Feedster top500, categorized"
  output.puts ""
  output.puts ""
  [:none, :match, :found, :different, :defunct].each do |name|
    output.puts "\n\n#{CATEGORIES[name].escape}\n\n"
    output.puts "(count: #{bin[name].length})"
    output.puts ""
  end
  output.puts ""
  output.puts ""
end