require 'rexml/document'

# Scans cached feed entries, weights every outbound link by how recently the
# entry that mentions it was modified, and prints the most heavily linked
# URLs together with the entries that linked to them.

# Length of the look-back window, in seconds.
SECONDS_PER_WEEK = 7 * 86_400

# Glob matching the cached entry files to scan.
CACHE_GLOB = '/home/rubys/planet/cache/*'

# Number of links shown in the report.
TOP_LINK_COUNT = 10

# Collects the href of every element in +doc+ that carries one, excluding
# elements whose parent is a <source> element and links with rel="self".
#
# @param doc [REXML::Document] a parsed cache entry
# @return [Array<String>] hrefs found in the entry (may contain duplicates)
def entry_hrefs(doc)
  candidates = REXML::XPath.match(doc, '//*[@href]')
  candidates.reject! do |element|
    element.parent.name == 'source' || element.attributes['rel'] == 'self'
  end
  candidates.map { |element| element.attributes['href'] }
end

# Weight of an entry based on its age: 1.0 for an entry modified right now,
# falling off quadratically to 0.0 for an entry exactly +window+ seconds old.
#
# @param now [Integer] current time, seconds since the epoch
# @param mtime [Integer] entry modification time, seconds since the epoch
# @param window [Numeric] look-back window in seconds
# @return [Float]
def recency_weight(now, mtime, window = SECONDS_PER_WEEK)
  1.0 - (now - mtime)**2 / window.to_f**2
end

# Builds the link table from the cache files matching +glob+.
#
# Directories, files last modified more than a week before +now+, and files
# whose path contains 'sutor' are skipped, as are entries that have no
# <link rel="alternate" href="..."> to identify them.
#
# @param glob [String] glob pattern for the cache files
# @param now [Integer] current time, seconds since the epoch
# @return [Hash{String => Array<Array(Float, String)>}] each linked URL mapped
#   to the [weight, entry URL] pairs of the entries that link to it
def collect_links(glob = CACHE_GLOB, now = Time.now.to_i)
  oldest_allowed = now - SECONDS_PER_WEEK
  all_links = {}

  Dir[glob].each do |file|
    stat = File.stat(file)
    next if stat.directory?
    next if stat.mtime.to_i < oldest_allowed
    next if file.include?('sutor')

    # Block form closes the handle once the document has been parsed.
    doc = File.open(file) { |io| REXML::Document.new(io) }
    hrefs = entry_hrefs(doc)

    # The entry's own URL, used to report who linked to what.
    alternate = REXML::XPath.first(doc, '//link[@rel="alternate"]')
    next unless alternate && alternate.attributes.key?('href')

    source = alternate.attributes['href']
    weight = recency_weight(now, stat.mtime.to_i)

    # Count each link at most once per entry.
    hrefs.uniq.each { |link| (all_links[link] ||= []) << [weight, source] }
  end

  all_links
end

# Reduces the link table to [total weight, link] pairs, heaviest first.
#
# @param all_links [Hash{String => Array<Array(Float, String)>}]
# @return [Array<Array(Float, String)>]
def rank_links(all_links)
  totals = all_links.map do |link, references|
    [references.sum { |weight, _source| weight }, link]
  end
  totals.sort.reverse
end

# The +limit+ heaviest [total weight, link] pairs.
#
# @param all_links [Hash{String => Array<Array(Float, String)>}]
# @param limit [Integer] maximum number of links to return
# @return [Array<Array(Float, String)>]
def top_links(all_links, limit = TOP_LINK_COUNT)
  rank_links(all_links).first(limit)
end

# Prints each of the top links followed by the entries that linked to it
# (heaviest first), with a blank line after every group.
#
# @param all_links [Hash{String => Array<Array(Float, String)>}]
# @param limit [Integer] maximum number of links to print
# @param out [IO] destination stream
# @return [void]
def print_top_links(all_links, limit = TOP_LINK_COUNT, out = $stdout)
  top_links(all_links, limit).each do |_total, link|
    out.puts link
    all_links[link].sort.reverse.each do |_weight, source|
      out.puts " #{source}"
    end
    out.puts
  end
end

all_links = collect_links(CACHE_GLOB, Time.now.to_i)
print_top_links(all_links)