Extracting Tech Memeorandum's blog list
Adam Green, ruby.darwinianweb.com
I know I'm supposed to be working on my RubyRiver tutorial, but I got distracted playing with ideas for mashups I want to demo at Mashup Camp. One type of mashup I want to work on merges people's names and blog URLs with various search engines. Tech Memeorandum aggregates posts from a great set of blogs, so I'm going to use that site as the starting source for my people mashup data. I'll explain the full project on my mashup blog.

I'm going to try to maintain the pattern of posting the Ruby code for anything I work on here on the Ruby blog. That way I can publish complete source code listings without scaring away the non-programmers who read my other blogs.

The idea of this code is that it reads the home page of Tech Memeorandum, extracts the links to blogs, and saves them as an XML file. The XML file will live at a permanent location. Right now the file is not updating, but once I get the whole system running, it will refresh automatically. Hopefully, others will use it as the basis for their own mashups.
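For reference, here is the shape of the file the script writes. The values are just the placeholders from the citation formats the script parses, not real data:

<?xml version="1.0" encoding="utf-8" ?>
<tmblogs>
  <blog>
    <author>First Last</author>
    <title>Blog Name</title>
    <htmlUrl>http://url/</htmlUrl>
  </blog>
</tmblogs>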
tmparse.rb
#! /usr/bin/ruby
# tmparse.rb
# Extract the blog citations from the home page of
# http://tech.memeorandum.com.
#
# Copyright (C) 2006 Adam Green
# http://ruby.darwinianweb.com, adam AT darwinianweb DOT com
# This program is distributed under the same license as Ruby.
#
# Each blog is identified in the page with the following entry:
# <CITE>First Last / <A HREF="http://url/">Blog Name</A>:</CITE>
# If there is no author's name, the citation is:
# <CITE> <A HREF="http://url/">Blog Name</A>:</CITE>
# Get the page's text.
require "open-uri"
page = open("http://tech.memeorandum.com/")
pagetext = page.read
page.close
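# A note for anyone running this on a newer Ruby: the bare open() call on
# a URL was deprecated in Ruby 2.7 and removed in 3.0; on those versions
# the equivalent fetch is:
#   pagetext = URI.open("http://tech.memeorandum.com/") { |page| page.read }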
# Convert the ellipsis entity used by TM, since it gives XML parsers fits.
pagetext = pagetext.gsub("…", "...")
# Pull out all the citations.
citelist = pagetext.scan(/<cite>.*?<\/cite>/i)
# Build a hash with them.
sortlist = {}
citelist.each do |citation|
  # Only use citations with URLs.
  if citation.match(/a href/i)
    # Find the blog's URL inside the HREF attribute.
    htmlurlstart = citation.index('HREF="') + 6
    htmlurlend = citation.index('">') - 1
    htmlurl = citation[htmlurlstart..htmlurlend]
    # The title is the link text between the "> and the </A>.
    titlestart = htmlurlend + 3
    titleend = citation.index('</A>') - 1
    title = citation[titlestart..titleend]
    # Does the citation include an author? If the first "/" comes before
    # the link, it is the "First Last /" separator; otherwise the first
    # "/" is just part of the URL's http://.
    if citation.index("/") < citation.index("<A HREF")
      authorstart = 6
      authorend = citation.index("/") - 2
      author = citation[authorstart..authorend]
      author = author.strip
      author = author.squeeze(" ")
      sortkey = author
    else
      author = ""
      sortkey = title
    end
    # Build the hash, so it can be sorted on author or title.
    sortlist[sortkey.upcase] = {
      "author" => author,
      "htmlurl" => htmlurl,
      "title" => title
    }
  end
end
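# At this point each hash entry, keyed by the upcased sort key, looks
# like this (placeholder values from the citation format shown at top):
#   sortlist["FIRST LAST"] = { "author"  => "First Last",
#                              "htmlurl" => "http://url/",
#                              "title"   => "Blog Name" }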
# Write the sorted list out to an XML file.
xmlfile = File.new("../../projects/tmblogs/tmblogs.xml", "w")
xmlfile.puts('<?xml version="1.0" encoding="utf-8" ?>')
xmlfile.puts('<tmblogs>')
# Hash#sort returns an array of [key, value] pairs ordered by key.
sortarray = sortlist.sort
sortarray.each do |item|
  info = item[1]
  xmlfile.puts('  <blog>')
  xmlfile.puts('    <author>' + info["author"] + '</author>')
  xmlfile.puts('    <title>' + info["title"] + '</title>')
  xmlfile.puts('    <htmlUrl>' + info["htmlurl"] + '</htmlUrl>')
  xmlfile.puts('  </blog>')
end
xmlfile.puts('</tmblogs>')
xmlfile.close
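One caveat about the output: the author and title strings go into the XML verbatim, so a blog name containing & or < would break the file for XML parsers, just like the ellipsis entity does. A minimal sketch of a fix, using CGI.escapeHTML from Ruby's standard library (the xml_escape wrapper is my name for it, not part of the script above):

require "cgi"

# Escape &, <, >, and quotes so arbitrary blog names stay valid XML.
def xml_escape(text)
  CGI.escapeHTML(text)
end

puts xml_escape("AT&T Blog <beta>")   # => AT&amp;T Blog &lt;beta&gt;

Each info value would then be passed through this helper before being concatenated into the puts calls.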