# Ruby Buzz Forum - Automatic OPML reading list from Tech Memeorandum

 #! /usr/bin/ruby
 # tmopml.rb
 # Create an OPML reading list based on a list of
 # blogs cited on http://tech.memeorandum.com.
 #
 # Copyright (C) 2006 Adam Green
 # http://mashup.darwinianweb.com, adam AT darwinianweb DOT com
 # This program is distributed under the same license as Ruby.
 #
 require "open-uri"
 require "rexml/document"
 include REXML
 


 # Create the OPML reading list: for every blog listed in the blog list file,
 # fetch its home page, find the first feed it advertises through a
 # <link rel="alternate"> tag, and write one <outline> element per blog.

 # Blog list (input) and generated reading list (output); both paths are
 # relative to the directory the script is run from.
 BLOGS_PATH = "public_html/projects/tmblogs/tmblogs.xml"
 OPML_PATH = "public_html/projects/tmblogs/tmopml.xml"

 # MIME types accepted as a feed in <link rel="alternate" type="...">.
 FEED_TYPES = %w[
   application/rss+xml
   text/xml
   application/atom+xml
   application/x.atom+xml
   application/x-atom+xml
 ].freeze

 # A whole <link ...> tag. [^>]* (unlike .*?) also matches tags that are
 # wrapped over several lines.
 LINK_TAG_PATTERN = /<link\b[^>]*>/i

 # One name=value attribute; the value may be double-quoted, single-quoted
 # or bare.
 ATTRIBUTE_PATTERN = /([a-z_:][\w:.-]*)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i

 # Escape +value+ so it can be placed inside a quoted XML attribute.
 def xml_attr(value)
   REXML::Text.normalize(value.to_s)
 end

 # Text of the child element +name+ of +element+, or "" when the child is
 # missing or empty.
 def child_text(element, name)
   child = element.elements[name]
   child ? child.text.to_s : ""
 end

 # Parse the attributes of one HTML tag into a Hash keyed by lower-cased
 # attribute name. Values keep their original case and spacing, so URLs are
 # not damaged. The first occurrence of a repeated attribute wins.
 def tag_attributes(tag)
   attrs = {}
   tag.scan(ATTRIBUTE_PATTERN) do |name, dquoted, squoted, bare|
     attrs[name.downcase] ||= dquoted || squoted || bare
   end
   attrs
 end

 # True when the parsed <link> attributes describe an alternate
 # representation whose type is one of FEED_TYPES.
 def feed_link?(attrs)
   rel_tokens = attrs["rel"].to_s.downcase.split
   rel_tokens.include?("alternate") &&
     FEED_TYPES.include?(attrs["type"].to_s.strip.downcase)
 end

 # Resolve +href+ against +base_uri+; fall back to the raw href when either
 # side is not a parseable URI.
 def absolute_url(base_uri, href)
   URI.join(base_uri.to_s, href).to_s
 rescue URI::Error
   href
 end

 # Return the URL of the first feed advertised in +html+, or nil when the
 # page advertises none. +base_uri+ is used to resolve relative hrefs.
 def find_feed_url(html, base_uri)
   html.scan(LINK_TAG_PATTERN) do |tag|
     attrs = tag_attributes(tag)
     next unless feed_link?(attrs)

     # The href is HTML-escaped in the page (e.g. "&amp;"); decode it here so
     # it is escaped exactly once when written to the OPML file.
     href = REXML::Text.unnormalize(attrs["href"].to_s).strip
     next if href.empty?

     return absolute_url(base_uri, href)
   end
   nil
 end

 # Download the page at +htmlurl+ and return its feed URL (or nil).
 # URI#open is only defined by open-uri for HTTP(S)/FTP URIs, so a value that
 # is not such a URL raises instead of reaching Kernel#open, which could open
 # a local file or run a "|command" pipe.
 def fetch_feed_url(htmlurl)
   URI.parse(htmlurl).open do |page|
     find_feed_url(page.read, page.base_uri)
   end
 end

 # Open the list of blogs maintained at
 # http://mashup.darwinianweb.com/projects/tmblogs/tmblogs.xml
 # It is read before the output file is opened, so an unreadable or malformed
 # list does not truncate the previously generated OPML file.
 blogs = REXML::Document.new(File.read(BLOGS_PATH))

 # The block form closes the file even if an error escapes the loop.
 File.open(OPML_PATH, "w") do |opmlfile|
   opmlfile.puts('<?xml version="1.0" encoding="UTF-8"?>')
   opmlfile.puts('<opml version="1.1">')
   opmlfile.puts('  <head>')
   opmlfile.puts('    <title>Tech Memeorandum Reading List</title>')
   opmlfile.puts('    <dateCreated>' + Time.now.rfc2822 + '</dateCreated>')
   opmlfile.puts('    <ownerName>Adam Green - Mashup.Darwinianweb.com</ownerName>')
   opmlfile.puts('  </head>')
   opmlfile.puts('  <body>')

   blogs.elements.each("tmblogs/blog") do |blog|
     title = child_text(blog, "title")
     htmlurl = child_text(blog, "htmlUrl").strip

     # An entry without a home page URL cannot be fetched; skip it instead of
     # aborting the whole run.
     if htmlurl.empty?
       puts title, "skipped: no htmlUrl"
       next
     end

     begin
       xmlurl = fetch_feed_url(htmlurl)
     rescue StandardError => e
       # Time-outs, HTTP errors, DNS failures and bad URLs are all
       # StandardErrors. Interrupt and SystemExit are deliberately not
       # rescued, so Ctrl-C still stops the script.
       puts title, "failed: #{e.class}: #{e.message}"
       next
     end

     # Blogs that advertise no feed are left out of the reading list.
     next unless xmlurl

     puts title, htmlurl
     opmlfile.puts(
       %(    <outline type="rss" text="#{xml_attr(title)}" ) +
       %(xmlUrl="#{xml_attr(xmlurl)}" htmlUrl="#{xml_attr(htmlurl)}" />)
     )
   end

   opmlfile.puts('  </body>')
   opmlfile.puts('</opml>')
 end

	# Web Artima.com