a little scraper rake task

20 August 2009 | 1:16 pm by codeboxer

Written for no good reason, mostly just to play with hpricot. Good times.

# Description shown by `rake -T`; Rake attaches it to the next task defined
# (here the import:get_events task below).
desc 'Scrape away.'

require 'config/boot'
require 'config/environment'
require 'application'
require 'net/ftp'
#include Utils
require 'rubygems'
require 'open-uri'
require 'hpricot'

namespace :import do

  # Scrapes the wegottickets.com search-result pages and prints, for each
  # listing slot on each page, the title, location, date, price and
  # availability text to stdout. Nothing is persisted.
  #
  # Best effort: any StandardError (network failure, parse problem, ...) is
  # caught, its message is printed, and the task ends without re-raising.
  task :get_events do
    begin
      # Maximum page depth for scraping. Kept as a local so running the task
      # does not define (or overwrite) a global constant.
      max_pages = 2

      # NOTE(review): "From" and "Referer" look like placeholder values --
      # confirm them before running against the live site.
      request_headers = {
        "User-Agent" => "Ruby/#{RUBY_VERSION}",
        "From" => "email@addr.com",
        "Referer" => "http://www.igvita.com/blog/"
      }

      (1..max_pages).each do |page|
        path = "http://www.wegottickets.com/searchresults/page/#{page}/all"

        # open-uri returns the block's value, so the body is handed back
        # directly instead of being smuggled out through an instance variable.
        # NOTE: on Ruby 3.0+ Kernel#open no longer accepts URLs; call
        # URI.open(path, request_headers) there instead.
        response = open(path, request_headers) do |f|
          puts "Fetched document: #{f.base_uri}"
          puts "\t Content Type: #{f.content_type}\n"
          puts "\t Charset: #{f.charset}\n"
          puts "\t Content-Encoding: #{f.content_encoding}\n"
          puts "\t Last Modified: #{f.last_modified}\n\n"
          f.read
        end

        doc = Hpricot(response)

        # Cycle through the listing slots on the page (div[3]..div[12]).
        (3..12).each do |date_num|
          # Every field of one listing lives under this common container.
          listing = "/html/body/div/div/div[3]/div[3]/div/div[#{date_num}]/div"
          xpath_to_title     = "#{listing}/div[2]/blockquote/h3/a"
          xpath_to_location  = "#{listing}/div[2]/blockquote/p/span"
          xpath_to_details   = "#{listing}/div[2]/blockquote/"
          xpath_to_price     = "#{listing}/div/span/strong"
          xpath_to_available = "#{listing}/div/span[2]"

          artist_name  = (doc/xpath_to_title).inner_html
          gig_location = (doc/xpath_to_location).inner_html

          # The date is the second newline-separated chunk of the blockquote
          # markup (nil when there is no second chunk).
          # NOTE(review): the published source splits on a literal line
          # break; the original delimiter may have been an HTML tag that was
          # lost when the post was rendered -- confirm against the live markup.
          gig_date = (doc/xpath_to_details).inner_html.split("\n")[1]

          # Remove the pound sign from the price.
          gig_price = (doc/xpath_to_price).inner_html.delete("£")

          gig_available = (doc/xpath_to_available).inner_html
          # An (almost) empty availability cell is reported with a default text.
          gig_available = "Tickets are available" if gig_available.size < 2

          # Here we can insert the fields artist_name, gig_location and
          # gig_date, as well as gig_price and gig_available.
          puts artist_name
          puts gig_location
          puts gig_date
          puts gig_price
          puts gig_available
          puts "\n\n"
        end
      end
    rescue => ex
      puts ex.message
    end
  end
end
Recommend Me




My Site Links

Screenshots are featured above. If you visit gmgpulse, you may log in as demo/demo.

Rockstar Television


© 2008-10 Krister Axel and codeboxer.com