# = Author
#
# Stephen Kapp
#
# = Copyright
#
# Copyright (c) Stephen Kapp 2008
#
#

require 'open-uri'
require 'hpricot'

# Extension to Hpricot to clean up the HTML a little
module Hpricot # :nodoc:
  class Elements # :nodoc:
    # Strip every element in the collection (see Elem#strip).
    def strip
      each { |x| x.strip }
    end
  end

  class Elem # :nodoc:
    # Replace this element in its parent with its own inner HTML, i.e. drop
    # the surrounding tag but keep the content. Does nothing for an element
    # without a parent.
    def strip
      parent.replace_child self, Hpricot.make(inner_html) unless parent.nil?
    end
  end
end

# =Name
#
# Netcraft::Query
#
# =Synopsis
#
#   require "netcraft"
#
# =Description
#
# Netcraft::Query is the class to perform a Netcraft site report on a
# target domain.
#
# =Usage
#
#   a = Netcraft::Query.new("www.virus.org")
#
#   b = a.query
#
#   p b[:site]
#   # => {:nameserver=>"ns1.mydyndns.org", :first_seen=>"June 1997", :site=>"http://www.virus.org",
#   #     :dns_admin=>"zone-admin@dyndns.com", :organisation=>"Virus.Org, United Kingdom",
#   #     :reverse_dns=>"sugargrove.virus.org", :last_reboot=>"51 days ago",
#   #     :ns_organisation=>"Dynamic Network Services, Inc., 1230 Elm St., Manchester, 03101, United States", :registry=>"pir.org",
#   #     :netblock=>"Hetzner Online AG", :ip_address=>"213.239.218.185", :domain=>"virus.org", :site_rank=>"238743", :country=>"DE"}
#
module Netcraft
  class Query
    # Kept as globals (rather than constants) so that existing code which
    # reads or overrides them keeps working.
    $NETCRAFT = 'http://toolbar.netcraft.com/site_report?url=http://'
    $USERAGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'

    # XPath of the report tables; the first match is treated as the site
    # report and the last match as the site history.
    REPORT_TABLES = "//div[@id=content_col]/table[@class=TBtable]".freeze

    # Labels looked up in the first cell of a report row (value in the second).
    LEFT_LABELS = {
      "site"            => :site,
      "domain"          => :domain,
      "ip address"      => :ip_address,
      "country"         => :country,
      "date first seen" => :first_seen,
      "domain registry" => :registry,
      "organisation"    => :organisation
    }.freeze

    # Labels looked up in the third cell of a report row (value in the fourth).
    RIGHT_LABELS = {
      "last reboot"             => :last_reboot,
      "netblock owner"          => :netblock,
      "site rank"               => :site_rank,
      "nameserver"              => :nameserver,
      "dns admin"               => :dns_admin,
      "reverse dns"             => :reverse_dns,
      "nameserver organisation" => :ns_organisation
    }.freeze

    # Create a new object
    #
    # Argument +site+ contains the site to use in the netcraft query
    #
    # Raises NetcraftArgumentError unless +site+ is a String.
    #
    def initialize(site)
      raise NetcraftArgumentError, "Argument has to be a string" unless valid? site
      # @config must be created here; it was previously indexed while still nil.
      @config = { :site => site }
    end

    # Perform Netcraft query and return result
    #
    # Returns a hash with two keys: +:history+, an array with the history of
    # the site, and +:site+, a hash of the site report. When the expected
    # report table is not present in the fetched page, the history is an
    # empty array and the site report an empty hash.
    #
    def query
      doc = Hpricot(fetch($NETCRAFT + @config[:site]))

      # Clean up some unwanted HTML elements to make handling of the content
      # easier later on
      doc.search("img").remove
      doc.search("a").strip
      doc.search("b").strip

      tables = doc.search(REPORT_TABLES)

      # Return the result
      { :history => site_history(tables.last), :site => site_report(tables.first) }
    end

    # Return a string with the default domain
    #
    def site
      @config[:site].inspect
    end

    # Set the domain for the query
    #
    # The assignment is silently ignored unless +name+ is a String.
    #
    def site=(name)
      @config[:site] = name if valid? name
    end

    private

    # A site is acceptable when it is a String (same rule as the constructor).
    def valid?(name)
      name.kind_of? String
    end

    # Download +url+ with the configured User-Agent and return the body as a
    # String. Prefers URI.open, because open-uri's Kernel#open patch is gone
    # in Ruby 3.0; falls back to Kernel#open where URI.open does not exist.
    def fetch(url)
      headers = { "User-Agent" => $USERAGENT }
      # The block form closes the underlying StringIO/Tempfile once read.
      if URI.respond_to?(:open)
        URI.open(url, headers) { |io| io.read }
      else
        open(url, headers) { |io| io.read }
      end
    end

    # Build the site report hash from the Site Report table (nil-safe).
    def site_report(table)
      report = {}
      return report if table.nil?

      table.search("//tr").each do |tr|
        cells = (tr/"td").map { |cell| cell.inner_text }
        store_field(report, LEFT_LABELS, cells[0], cells[1])
        store_field(report, RIGHT_LABELS, cells[2], cells[3])
      end
      report
    end

    # Store +value+ in +report+ when +label+ is one of the known +labels+.
    # Rows without a label cell (header or spacer rows) are skipped.
    def store_field(report, labels, label, value)
      key = labels[label.to_s.downcase]
      report[key] = tidy(key, value) unless key.nil?
    end

    # Remove the scraping artefacts ("?" placeholders, the "Uptime graph"
    # link text) from the two fields that carry them.
    def tidy(key, value)
      return value if value.nil?

      case key
      when :country     then value.gsub(/\?/, '')
      when :last_reboot then value.gsub(/\?/, '').gsub(/Uptime graph/, '')
      else value
      end
    end

    # Extract the rows of the Site History table (nil-safe), dropping empty
    # cells.
    def site_history(table)
      return [] if table.nil?

      past = []
      table.search("//tr").each do |tr|
        texts = (tr/"td").map { |cell| cell.inner_text }
        past << texts.reject { |text| text.length.zero? }
      end

      # Clean a dead row from the headers
      past.shift
      past
    end
  end

  class NetcraftArgumentError < ArgumentError # :nodoc:
  end
end