Class: Wmap::GoogleSearchScraper

Inherits:
Object
  • Object
show all
Includes:
Utils
Defined in:
lib/wmap/google_search_scraper.rb

Overview

We build our own Google search class by querying the Google search engine through its web interface, simulating

an anonymous web surfer. Note: we don’t use the native Google API because of its pricing structure - we don’t have a budget for

this project, and we cannot use the free tier because it is limited to 100 queries per day. See https://github.com/google/google-api-ruby-client for details.

Constant Summary collapse

File_locator =

Google search engine web interface locators

File.dirname(__FILE__)+'/../../settings/google_locator.txt'
File_keywords =

Google search key words

File.dirname(__FILE__)+'/../../settings/google_keywords.txt'

Constants included from Utils::UrlMagic

Utils::UrlMagic::Max_http_timeout, Utils::UrlMagic::User_agent

Constants included from Utils::DomainRoot

Utils::DomainRoot::File_ccsld, Utils::DomainRoot::File_cctld, Utils::DomainRoot::File_gtld, Utils::DomainRoot::File_tld

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Utils

#cidr_2_ips, #file_2_hash, #file_2_list, #get_nameserver, #get_nameservers, #host_2_ip, #host_2_ips, #is_cidr?, #is_fqdn?, #is_ip?, #list_2_file, #reverse_dns_lookup, #sort_ips, #valid_dns_record?, #zone_transferable?

Methods included from Utils::Logger

#wlog

Methods included from Utils::UrlMagic

#create_absolute_url_from_base, #create_absolute_url_from_context, #host_2_url, #is_site?, #is_ssl?, #is_url?, #landing_location, #make_absolute, #normalize_url, #open_page, #redirect_location, #response_code, #response_headers, #url_2_host, #url_2_path, #url_2_port, #url_2_site, #urls_on_same_domain?

Methods included from Utils::DomainRoot

#get_domain_root, #get_domain_root_by_ccsld, #get_domain_root_by_cctld, #get_domain_root_by_tlds, #get_sub_domain, #is_domain_root?, #print_ccsld, #print_cctld, #print_gtld

Constructor Details

#initialize(params = {}) ⇒ GoogleSearchScraper

Scraper default variables



29
30
31
32
33
34
35
# File 'lib/wmap/google_search_scraper.rb', line 29

# Set up a scraper instance with empty discovery stores.
#
# @param params [Hash] optional settings:
#   :verbose      - when truthy, methods print progress/diagnostic messages (default false)
#   :http_timeout - stored as-is in @http_timeout (default 5000; unit is not
#                   established here -- NOTE(review): presumably milliseconds, confirm)
def initialize(params = {})
  # Discovered data stores: keys are the discovered values, values are `true` markers.
  @discovered_sites_from_scraper = {}
  @discovered_urls_from_scraper = {}
  # Runtime options, falling back to the defaults when the key is absent.
  @http_timeout = params.fetch(:http_timeout, 5000)
  @verbose = params.fetch(:verbose, false)
end

Instance Attribute Details

#discovered_sites_from_scraperObject (readonly)

Returns the value of attribute discovered_sites_from_scraper.



20
21
22
# File 'lib/wmap/google_search_scraper.rb', line 20

# Read-only accessor: returns the @discovered_sites_from_scraper store itself
# (not a copy), so mutating the returned object mutates the scraper's state.
def discovered_sites_from_scraper
  @discovered_sites_from_scraper
end

#discovered_urls_from_scraperObject (readonly)

Returns the value of attribute discovered_urls_from_scraper.



20
21
22
# File 'lib/wmap/google_search_scraper.rb', line 20

# Read-only accessor: returns the @discovered_urls_from_scraper store itself
# (not a copy), so mutating the returned object mutates the scraper's state.
def discovered_urls_from_scraper
  @discovered_urls_from_scraper
end

#http_timeoutObject

Returns the value of attribute http_timeout.



19
20
21
# File 'lib/wmap/google_search_scraper.rb', line 19

# Returns the configured HTTP timeout held in @http_timeout.
# NOTE(review): the unit is not established by this accessor -- confirm against the code that uses it.
def http_timeout
  @http_timeout
end

#keyword_listObject

Returns the value of attribute keyword_list.



19
20
21
# File 'lib/wmap/google_search_scraper.rb', line 19

# Returns the value held in @keyword_list.
# NOTE(review): the constructor shown for this class does not assign @keyword_list,
# so this presumably returns nil until a caller sets it -- confirm.
def keyword_list
  @keyword_list
end

#verboseObject

Returns the value of attribute verbose.



19
20
21
# File 'lib/wmap/google_search_scraper.rb', line 19

# Returns the verbosity flag held in @verbose; when truthy, the scraper's
# methods print progress and error messages.
def verbose
  @verbose
end

Instance Method Details

Search for nodes by css, and extract the hyper links



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/wmap/google_search_scraper.rb', line 86

# Extract the result links from a parsed search result page.
#
# Only anchors whose href contains "/url?" are considered. For those, the
# value of the "q" query parameter is taken as the target URL: everything
# after the next "&" is dropped and the value is percent-decoded, so the
# stored URL is the real destination rather than the redirect's query string.
# Every target is also recorded in @discovered_urls_from_scraper and its
# site (via url_2_site) in @discovered_sites_from_scraper.
#
# @param doc [#css] parsed page; must respond to css('a') and yield nodes
#   responding to attribute('href')
# @return [Array<String>, nil] targets in page order (duplicates kept), or
#   nil when the page could not be processed
def extract_links(doc)
  puts "Extract the meaningful links from the DOC." if @verbose
  links = []
  doc.css('a').each do |link|
    ref = link.attribute('href').to_s
    next unless ref =~ /\/url\?/
    query = ref.split('?', 2)[1].to_s
    q_pair = query.split('&').find { |pair| pair.start_with?('q=') }
    if q_pair
      raw_target = q_pair[2..-1]
      begin
        my_key = URI.decode_www_form_component(raw_target)
      rescue ArgumentError
        # Malformed percent-encoding: keep the raw value rather than lose the link.
        my_key = raw_target
      end
    else
      # No "q" parameter to unwrap: keep the href untouched, as before.
      my_key = ref
    end
    next if my_key.empty?
    my_site = url_2_site(my_key)
    links.push(my_key)
    @discovered_urls_from_scraper[my_key] = true unless @discovered_urls_from_scraper.key?(my_key)
    @discovered_sites_from_scraper[my_site] = true unless @discovered_sites_from_scraper.key?(my_site)
  end
  links
rescue StandardError => ee
  # StandardError only: interrupts and exit requests must not be swallowed.
  puts "Exception on method extract_links: #{ee}" if @verbose
  nil
end

#get_discovered_sites_from_scraperObject Also known as: print

‘getter’ for the discovered sites from the Google search



138
139
140
141
142
143
144
145
# File 'lib/wmap/google_search_scraper.rb', line 138

# 'Getter' for the sites discovered so far.
#
# @return [Array, nil] the keys of @discovered_sites_from_scraper in sorted
#   order, or nil if they could not be read or sorted (the error is printed
#   only in verbose mode)
def get_discovered_sites_from_scraper
  puts "Getter for the discovered sites by the scraper. " if @verbose
  begin
    sites = @discovered_sites_from_scraper.keys
    sites.sort
  rescue => err
    puts "Error on method get_discovered_sites_from_scraper: #{err}" if @verbose
  end
end

#get_discovered_urls_from_scraperObject

‘getter’ for the discovered urls from the Google search



149
150
151
152
153
154
155
156
# File 'lib/wmap/google_search_scraper.rb', line 149

# 'Getter' for the URLs discovered so far.
#
# @return [Array, nil] the keys of @discovered_urls_from_scraper in sorted
#   order, or nil if they could not be read or sorted (the error is printed
#   only in verbose mode)
def get_discovered_urls_from_scraper
  puts "Getter for the discovered urls by the scraper. " if @verbose
  begin
    urls = @discovered_urls_from_scraper.keys
    urls.sort
  rescue => err
    puts "Error on method get_discovered_urls_from_scraper: #{err}" if @verbose
  end
end

#google_search(locator, keyword) ⇒ Object

Perform a keyword search through the Google web interface and return the search result page as a Nokogiri::HTML::Document object.



74
75
76
77
78
79
80
81
82
83
# File 'lib/wmap/google_search_scraper.rb', line 74

# Run one keyword search against a single Google front end.
#
# The request URL is built as "<locator>search?q=<encoded keyword>". The
# keyword is form-encoded with URI.encode_www_form_component (URI.encode was
# removed in Ruby 3.0), and the page is fetched with URI.open rather than
# Kernel#open, which stopped opening URLs in Ruby 3.0 and would treat a
# string starting with "|" as a command to run.
#
# @param locator [String] base URL of the search front end; concatenated
#   directly with "search?q=", so it is expected to end with "/"
# @param keyword [String] search term
# @return [Object, nil] whatever Nokogiri::HTML returns for the fetched page,
#   or nil when the request or parsing failed (reported in verbose mode only)
def google_search(locator, keyword)
  puts "Perform the keyword search on the Google web engine for: #{keyword}" if @verbose
  link_search = locator + "search?q=" + URI.encode_www_form_component(keyword)
  Nokogiri::HTML(URI.open(link_search))
rescue StandardError => ee
  # StandardError covers HTTP, socket and timeout failures without trapping
  # interrupts or exit requests.
  puts "Exception on method google_search at Google engine location #{link_search} for the keyword #{keyword} : #{ee}" if @verbose
  nil
end

#google_worker(keyword) ⇒ Object Also known as: worker, search

Main worker method that simulates extensive Google keyword searches across 100+ countries and regions. The search extracts the web services related to the keyword that are known to Google Inc.



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/wmap/google_search_scraper.rb', line 38

# Search every Google front end listed in File_locator for one keyword and
# gather the result links.
#
# A front end whose search (google_search) or link extraction (extract_links)
# yields nil is skipped, so one failing locator no longer discards the links
# already collected from the others.
#
# @param keyword [String] search term; surrounding whitespace is stripped
# @return [Array<String>, nil] unique, sorted, non-blank links; nil when the
#   sweep itself failed (e.g. keyword is nil or the locator list is unreadable)
def google_worker(keyword)
  puts "Start the Google worker for: #{keyword}" if @verbose
  keyword = keyword.strip
  links = []
  file_2_list(File_locator).each do |locator|
    doc = google_search(locator, keyword)
    next if doc.nil?
    found = extract_links(doc)
    links.concat(found) unless found.nil?
  end
  # Drop nil/blank entries before sorting: sorting an array containing nil raises.
  links.compact.reject(&:empty?).uniq.sort
rescue StandardError => ee
  puts "Exception on the method google_worker for #{keyword}: #{ee}" if @verbose
  nil
end

#google_workers(keyword_list = file_2_list(File_keywords)) ⇒ Object Also known as: workers

Main method to collect intelligence from Google's vast data warehouse. It works by hitting the Google engines with the keyword list. This exhaustive method sweeps through the Google engines in 100+ countries and regions one by one, in order to collect all related web service links known to Google, Inc. across the global Internet.



58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/wmap/google_search_scraper.rb', line 58

# Run google_worker for every keyword and merge the results.
#
# A keyword whose worker returns nil (its sweep failed) contributes nothing,
# so a single failed keyword no longer discards the whole batch.
#
# @param keyword_list [Array<String>] search terms; defaults to the entries
#   read from File_keywords via file_2_list
# @return [Array<String>, nil] unique, sorted links across all keywords, or
#   nil when the batch itself failed (e.g. keyword_list is not enumerable)
def google_workers(keyword_list=file_2_list(File_keywords))
  begin
    puts "Start the Google worker for: #{keyword_list}" if @verbose
    links = keyword_list.flat_map { |keyword| google_worker(keyword) || [] }
    links.uniq.sort
  rescue StandardError => ee
    puts "Exception on the method google_workers for #{keyword_list}: #{ee}" if @verbose
    nil
  end
end

Method to print out discovery Sites result



123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/wmap/google_search_scraper.rb', line 123

# Print a summary report of the discovered sites to standard output:
# a header, one site per line (in store order, unsorted), the total count and
# a footer. Errors while printing are reported only in verbose mode.
def print_discovered_sites_from_scraper
  puts "Print discovered sites by the scraper. " if @verbose
  begin
    puts "\nSummary Report of Discovered Sites from the Scraper:"
    sites = @discovered_sites_from_scraper.keys
    sites.each { |entry| puts entry }
    puts "Total: #{sites.size} site(s)"
    puts "End of the summary"
  rescue => err
    puts "Error on method print_discovered_sites_from_scraper: #{err}" if @verbose
  end
end

Method to print out discovery URL result



108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/wmap/google_search_scraper.rb', line 108

# Print a summary report of the discovered URLs to standard output:
# a header, one URL per line (in store order, unsorted), the total count and
# a footer. Errors while printing are reported only in verbose mode.
def print_discovered_urls_from_scraper
  puts "Print discovered urls by the scraper. " if @verbose
  begin
    puts "\nSummary Report of Discovered URLs from the Scraper:"
    urls = @discovered_urls_from_scraper.keys
    urls.each { |entry| puts entry }
    puts "Total: #{urls.size} url(s)"
    puts "End of the summary"
  rescue => err
    puts "Error on method print_discovered_urls_from_scraper: #{err}" if @verbose
  end
end

#save_discovered_sites_from_scraper(file) ⇒ Object Also known as: save

Save the discovered sites into a local file



159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/wmap/google_search_scraper.rb', line 159

# Save the discovered sites into a local file.
#
# The file is overwritten with a "#"-prefixed header line carrying the current
# time, followed by the sorted keys of @discovered_sites_from_scraper, one per
# line. The block form of File.open closes the handle even when a write raises.
#
# @param file [String] path of the file to write
# @return [nil] failures are not raised to the caller; they are reported only
#   in verbose mode
def save_discovered_sites_from_scraper(file)
  puts "Save the discovery result(sites) into a local file: #{file}" if @verbose
  begin
    File.open(file, 'w') do |f|
      timestamp = Time.now
      f.puts "# Discovery result written by Wmap::GoogleSearchScraper.save_discovered_sites_from_scraper method at #{timestamp}\n"
      @discovered_sites_from_scraper.keys.sort.each { |x| f.puts "#{x}\n" }
    end
    raise "Unknown problem saving the result to file: #{file}" unless File.exist?(file)
    puts "Done saving the discovery result into the local file: #{file}"
  rescue => ee
    puts "Error on method save_discovered_sites_from_scraper: #{ee}" if @verbose
  end
end