Class: Wmap::UrlCrawler

Inherits:
Object
Includes:
Utils
Defined in:
lib/wmap/url_crawler.rb,
lib/wmap/url_crawler/adware_tag.rb

Overview

Web site crawler class

Direct Known Subclasses

AdwareTag

Defined Under Namespace

Classes: AdwareTag

Constant Summary

Max_http_timeout = 8000
  Hard stop limit of the HTTP time-out, in milliseconds (8 seconds), in order to avoid a severe performance penalty on certain 'weird' site(s).

Crawl_timeout = 1200000
  Hard stop limit of the crawler time-out, in milliseconds (1200 seconds, or 20 minutes).

Constants included from Wmap::Utils::UrlMagic

Wmap::Utils::UrlMagic::User_agent

Constants included from Wmap::Utils::DomainRoot

Wmap::Utils::DomainRoot::File_ccsld, Wmap::Utils::DomainRoot::File_cctld, Wmap::Utils::DomainRoot::File_gtld, Wmap::Utils::DomainRoot::File_tld

Instance Attribute Summary

Instance Method Summary

Methods included from Utils

#cidr_2_ips, #file_2_hash, #file_2_list, #get_nameserver, #get_nameservers, #host_2_ip, #host_2_ips, #is_cidr?, #is_fqdn?, #is_ip?, #list_2_file, #reverse_dns_lookup, #sort_ips, #valid_dns_record?, #zone_transferable?

Methods included from Wmap::Utils::Logger

#wlog

Methods included from Wmap::Utils::UrlMagic

#create_absolute_url_from_base, #create_absolute_url_from_context, #host_2_url, #is_site?, #is_ssl?, #is_url?, #landing_location, #make_absolute, #normalize_url, #open_page, #redirect_location, #response_code, #response_headers, #url_2_host, #url_2_path, #url_2_port, #url_2_site, #urls_on_same_domain?

Methods included from Wmap::Utils::DomainRoot

#get_domain_root, #get_domain_root_by_ccsld, #get_domain_root_by_cctld, #get_domain_root_by_tlds, #get_sub_domain, #is_domain_root?, #print_ccsld, #print_cctld, #print_gtld

Constructor Details

#initialize(params = {}) ⇒ UrlCrawler

Constructor: sets up the crawler instance's default variables.



# File 'lib/wmap/url_crawler.rb', line 32

def initialize (params = {})
	@verbose=params.fetch(:verbose, false)
	@data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
	@http_timeout=params.fetch(:http_timeout, 5000)
	@crawl_depth=params.fetch(:crawl_depth, 4)
	@crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
	@max_parallel=params.fetch(:max_parallel, 40)
	@user_agent=params.fetch(:user_agent, "OWASP WMAP Spider")
	# Discovered data store
	@discovered_urls_by_crawler=Hash.new
	@visited_urls_by_crawler=Hash.new
	@crawl_start=Hash.new
	@crawl_done=Hash.new
	Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
	@log_dir=@data_dir + "/../logs/"
	Dir.mkdir(@log_dir) unless Dir.exist?(@log_dir)
	@log_file=@log_dir + "crawler.log"
end
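
Example (an illustrative sketch, not part of the library's own docs; assumes the gem's standard entry point and uses arbitrary option values):

require 'wmap'

# Create a crawler, overriding a few of the defaults shown above.
crawler = Wmap::UrlCrawler.new(
  :verbose      => true,   # print progress details
  :crawl_depth  => 3,      # follow links up to 3 levels deep
  :max_parallel => 20      # cap the number of parallel crawl processes
)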

Instance Attribute Details

#crawl_depth ⇒ Object

Returns the value of attribute crawl_depth.



# File 'lib/wmap/url_crawler.rb', line 20

def crawl_depth
  @crawl_depth
end

#crawl_done ⇒ Object (readonly)

Returns the value of attribute crawl_done.



# File 'lib/wmap/url_crawler.rb', line 22

def crawl_done
  @crawl_done
end

#crawl_page_limit ⇒ Object

Returns the value of attribute crawl_page_limit.



# File 'lib/wmap/url_crawler.rb', line 20

def crawl_page_limit
  @crawl_page_limit
end

#crawl_start ⇒ Object (readonly)

Returns the value of attribute crawl_start.



# File 'lib/wmap/url_crawler.rb', line 22

def crawl_start
  @crawl_start
end

#data_dir ⇒ Object

Returns the value of attribute data_dir.



# File 'lib/wmap/url_crawler.rb', line 20

def data_dir
  @data_dir
end

#discovered_urls_by_crawler ⇒ Object (readonly)

Returns the value of attribute discovered_urls_by_crawler.



# File 'lib/wmap/url_crawler.rb', line 22

def discovered_urls_by_crawler
  @discovered_urls_by_crawler
end

#http_timeout ⇒ Object

Returns the value of attribute http_timeout.



# File 'lib/wmap/url_crawler.rb', line 20

def http_timeout
  @http_timeout
end

#max_parallel ⇒ Object

Returns the value of attribute max_parallel.



# File 'lib/wmap/url_crawler.rb', line 20

def max_parallel
  @max_parallel
end

#signature_file ⇒ Object

Returns the value of attribute signature_file.



# File 'lib/wmap/url_crawler/adware_tag.rb', line 15

def signature_file
  @signature_file
end

#tag_file ⇒ Object

Returns the value of attribute tag_file.



# File 'lib/wmap/url_crawler/adware_tag.rb', line 15

def tag_file
  @tag_file
end

#tag_signatures ⇒ Object (readonly)

Returns the value of attribute tag_signatures.



# File 'lib/wmap/url_crawler/adware_tag.rb', line 16

def tag_signatures
  @tag_signatures
end

#tag_store ⇒ Object (readonly)

Returns the value of attribute tag_store.



# File 'lib/wmap/url_crawler/adware_tag.rb', line 16

def tag_store
  @tag_store
end

#user_agent ⇒ Object

Returns the value of attribute user_agent.



# File 'lib/wmap/url_crawler.rb', line 20

def user_agent
  @user_agent
end

#verbose ⇒ Object

Returns the value of attribute verbose.



# File 'lib/wmap/url_crawler.rb', line 20

def verbose
  @verbose
end

#visited_urls_by_crawler ⇒ Object (readonly)

Returns the value of attribute visited_urls_by_crawler.



# File 'lib/wmap/url_crawler.rb', line 22

def visited_urls_by_crawler
  @visited_urls_by_crawler
end

Instance Method Details

#crawl(url) ⇒ Object Also known as: query

Crawl the given URL and return the list of URLs discovered under it; for example, by crawling ‘www.yahoo.com/’ it could discover ‘login.yahoo.com/’.



# File 'lib/wmap/url_crawler.rb', line 72

def crawl(url)
	puts "Start web crawling on #{url}"
	result=Array.new
	url=url.chomp.strip
	result.push(url_2_site(url))
	raise "Error! Invalid url format: #{urls}" unless is_url?(url)
	# Add logic to profile the web server before crawling; this is used to optimize the crawling speed
	pre_crawl(url)
	status = Timeout::timeout(Crawl_timeout/1000) {
		result+=crawl_worker(url).keys
	}
	puts "Web crawling time-out on #{url}: #{status}" if @verbose
	return result
rescue => ee
	puts "Exception on method #{__method__} for URL #{url}: #{ee}"
	return result
end
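
Example (a hedged sketch; the seed URL is only a placeholder):

require 'wmap'

crawler = Wmap::UrlCrawler.new
# Crawl a single seed URL; returns an array of the URLs found within the
# configured crawl depth and page limit.
urls = crawler.crawl("http://www.yahoo.com/")
puts urls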

#crawl_worker(url0) ⇒ Object

The crawler worker that performs the actual crawling work on a single URL.



# File 'lib/wmap/url_crawler.rb', line 92

def crawl_worker(url0)
	puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of contents."
	# Input URL sanity check first
	if is_url?(url0)
		host=url_2_host(url0)
		ip=host_2_ip(host).to_s
		raise "Invalid IP address: #{url0}" if ip.nil?
		port=url_2_port(url0).to_s
		raise "Invalid port number: #{url0}" if port.nil?
	else
		raise "Invalid URL: #{url0}. Please check it out with your browser again."
	end
	log_info=Hash.new
	log_info[1]="Start working on #{url0}"
	url_stores=Hash.new
	url_stores[url0]=true unless url_stores.key?(url0)
	@discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
	@crawl_start[url0]=true unless @crawl_start.key?(url0)
#			$discovered_urls[url0]=true unless $discovered_urls.key?(url0)
	@crawl_depth.times do
		url_stores.keys.each do |url|
			# 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
			next if @visited_urls_by_crawler.key?(url)
			url_object = open_url(url)
			next if url_object == nil
			url = update_url_if_redirected(url, url_object)
			url_body = read_url(url)
			# Protection code - to avoid parsing failure on the empty or nil object
			next if url_body.nil? or url_body.empty?
			url_stores[url]=true unless url_stores.key?(url)
			@discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
#					$discovered_urls[url]=true unless $discovered_urls.key?(url)
			doc = Nokogiri::HTML(url_body)
			next if doc == nil
			if url_stores.size >= @crawl_page_limit
				#@visited_urls_by_crawler.merge!(url_stores)
				@discovered_urls_by_crawler.merge!(url_stores)
#						$discovered_urls.merge!(url_stores)
				puts "Finish web crawling the url: #{url0}"
				return url_stores
			end
			page_urls = find_urls_on_page(doc, url)
			page_urls.uniq!
			page_urls.map do |y|
				y=normalize_url(y)
				url_stores[y]=true unless url_stores.key?(y)
				@discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
#						$discovered_urls[y]=true unless $discovered_urls.key?(y)
			end
		end
	end
	puts "Finish web crawling on: #{url0}"
	log_info[2]="Finish working on: #{url0}"
	wlog(log_info, "UrlCrawler", @log_file)
	@crawl_done[url0]=true unless @crawl_done.key?(url0)
	return url_stores
rescue => ee
	puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
	log_info[3]="Exception on #{url0}"
	wlog(log_info,"UrlCrawler",@log_file)
	return url_stores
end
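
Sketch of calling the worker directly (it is normally invoked through #crawl; the target URL is a placeholder):

require 'wmap'

crawler = Wmap::UrlCrawler.new
# The worker returns a Hash keyed by each discovered URL.
url_stores = crawler.crawl_worker("http://www.example.com/")
url_stores.keys.each { |u| puts u }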

#crawl_workers(targets, num = @max_parallel) ⇒ Object Also known as: crawls

Fast crawling method that uses a parallel fork manager to spawn a number of child processes at the same time; each child process continuously works on the target pool until all the work is done.



# File 'lib/wmap/url_crawler.rb', line 157

def crawl_workers (targets,num=@max_parallel)
	raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
	puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
	#puts "This could be awhile depending on the list size. Please be patient ..."
	# 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
	targets -= ["", nil]
	uniq_sites=Hash.new
	targets.dup.map do |target|
		if is_url?(target)
			host=url_2_host(target)
			ip=host_2_ip(host).to_s
			next if ip.nil?
			port=url_2_port(target).to_s
			next if port.nil?
			site_key=ip+":"+port
			unless uniq_sites.key?(site_key)
				uniq_sites[site_key]=target
			end
		end
	end
	puts "Sanitization done! " if @verbose
	puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
	puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
	raise "Error: target list is empty!" if targets.size < 1
	Parallel.map(uniq_sites.values, :in_processes => num) { |target|
		puts "Working on #{target} ..." if @verbose
		crawl(target)
	}.dup.each do |process|
		puts "process.inspect: #{process}" if @verbose
		urls=process
		urls-=["",nil] unless urls.nil?
		if urls.nil?
			next
		elsif urls.empty?
			next
			#do nothing
		else
			urls.map do |url|
				url.strip!
				@discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
				#$discovered_urls[url]=true unless $discovered_urls.key?(url)
			end
		end
	end
	#return sites
	return @discovered_urls_by_crawler.keys
rescue Exception => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end
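
Example (an illustrative sketch; the hostnames are placeholders):

require 'wmap'

crawler = Wmap::UrlCrawler.new
seeds = ["http://www.example.com/", "https://login.example.com/"]
# Spawn up to 10 child processes; seeds resolving to the same IP:port are de-duplicated first.
discovered = crawler.crawl_workers(seeds, 10)
puts discovered.size if discovered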

#crawl_workers_on_file(file) ⇒ Object Also known as: query_file, crawl_file

Fast crawling method that builds the target pool from the input file.



# File 'lib/wmap/url_crawler.rb', line 210

def crawl_workers_on_file (file)
	puts "Web crawl the list of targets from file: #{file}"
	targets=file_2_list(file)
	sites=crawl_workers(targets,num=@max_parallel)
	return sites
rescue => ee
   puts "Exception on method #{__method__}: #{ee}" if @verbose
   return nil
end
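
Example (a short sketch; 'seeds.txt' is a hypothetical plain-text file with one URL per line):

require 'wmap'

crawler = Wmap::UrlCrawler.new
sites = crawler.crawl_workers_on_file("seeds.txt")
crawler.save_discovered_urls("discovered_urls.txt") unless sites.nil?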

#get_discovered_sites_by_crawler ⇒ Object Also known as: get_sites

Method to retrieve the sites discovered by the crawler.



# File 'lib/wmap/url_crawler.rb', line 328

def get_discovered_sites_by_crawler
	puts "Print summary report of discovered sites. " if @verbose
	puts "\nSummary Report of Discovered Sites from the Crawler:"
	sites = Hash.new
	@discovered_urls_by_crawler.keys.each do |url|
		site=url_2_site(url)
		sites[site]=true unless sites.key?(site)
	end
	sites.keys.map { |site| puts site }
	puts "Total: #{sites.size}"
	puts "End of the summary"
	return sites.keys
 rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
   return nil
end
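
Example (hedged; assumes a crawl has already been run on this instance):

require 'wmap'

crawler = Wmap::UrlCrawler.new
crawler.crawl("http://www.example.com/")
sites = crawler.get_discovered_sites_by_crawler   # alias: get_sites
puts "Unique sites discovered: #{sites.size}" if sites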

#pre_crawl(url) ⇒ Object

Pre-crawl profiler, used for network profiling to maximize the crawler performance.



# File 'lib/wmap/url_crawler.rb', line 52

def pre_crawl(url)
	begin
		puts "Perform network profiling works on the web server before the web crawling: #{url}" if @verbose
		host=url_2_host(url)
		# Use the following formula to 'guess' the right http time-out threshold for the scanner
		nwk_to=Wmap::NetworkProfiler.new.profile(host).to_i
		if (1500 + Wmap::NetworkProfiler.new.profile(host)*2).to_i > Max_http_timeout
			@http_timeout = Max_http_timeout
		else
			@http_timeout = 1500 + nwk_to*2
		end
		puts "Done with the pre-scan works: reset @http_timeout to: #{@http_timeout} ms" if @verbose
	rescue Exception => ee
		puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
		@http_timeout = Max_http_timeout
	end
end
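
In effect the heuristic is @http_timeout = min(1500 + 2 * round_trip_ms, Max_http_timeout). A small illustration with a made-up latency value:

nwk_to  = 450                             # hypothetical profiled round-trip time, in ms
timeout = [1500 + nwk_to * 2, 8000].min   # => 2400 ms; a 4000 ms round trip would clamp to 8000 ms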

#print_discovered_urls_by_crawler ⇒ Object

Method to print out the URLs discovered by the crawler.



# File 'lib/wmap/url_crawler.rb', line 302

def print_discovered_urls_by_crawler
	puts "Print discovered url by the crawler. " if @verbose
	puts "\nSummary Report of Discovered URLs from the Crawler:"
	@discovered_urls_by_crawler.keys.each do |url|
		puts url
	end
	puts "Total: #{@discovered_urls_by_crawler.keys.size}"
	puts "End of the summary"
 rescue => ee
   puts "Exception on method #{__method__}: #{ee}" if @verbose
   return nil
end
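
Example (hedged one-liner, reusing a crawler instance that has already run):

require 'wmap'

crawler = Wmap::UrlCrawler.new
crawler.crawl("http://www.example.com/")
crawler.print_discovered_urls_by_crawler   # prints each discovered URL plus a total count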

#save_discovered_urls(file) ⇒ Object Also known as: save

Method to save the URL discovery results to a file.



# File 'lib/wmap/url_crawler.rb', line 317

def save_discovered_urls (file)
	puts "Save discovered urls by the crawler to file: #{file} "
	list_2_file(@discovered_urls_by_crawler.keys, file)
	puts "Done!"
 rescue => ee
   puts "Exception on method #{__method__}: #{ee}" if @verbose
   return nil
end
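
Example (a short sketch; the output path is hypothetical):

require 'wmap'

crawler = Wmap::UrlCrawler.new
crawler.crawl("http://www.example.com/")
crawler.save_discovered_urls("/tmp/wmap_discovered_urls.txt")   # alias: save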