Class: LinkScrapper

Inherits:
Object
  • Object
show all
Defined in:
lib/link_scrapper.rb

Overview

Class for grabbing and parsing domain links.

Instance Method Summary collapse

Constructor Details

#initialize(search_domain = SEARCH_DOMAIN) ⇒ LinkScrapper

Returns a new instance of LinkScrapper.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/link_scrapper.rb', line 10

# Builds a new LinkScrapper and immediately starts the crawl.
#
# search_domain - the domain to crawl; ARGV[0] takes precedence when present.
#                 Pass the sentinel 'ue' to prompt the user interactively.
def initialize(search_domain = SEARCH_DOMAIN)

	# init link store hashes
	@search_index = 0
	@search_iteration = 0
	@links = Array.new
	@checked_links = Hash.new
	@error_links = Hash.new
	@external_links = Hash.new

	# gather search domain: command line beats sentinel beats argument
	if ARGV[0]
		@search_domain = ARGV[0].dup
	elsif search_domain == 'ue'
		# NOTE(fix): this branch was unreachable before -- the generic truthy
		# `elsif search_domain` test caught the 'ue' sentinel first
		puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
		@search_domain = gets.chomp
	elsif search_domain
		# dup so the in-place normalization below never mutates the caller's
		# string (or the SEARCH_DOMAIN constant)
		@search_domain = search_domain.dup
	end

	# override with default domain if entry is left empty
	@search_domain = SEARCH_DOMAIN.dup if @search_domain == ''

	# get and store local domain string (MatchData; [0] is e.g. "example.com")
	@local_domain = @search_domain.match(/\w+\.\w+(?=\/|\s|$)/)

	# normalize the domain itself: relative links are later built as
	# "#{@search_domain}#{path}", so it must carry protocol + trailing slash
	if @search_domain !~ /^htt(p|ps):/
		@search_domain.insert(0, 'http://')
	end

	# verify trailing forward slash
	if @search_domain[@search_domain.length-1] != '/'
		@search_domain << '/'
	end

	# initial page to fetch is the domain root; dup breaks the aliasing the
	# original code relied on, so later reassignments can't corrupt the domain
	@search_uri = @search_domain.dup

	# start scan
	get_links
end

Instance Method Details

#get_linksObject

gather link data



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/link_scrapper.rb', line 129

# Crawls pages starting from @search_uri: fetches each page, extracts its
# links into @links, and records response code/time in @checked_links.
# Runs until get_search_uri exhausts @links (which saves results and exits).
# Implemented as a loop rather than self-recursion so long crawls cannot
# overflow the stack.
def get_links

	loop do

		# init skip bit
		@skip = 0

		# define search uri if undefined
		get_search_uri

		# skip any uri that was already checked
		if @checked_links[@search_uri.to_sym]
			@skip = 1
		end

		# run link scan if @skip bit is not set
		if @skip == 0

			# let user know which uri is currently active
			puts @search_uri

			# gather page request response
			begin
				t1 = Time.now
				# FIXME: URI.encode was removed in Ruby 3.0; use
				# URI::DEFAULT_PARSER.escape when upgrading
				response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri.strip)))
				delta = Time.now - t1

				# store response page body and code
				body = response.body
				code = response.code

				# extract [href, anchor text] pairs for every link in the page
				links_array = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)

				# rewrite page-relative hrefs into absolute ones.
				# NOTE(fix): the original reassigned the block parameter (a no-op)
				# and OR-ed the conditions (always true); mutate the pair in place
				# and only prefix genuinely relative hrefs.
				links_array.each { |pair|
					href = pair[0]
					# leave root-relative ('/...'), protocol-agnostic ('//...')
					# and direct ('http...') links untouched
					next if href[0] == '/' || href =~ /^htt(p|ps):/
					pair[0] = "#{@search_uri}#{href}"
				}

				# combine found links with links array, dropping duplicates
				@links.concat(links_array)
				@links.uniq!

			rescue => ex
				# NOTE(fix): the original assigned an unused `rescode` here and
				# then stored nil; record the failure as a 408 instead
				code = 408
				delta = nil
			end

			# store results in checked hash
			@checked_links[@search_uri.to_sym] = {res: code, time: delta}

		end

		# iterate through found links
		@search_iteration += 1
	end

end

#get_search_uriObject

gather search uri



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/link_scrapper.rb', line 54

# Pulls the next candidate link out of @links and normalizes it into
# @search_uri (an absolute URL), setting @skip = 1 for links that should not
# be fetched (already-probed external links, media files, mailto:/tel:).
# Saves results and exits the process when @links is exhausted.
def get_search_uri
	# do not override initial domain setting
	if @search_iteration > 0
		# set search uri from the [href, text] pair at the current index
		if !@links[@search_index].nil?
			@search_uri = @links[@search_index][0].chomp
		else
			# save results and exit
			save_results
			exit
		end

		# check for direct link
		if @search_uri =~ /^htt(p|ps):/
			# if external link, probe it once and go to next link
			if @search_uri.index(@local_domain[0]) == nil
				if !@external_links[@search_uri.to_sym]
					begin
						t1 = Time.now
						response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri)))
						delta = Time.now - t1
						rescode = response.code
					rescue => ex
						rescode = 408
						delta = nil
					end
					@external_links[@search_uri.to_sym] = {res: rescode, time: delta}
				end
				@skip = 1
			end
		else

			# skip media/binary files that cannot contain links.
			# NOTE(fix): the original alternation (\.(?i)flv|gif|...) only tied
			# the dot to "flv", so any uri merely ending in "gif" etc. matched
			if @search_uri =~ /\.(flv|gif|jpg|png|mp3|mp4|m4v|pdf|zip|txt)\z/i
				@skip = 1
			end

			# check for mailto / tel link
			if @search_uri[0,7] == 'mailto:' || @search_uri[0,4] == 'tel:'
				@skip = 1
			else
				if @search_uri[0,2] == '//'
					# protocol-agnostic link: inherit the search domain's protocol.
					# NOTE(fix): the original stripped the '//' first, which made
					# this branch dead code and mis-built the final uri
					if @search_domain[0,6] == 'https:'
						@search_uri = "https:#{@search_uri}"
					else
						@search_uri = "http:#{@search_uri}"
					end
				else
					# strip indirect prefixes; note './' and '../' are treated as
					# site-root-relative (no real path resolution is performed)
					if @search_uri[0,2] == './'
						@search_uri[0,2] = ""
					elsif @search_uri[0,3] == '../'
						@search_uri[0,3] = ""
					end
					# strip leading slash of a root-relative link
					if @search_uri[0] == '/'
						@search_uri[0] = ''
					end
					# verify uri portion is valid
					if @search_uri !~ /^([\w]|%|#|\?)/
						@search_index += 1
						@skip = 1
						puts "invalid uri #{@search_uri}"
						return
					end
					# build the absolute uri from the normalized domain
					@search_uri = "#{@search_domain}#{@search_uri}"
				end
			end
		end
		# increment search index value
		@search_index += 1
	end
end

#save_resultsObject

save results to csvs



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/link_scrapper.rb', line 193

# Persists the crawl outcome to two CSV files in the working directory:
# results.csv for pages on the local domain, external-links.csv for
# off-domain links that were probed. Each row is [uri, response code, time].
def save_results
	# write the crawl results for local pages
	CSV.open('results.csv', 'wb') do |csv|
		@checked_links.each do |uri, data|
			csv << [uri, data[:res], data[:time]]
		end
	end

	# write the external links that were probed
	CSV.open('external-links.csv', 'wb') do |csv|
		@external_links.each do |uri, data|
			csv << [uri, data[:res], data[:time]]
		end
	end
end