Class: Apollo::Crawler::BaseCrawler

Inherits: Object
Defined in:
lib/apollo_crawler/crawler/base_crawler.rb

Class Method Summary

Instance Method Summary

Constructor Details

#initialize ⇒ BaseCrawler

Returns a new instance of BaseCrawler.



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 32

def initialize
	@backlog = []
	@visited = []
end

Class Method Details

.create_metadoc(url, doc) ⇒ Object



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 172

def self.create_metadoc(url, doc)
	body = doc[:body].encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'})
	
	return {
		'url' => url,
		'doc' => body,
		'hash' => Digest::SHA256.new.update(body).hexdigest,
		'created_at' => Time.now.utc,
		'expires_at' => nil,
		'version' => 0
	}
end
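
A minimal usage sketch of the returned structure; the require name, URL and document body below are placeholders, not taken from the gem's own documentation:

require 'apollo_crawler'

doc = { :body => "<html><body>Hello</body></html>" }
metadoc = Apollo::Crawler::BaseCrawler.create_metadoc('http://example.com/', doc)

metadoc['hash']        # SHA-256 hex digest of the UTF-8 re-encoded body
metadoc['created_at']  # Time.now.utc at creation time
metadoc['version']     # 0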

.fetch(url) ⇒ Object



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 50

def self.fetch(url)
	RbConfig::DEFAULT_FETCHER.fetch(url)
end

.name_re ⇒ Object



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 37

def self.name_re()
	return /crawler$/
end

.try_get_doc(root, url) ⇒ Object



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 62

def self.try_get_doc(root, url)
	doc = BaseCrawler.try_get_url(root, url)
	
	# TODO: Set expiration header
	return {
		:doc => doc,
		:url => url
	}
end

.try_get_url(root, url) ⇒ Object



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 54

def self.try_get_url(root, url)
	begin
		return URI.join(root, url)
	rescue
		return nil
	end
end
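
A short sketch of how the two helpers behave (the example.com URLs are placeholders). Note that .try_get_doc does not fetch anything; it only wraps the URI returned by .try_get_url:

Apollo::Crawler::BaseCrawler.try_get_url('http://example.com/', 'docs/index.html')
# => #<URI::HTTP http://example.com/docs/index.html>

Apollo::Crawler::BaseCrawler.try_get_url('http://example.com/', 'not a valid url')
# => nil (URI.join raised, the rescue swallows it)

Apollo::Crawler::BaseCrawler.try_get_doc('http://example.com/', 'docs/index.html')
# => { :doc => #<URI::HTTP http://example.com/docs/index.html>, :url => 'docs/index.html' }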

Instance Method Details

#enqueue_url(url) ⇒ Object



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 130

def enqueue_url(url)
	urls = []
	return urls if url.nil?
	# We support both a list of URLs and a single URL
	if(url.kind_of?(Array))
		urls = urls.concat(url)
	else
		urls << url
	end

	urls.each do |u|
		if(url_processed?(u) == false)
			@backlog << u
		end
	end
end
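
A brief sketch of the queueing behaviour; the URLs are placeholders:

crawler = Apollo::Crawler::BaseCrawler.new
crawler.enqueue_url('http://example.com/')                            # single URL
crawler.enqueue_url(['http://example.com/', 'http://example.com/a'])  # list of URLs
# 'http://example.com/' is already in the backlog, so only 'http://example.com/a' is added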

#etl(url = nil, opts = {}, &block) ⇒ Object

  • (0) Figure out URL

  • (1) Extract Data

  • (2) Extract Links

  • (3) Go to (0) eventually



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 76

def etl(url=nil, opts={}, &block)
	# Use the passed URL, fall back to the default, and fail if neither is valid
	if(url.nil? || url.empty?)
		url = self.url
	end

	# TODO: Be more aggressive, use assert, it is the client's responsibility!
	if(url.nil?)
		return nil
	end

	enqueue_url(url)

	# Counter of processed documents (pages)
	docs_processed = 0

	res = []
	# TODO: Respect limit of documents/urls processed
	while(@backlog.empty? == false)
		url = @backlog.shift

		# puts "Processing '#{url}'"
		doc = self.process_url(url)
		
		# Increase counter of processed documents
		docs_processed = docs_processed + 1

		@visited << url

	# Process the document if it was successfully retrieved
		if(!doc.nil?)
			# TODO: Use log4r and log it only on info level
			if block_given?
				yield doc
			end

			# Add document to queue of results
			res << doc

			enqueue_url(doc[:links].map(){ |l| l[:link] }) if doc[:links]
		end

	# Break if the limit of documents to process was reached
		break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
	end

	# Return the processed documents
	return res
end
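
A hypothetical usage sketch. MyCrawler stands for a subclass that overrides #url, #extract_data and #extract_links (see below); the URL and the :doc_limit value are illustrative only:

crawler = MyCrawler.new
results = crawler.etl('http://example.com/', { :doc_limit => 10 }) do |doc|
	puts doc[:data].inspect  # the block is yielded once per successfully processed document
end
# results is an array of { :crawler => ..., :data => ..., :links => ... } hashes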

#extract_data(doc) ⇒ Object

Extracts data from the document



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 228

def extract_data(doc)
	res = []
	return res
end

#extract_links(doc) ⇒ Object

Extracts links to other documents from this document



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 234

def extract_links(doc)
	res = []
	return res
end
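
Both methods are meant to be overridden in subclasses; the base implementations simply return an empty array. A minimal hypothetical subclass (class name, URL and CSS selectors are illustrative, not part of the gem):

class MyCrawler < Apollo::Crawler::BaseCrawler
	def url
		'http://example.com/'
	end

	# Return an array of extracted records
	def extract_data(doc)
		doc.css('h1').map { |node| { :text => node.text } }
	end

	# Return links as hashes with a :link key, which is what #etl expects
	def extract_links(doc)
		doc.css('a').map { |node| { :link => node['href'] } }
	end
end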

#fetch_document(url) ⇒ Object

Fetches the document for the given URL, using the configured cache and retrying failed fetches



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 186

def fetch_document(url)
	# TODO: Refactor following idiom
	if(url == nil)
		url = self.url
	end

	if(url.nil?)
		return nil
	end

	url = url.to_s

	# TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
	cache = Apollo::Cache::Factory.instance.construct
	metadoc = cache.try_get(url) do
		max_attempts = 3
		attempt_no = 0
		success = false
		
		doc = nil
		while(attempt_no < max_attempts && success == false) do
			begin
				doc = BaseCrawler.fetch(url)
				success = true
			rescue Exception => e
				puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
				sleep 1

				attempt_no = attempt_no + 1
				success = false
			end
		end

		# Create metadata
		BaseCrawler.create_metadoc(url, doc)
	end

	# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
	return Nokogiri::HTML(metadoc['doc'])
end
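
In short: the URL is looked up in the configured cache first; on a miss it is fetched with up to three attempts (sleeping one second between failures), wrapped into a metadoc and finally parsed with Nokogiri. A hypothetical call, assuming a cache factory is configured and using a placeholder URL:

crawler = Apollo::Crawler::BaseCrawler.new
doc = crawler.fetch_document('http://example.com/')
puts doc.css('title').text unless doc.nil?  # doc is a parsed Nokogiri document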

#name ⇒ Object

Name of the crawler



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 42

def name
	return "Crawler Base" 
end

#process_url(url) ⇒ Object



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 147

def process_url(url)
	doc = self.fetch_document(url)
	if(doc.nil?)
		return nil
	end

	# Try to extract data from the document
	data = self.extract_data(doc)

	# Try to extract links to other documents
	links = self.extract_links(doc)
	
	# TODO: Make configurable if links extracted from doc should be printed
	# puts links.inspect

	# Format ETL result
	res = { 
		:crawler => self.class.name,
		:data => data,
		:links => links
	}

	return res
end
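
A sketch of the returned structure; the URL is a placeholder and the empty arrays reflect the base implementations of #extract_data and #extract_links:

crawler = Apollo::Crawler::BaseCrawler.new
res = crawler.process_url('http://example.com/')
# => { :crawler => "Apollo::Crawler::BaseCrawler", :data => [], :links => [] }
# => nil when the document could not be fetched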

#url ⇒ Object



# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 46

def url
	return nil
end

#url_processed?(url) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 126

def url_processed?(url)
	return @backlog.include?(url) || @visited.include?(url)
end
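
A URL counts as processed when it is already queued in the backlog or has been visited; for example (placeholder URLs):

crawler = Apollo::Crawler::BaseCrawler.new
crawler.enqueue_url('http://example.com/')
crawler.url_processed?('http://example.com/')       # => true  (still queued in the backlog)
crawler.url_processed?('http://example.com/other')  # => false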