Class: Apollo::CrawlerProgram

Inherits:
BaseProgram show all
Defined in:
lib/apollo_crawler/program/crawler_program.rb

Constant Summary

Constants inherited from BaseProgram

BaseProgram::CONFIG_DIR, BaseProgram::DEFAULT_OPTIONS

Instance Attribute Summary

Attributes inherited from BaseProgram

#amqp, #config, #mongo, #options, #optparser

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from BaseProgram

get_config_path, #init_amqp, #init_mongo, #init_seeds, #init_seeds_crawlers, #load_config, #load_configs, require_files

Constructor Details

#initialize ⇒ CrawlerProgram

Initializer - Constructor



70
71
72
73
74
75
76
77
78
# File 'lib/apollo_crawler/program/crawler_program.rb', line 70

# Build the program object: run the parent initializer, start from an empty
# options hash and hook #at_exit_handler into interpreter shutdown.
def initialize
	super

	# Options start out empty; #init_options fills in the defaults.
	@options = {}

	at_exit { at_exit_handler }
end

Class Method Details

.console_table(headings, rows) ⇒ Object

Show tabular data in form of CLI table



98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/apollo_crawler/program/crawler_program.rb', line 98

# Print a CLI table with one row per class in +rows+.
#
# Every class is instantiated without arguments and each heading string is
# evaluated against that instance (instance_eval) to produce the cell value.
#
# @param headings [Array<String>] column titles, also the expressions evaluated per instance
# @param rows [Array<Class>] classes to instantiate and describe
# @return [nil] the result of +puts+
def self.console_table(headings, rows)
	cells = rows.map do |klass|
		instance = klass.new
		headings.map { |expression| instance.instance_eval(expression) }
	end

	puts Terminal::Table.new(:headings => headings, :rows => cells)
end

.get_modules_paths(modules = APOLLO_CRAWLER_MODULES) ⇒ Object



80
81
82
83
84
85
86
87
88
# File 'lib/apollo_crawler/program/crawler_program.rb', line 80

# Collect the Ruby source files of the given crawler modules.
#
# Each name is treated as a sub-directory of APOLLO_CRAWLER_BASE_DIR and every
# "*.rb" file directly inside it is collected.
#
# @param modules [Array<String>] sub-directory names below APOLLO_CRAWLER_BASE_DIR
# @return [Array<String>] the matching file paths, sorted
def self.get_modules_paths(modules = APOLLO_CRAWLER_MODULES)
	# Dir[] already returns the list of paths, so no inner iteration is needed.
	paths = modules.flat_map do |name|
		Dir[File.join(APOLLO_CRAWLER_BASE_DIR, name, "*.rb")]
	end

	paths.sort
end

.register_modules(modules = APOLLO_CRAWLER_MODULES) ⇒ Object



90
91
92
93
94
95
# File 'lib/apollo_crawler/program/crawler_program.rb', line 90

# Require every Ruby file that belongs to the given crawler modules.
#
# @param modules [Array<String>] module names, resolved to files by get_modules_paths
# @return [Array<String>] the paths that were handed to +require+
def self.register_modules(modules = APOLLO_CRAWLER_MODULES)
	module_files = get_modules_paths(modules)
	module_files.each { |module_file| require module_file }
end

Instance Method Details

#at_exit_handler ⇒ Object

At Exit handler



511
512
513
514
515
516
517
518
519
520
521
# File 'lib/apollo_crawler/program/crawler_program.rb', line 511

# Hook executed at interpreter shutdown (registered by the constructor).
# It is currently a placeholder and performs no work.
def at_exit_handler
	# TODO: Flush caches
	# TODO: End gracefully
	# TODO: Decide whether EventMachine has to be stopped here; the
	#       EventMachine.stop call is currently disabled.
	nil
end

#generate_crawler(name, url = nil, matcher = nil, options = @options) ⇒ Object



265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/apollo_crawler/program/crawler_program.rb', line 265

# Generate a crawler scaffold in the current working directory.
#
# The template at RbConfig::CRAWLER_TEMPLATE_PATH is copied line by line to
# "<underscored name>.rb" with the CRAWLER_* placeholders substituted.
#
# @param name [String] crawler name; titleized and stripped of spaces to form the class name
# @param url [String, nil] start URL written into the scaffold (a placeholder URL when nil)
# @param matcher [String, nil] matcher written into the scaffold ("//a" when nil)
# @param options [Hash] program options; :verbose and :silent are consulted
# @return [Integer] 0 on success, -1 when the template file is missing
def generate_crawler(name, url = nil, matcher = nil, options = @options)
	name = name.titleize.gsub(" ", "")

	# Read the +options+ argument (not @options) so an explicit hash is honoured.
	puts "Generating new crawler '#{name}'" if options[:verbose]

	template_path = RbConfig::CRAWLER_TEMPLATE_PATH
	# File.exist? is used because File.exists? was removed in Ruby 3.2.
	unless File.exist?(template_path)
		puts "Template file '#{template_path}' does not exists!"
		return -1
	end

	puts "Using template '#{template_path}'" if options[:verbose]

	# The destination is needed in every mode. It used to be assigned only when
	# :silent was off, which made silent runs fail in File.open(nil).
	dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")

	placeholders = {
		"CRAWLER_CLASS_NAME" => name,
		"CRAWLER_NAME" => name.titleize,
		"CRAWLER_URL" => url || "http://some-url-here",
		"CRAWLER_MATCHER" => matcher || "//a"
	}

	# Progress output is suppressed in silent mode.
	unless options[:silent]
		puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
	end

	File.open(dest_path, 'w') do |crawler|
		File.foreach(template_path) do |line|
			# Block form inserts the value literally; a plain replacement string
			# would expand backslash sequences such as "\1" in a URL or matcher.
			placeholders.each { |placeholder, value| line.gsub!(placeholder) { value } }
			crawler.puts line
		end
	end

	0
end

#get_crawlers(args, options = @options) ⇒ Object

Get crawlers passed on the command line



363
364
365
366
367
368
369
370
371
372
373
374
# File 'lib/apollo_crawler/program/crawler_program.rb', line 363

# Determine which crawlers were requested on the command line.
#
# The first remaining argument (if any) is consumed from +args+ as the crawler
# name. When options[:run_all] is set, the keys of @crawlers are returned
# instead.
#
# NOTE(review): @crawlers is not assigned anywhere in the code visible here -
# confirm it is populated before the run-all option is used.
#
# @param args [Array<String>] remaining command-line arguments (mutated via shift)
# @param options [Hash] program options; :run_all is consulted
# @return [Array] the requested crawler names
def get_crawlers(args, options = @options)
	requested = args.empty? ? [] : [args.shift]
	requested = @crawlers.keys if options[:run_all]
	requested
end

#get_crawlers_by_name(crawlers, crawler_classes = Apollo::Crawler::BaseCrawler.subclasses) ⇒ Object



315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# File 'lib/apollo_crawler/program/crawler_program.rb', line 315

# Resolve crawler names to crawler classes.
#
# Names are compared case-insensitively against the last namespace segment of
# each class, with or without the "crawler" part. For every name the first
# matching class is taken; names without a match are dropped.
#
# @param crawlers [Array] requested crawler names (nil entries are skipped)
# @param crawler_classes [Array<Class>] candidate classes (nil entries are skipped)
# @return [Array<Class>] matched classes, in the order of +crawlers+
def get_crawlers_by_name(crawlers, crawler_classes = Apollo::Crawler::BaseCrawler.subclasses)
	crawlers.each_with_object([]) do |crawler, matches|
		next if crawler.nil?

		hit = crawler_classes.find do |klass|
			next false if klass.nil?

			requested = crawler.to_s.split('::').last.downcase
			short_name = klass.to_s.split('::').last.downcase.gsub("crawler", "")

			requested == short_name || requested == "#{short_name}crawler"
		end

		matches << hit unless hit.nil?
	end
end

#init_additional_crawlers(dirs) ⇒ Object



401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
# File 'lib/apollo_crawler/program/crawler_program.rb', line 401

# Require additional crawler sources.
#
# Each entry may be a directory (every "*.rb" file directly inside it is
# required) or a single Ruby file. Paths are expanded to absolute form before
# being required, because +require+ resolves a bare relative path such as
# "crawlers/foo.rb" against $LOAD_PATH rather than the working directory.
#
# @param dirs [Array<String, nil>] directories or files to load; nil entries are skipped
# @return [Array] the +dirs+ argument
def init_additional_crawlers(dirs)
	dirs.each do |dir|
		# A nil entry would otherwise be interpolated into the glob "/*.rb".
		next if dir.nil?

		puts "Registering additional crawler dir '#{dir}'" if @options[:verbose]

		crawler_files = File.file?(dir) ? [dir] : Dir.glob("#{dir}/*.rb")
		crawler_files.each do |f|
			puts "Registering crawler '#{f}'" if @options[:verbose]
			require File.expand_path(f)
		end
	end
end

#init_options ⇒ Object

Initialize command-line options



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/apollo_crawler/program/crawler_program.rb', line 114

# Populate @options with the default values used before the command line is
# parsed: environment, document limit, verbosity, version flag and the
# default cache / crawler / formatter directories.
#
# @return [nil]
def init_options
	@options.update(
		:env => Apollo::ENV,
		:doc_limit => nil,
		:verbose => false,
		:version => nil,
		:cache_dirs => [RbConfig::CACHES_DIR],
		:crawler_dirs => [RbConfig::CRAWLERS_DIR],
		:formatter_dirs => [RbConfig::FORMATTERS_DIR]
	)

	# No scaffold generation is requested by default.
	@options[:generate_crawler] = nil
end

#init_options_parser ⇒ Object



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/apollo_crawler/program/crawler_program.rb', line 136

# Build the OptionParser for the command-line interface and store it in
# @optparser. Each switch handler records its value in @options; the include
# switch additionally loads the given crawler path right away.
#
# @return [OptionParser] the configured parser
def init_options_parser
	@optparser = OptionParser.new do |parser|
		parser.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"

		parser.separator ""
		parser.separator "Specific options:"

		# Help is only flagged here; the help screen itself is printed later.
		parser.on('-h', '--help', 'Display this screen') { @options[:show_help] = true }

		parser.on('-a', '--all', 'Run all crawlers') { @options[:run_all] = true }

		parser.on('-e', '--environment [NAME]', "Environment used, default '#{@options[:env]}'") { |name| @options[:env] = name }

		parser.on('-f', '--format [NAME]', "Formatter used") { |name| @options[:formatter] = name }

		parser.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") { |name| @options[:generate_crawler] = name }

		parser.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
			@options[:crawler_dirs] << path

			# Load the extra crawlers immediately, while options are still being parsed.
			init_additional_crawlers([path])
		end

		parser.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') { |count| @options[:doc_limit] = count.to_i }

		parser.on('-v', '--verbose', 'Enable verbose output') { @options[:verbose] = true }

		parser.on('-V', '--version', 'Show version info') { @options[:version] = true }

		parser.on('-l', '--list-crawlers', 'List of crawlers') { @options[:list_crawlers] = true }

		parser.on(nil, '--list-formatters', 'List of formatters available') { @options[:list_formatters] = true }

		# A '-q / --query [QUERY]' switch is planned but not enabled yet.

		parser.on('-s', '--silent', 'Silent mode - do not print processed document') { @options[:silent] = true }
	end
end

#init_program(args) ⇒ Object

Init program



418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
# File 'lib/apollo_crawler/program/crawler_program.rb', line 418

# Prepare the program for a run: set default options, build the option
# parser, register the built-in modules, parse the command line, prepare the
# program directory, load the config file and finally act on the options.
#
# @param args [Array<String>] command-line arguments (switches are removed while parsing)
# @return [Object, nil] the value of #process_options - non-nil when an option
#   has already been handled, nil when the run should continue
def init_program(args)
	init_options
	init_options_parser

	CrawlerProgram.register_modules

	parse_options(args)

	init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES)
	load_config_file

	# Returning the result directly covers both the handled and the nil case.
	process_options(args)
end

#init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, options = @options) ⇒ Object



376
377
378
379
380
381
382
383
384
385
386
387
388
# File 'lib/apollo_crawler/program/crawler_program.rb', line 376

# Create the program directories that do not exist yet and then set up the
# user config file inside +base_dir+.
#
# @param base_dir [String] directory that receives the user "config.rb"
# @param dirs [Array<String>] directories to create when missing
# @param options [Hash] program options; :verbose is consulted
# @return [Object] whatever #init_user_config_file returns
def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, options = @options)
	dirs.each do |dir|
		next if File.directory?(dir)

		puts "Creating '#{dir}'" if options[:verbose]
		FileUtils.mkpath(dir)
	end

	# The config template sits next to this source file.
	template_path = File.join(File.dirname(__FILE__), 'config_user.trb')
	user_config_path = File.join(base_dir, 'config.rb')
	init_user_config_file(template_path, user_config_path)
end

#init_user_config_file(config_path, dest_path, options = @options) ⇒ Object



390
391
392
393
394
395
396
397
398
399
# File 'lib/apollo_crawler/program/crawler_program.rb', line 390

# Copy the user config template to its destination, but only when the
# template exists and the destination has not been created yet (an existing
# user config is never overwritten).
#
# @param config_path [String] path of the config template
# @param dest_path [String] path of the user config file to create
# @param options [Hash] program options; :verbose is consulted
# @return [nil]
def init_user_config_file(config_path, dest_path, options = @options)
	# File.exist? is used because File.exists? was removed in Ruby 3.2.
	return unless File.exist?(config_path) && !File.exist?(dest_path)

	puts "Creating user config file '#{config_path}' => '#{dest_path}'" if options[:verbose]

	FileUtils.cp(config_path, dest_path)
end

#load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH) ⇒ Object

Load global options first, then merge them with local options (if they exist).



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/apollo_crawler/program/crawler_program.rb', line 247

# Load the program config file by requiring it, if it exists.
#
# @param config [String] path of the config file
# @return [Boolean, nil] the result of +require+ when the file exists, nil otherwise
def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
	# File.exist? is used because File.exists? was removed in Ruby 3.2.
	if File.exist?(config)
		puts "Loading config '#{config}'" if @options[:verbose]

		# Expand first: require resolves a bare relative path against $LOAD_PATH,
		# not against the working directory the existence check just used.
		require File.expand_path(config)
	elsif @options[:verbose]
		# TODO: Add support for initial rake task generation, e.g.
		#       rake config:init # Initializes config files with their
		#       defaults (if they do not exist already)
		puts "Default config does not exist, skipping - '#{config}'"
	end
end

#parse_options(args = ARGV) ⇒ Object

Parse the options passed to command-line



202
203
204
205
206
207
208
209
# File 'lib/apollo_crawler/program/crawler_program.rb', line 202

# Parse the command line with the parser stored in @optparser.
#
# OptionParser#parse! is the destructive variant: recognised switches and
# their values are removed from +args+, so only the positional arguments
# remain afterwards.
#
# @param args [Array<String>] command-line arguments (modified in place)
# @return [Array<String>] the arguments left after the switches were removed
def parse_options(args = ARGV)
	@optparser.parse!(args)
end

#process_docs_handler(docs, options = options, formatter) ⇒ Object



492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
# File 'lib/apollo_crawler/program/crawler_program.rb', line 492

# Print crawled documents through the given formatter.
#
# @param docs [Object, Array, nil] a single document or a list of documents
# @param options [Hash] program options; printing is skipped when :silent is true
# @param formatter [#format] object whose +format+ result is printed for each document
# @return [Array, nil] the documents as an array, or nil when +docs+ is nil
def process_docs_handler(docs, options = @options, formatter)
	# The default used to be the circular "options = options", which evaluates
	# to nil and broke the two-argument call; @options matches the sibling methods.
	return docs if docs.nil?

	# Wrap manually: Array(docs) would split a Hash document into key/value pairs.
	docs = [docs] unless docs.kind_of?(Array)

	unless options[:silent] == true
		docs.each { |doc| puts formatter.format(doc) }
	end

	docs
end

#process_options(args) ⇒ Object



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/apollo_crawler/program/crawler_program.rb', line 211

# Act on options that replace a normal crawler run: version, help, scaffold
# generation and the formatter / crawler listings. The first option that is
# set wins, in that order.
#
# @param args [Array<String>] positional arguments; the first two are used as
#   URL and matcher when generating a crawler
# @return [Integer, Object, nil] 0 after printing, the result of
#   #generate_crawler when generating, nil when no such option is set
def process_options(args)
	if @options[:version]
		puts Apollo::VERSION
		0
	elsif @options[:show_help]
		puts @optparser
		0
	elsif (crawler_name = @options[:generate_crawler])
		# Missing positions simply yield nil.
		start_url, matcher = args[0], args[1]
		generate_crawler(crawler_name, start_url, matcher)
	elsif @options[:list_formatters]
		formatters = Apollo::Formatter::BaseFormatter.subclasses
		CrawlerProgram.console_table(['name', 'self.class'], formatters)
		0
	elsif @options[:list_crawlers]
		crawlers = Apollo::Crawler::BaseCrawler.subclasses
		CrawlerProgram.console_table(['name', 'self.class'], crawlers)
		0
	end
end

#request_exit(code = 0) ⇒ Object



482
483
484
485
486
487
488
489
490
# File 'lib/apollo_crawler/program/crawler_program.rb', line 482

# Request program termination and hand back the exit code.
#
# The SystemExit raised by +exit+ is rescued right here, so this method does
# not terminate the process itself; the caller receives +code+ instead.
#
# @param code [Integer] exit code to hand back to the caller
# @return [Integer] +code+, unchanged
def request_exit(code = 0)
	begin
		exit(0)
	rescue SystemExit
		# Deliberately swallowed, see the method description.
	end

	code
end

#run(args = ARGV) ⇒ Object

Run Program



447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
# File 'lib/apollo_crawler/program/crawler_program.rb', line 447

# Program entry point: initialise, then either finish early (an option was
# already handled), run a query, or run the crawlers named on the command line.
# The usage text is printed when no crawler name is given or none matches.
#
# @param args [Array<String>] command-line arguments
# @return [Object] the exit code passed through #request_exit
def run(args = ARGV)
	res_code = init_program(args)
	return request_exit(res_code) unless res_code.nil?

	puts "Running environment '#{@options[:env]}'" if @options[:verbose]

	# A query takes precedence over running crawlers.
	if @options[:query]
		return request_exit(run_query(@options[:query], @options))
	end

	# Whatever is left on the command line names the crawlers to run.
	crawler_names = get_crawlers(args)
	if crawler_names.nil? || crawler_names.empty?
		puts @optparser
		return request_exit(0)
	end

	# Translate the names into crawler classes.
	crawlers = get_crawlers_by_name(crawler_names, Apollo::Crawler::BaseCrawler.subclasses)
	if crawlers.nil? || crawlers.empty?
		puts @optparser
		return request_exit(0)
	end

	request_exit(run_crawlers(crawlers, args, @options))
end

#run_crawlers(crawlers, args, options = @options) ⇒ Object



337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
# File 'lib/apollo_crawler/program/crawler_program.rb', line 337

# Run every crawler class in turn and print the documents it yields.
#
# @param crawlers [Array<Class>] crawler classes; each is instantiated without arguments
# @param args [Array, nil] arguments handed to each crawler's +etl+; when nil or
#   empty, the crawler's own +url+ is used instead
# @param options [Hash] program options; :verbose and :doc_limit are consulted
# @return [Integer] always 0
def run_crawlers(crawlers, args, options = @options)
	crawlers.each do |crawler|
		puts "Running '#{crawler}'" if options[:verbose]

		opts = { :doc_limit => options[:doc_limit] }

		instance = crawler.new

		# Resolve the start arguments per crawler. Reassigning +args+ here made
		# every later crawler reuse the first crawler's URL.
		crawl_args = (args.nil? || args.empty?) ? instance.url : args

		instance.etl(crawl_args, opts) do |docs|
			process_docs_handler(docs, options, Apollo::Formatter::JsonFormatter.new)
		end
	end

	0
end

#run_query(query, options = {}) ⇒ Object



438
439
440
441
442
443
444
# File 'lib/apollo_crawler/program/crawler_program.rb', line 438

# Handle a query request. The query is only announced in verbose mode for
# now; no lookup is performed.
#
# @param query [Object] the query phrase
# @param options [Hash] program options; :verbose is consulted
# @return [Integer] always 0
def run_query(query, options = {})
	puts "Investigating query '#{query}'" if options[:verbose]

	0
end