Class: Apollo::CrawlerProgram

Inherits:
BaseProgram show all
Defined in:
lib/apollo_crawler/program/crawler_program.rb

Constant Summary

Constants inherited from BaseProgram

BaseProgram::CONFIG_DIR, BaseProgram::DEFAULT_OPTIONS

Instance Attribute Summary

Attributes inherited from BaseProgram

#amqp, #config, #mongo, #options, #optparser

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from BaseProgram

get_config_path, #init_amqp, #init_mongo, #init_seeds, #init_seeds_crawlers, #load_config, #load_configs, require_files

Constructor Details

#initialize ⇒ CrawlerProgram

Initializer - Constructor



70
71
72
73
74
75
76
77
78
# File 'lib/apollo_crawler/program/crawler_program.rb', line 70

# Constructor.
#
# Runs the parent initializer, resets the option store to an empty hash and
# registers a hook so that at_exit_handler is invoked when the interpreter
# shuts down.
def initialize
  super

  # Start from a blank option set; defaults are filled in by init_options.
  @options = {}

  at_exit { at_exit_handler }
end

Class Method Details

.console_table(headings, rows) ⇒ Object

Show tabular data in form of CLI table



98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/apollo_crawler/program/crawler_program.rb', line 98

# Prints a CLI table with one row per entry of +rows+.
#
# Every entry of +rows+ is instantiated with a no-argument +new+, and each
# heading string is then evaluated against that instance with
# +instance_eval+; the results become the cells of the row.
#
# NOTE(review): headings are executed as Ruby code, so they must never be
# taken from untrusted input.
#
# @param headings [Array<String>] column titles, also used as expressions
# @param rows [Array<Class>] objects responding to +new+
# @return [nil] the result of +puts+
def self.console_table(headings, rows)
  cells = rows.map do |klass|
    instance = klass.new
    headings.map { |expression| instance.instance_eval(expression) }
  end

  puts Terminal::Table.new(:headings => headings, :rows => cells)
end

.get_modules_paths(modules = APOLLO_CRAWLER_MODULES) ⇒ Object



80
81
82
83
84
85
86
87
88
# File 'lib/apollo_crawler/program/crawler_program.rb', line 80

# Collects the Ruby source files of the given module directories.
#
# For every name in +modules+ the pattern
# <tt>APOLLO_CRAWLER_BASE_DIR/<name>/*.rb</tt> is globbed; all hits are
# merged into one list and sorted.
#
# @param modules [Array<String>] directory names below APOLLO_CRAWLER_BASE_DIR
# @return [Array<String>] sorted file paths (empty when nothing matches)
def self.get_modules_paths(modules = APOLLO_CRAWLER_MODULES)
  modules
    .flat_map { |name| Dir[File.join(APOLLO_CRAWLER_BASE_DIR, name, "*.rb")] }
    .sort
end

.register_modules(modules = APOLLO_CRAWLER_MODULES) ⇒ Object



90
91
92
93
94
95
# File 'lib/apollo_crawler/program/crawler_program.rb', line 90

# Requires every file reported by get_modules_paths for +modules+.
#
# @param modules [Array<String>] module directory names, forwarded unchanged
#   to get_modules_paths
# @return [Array<String>] the list of paths that was iterated
def self.register_modules(modules = APOLLO_CRAWLER_MODULES)
  get_modules_paths(modules).each { |path| require path }
end

Instance Method Details

#at_exit_handler ⇒ Object

At Exit handler



511
512
513
514
515
516
517
518
519
520
521
# File 'lib/apollo_crawler/program/crawler_program.rb', line 511

# Hook registered through at_exit by the constructor.
#
# Currently a no-op: the body holds only notes about work still to be done.
#
# @return [nil]
def at_exit_handler
  # Verbose tracing is disabled for now:
  #   puts "Running at_exit_handler" if @options[:verbose]

  # TODO: Flush caches
  # TODO: End gracefully

  # Forcing the event machine to stop is disabled for now:
  #   EventMachine.stop
end

#generate_crawler(name, url = nil, matcher = nil, options = @options) ⇒ Object



265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/apollo_crawler/program/crawler_program.rb', line 265

# Generates the scaffold of a new crawler from the crawler template.
#
# The template at RbConfig::CRAWLER_TEMPLATE_PATH is copied line by line to
# "<underscored class name>.rb" in the current working directory, with the
# placeholders CRAWLER_CLASS_NAME, CRAWLER_NAME, CRAWLER_URL and
# CRAWLER_MATCHER replaced.
#
# Fixes over the previous version:
# * the destination path is always computed (it used to be left nil in
#   silent mode, which made File.open raise);
# * verbosity is read from the +options+ argument throughout;
# * the stray debug print of the template path is gone;
# * File.exist? replaces File.exists?, which was removed in Ruby 3.2;
# * replacement values are inserted literally, so backslashes in +url+ or
#   +matcher+ are no longer treated as back-references.
#
# @param name [String] crawler name; needs String#titleize and
#   String#underscore to be available
# @param url [String, nil] start URL, defaults to "http://some-url-here"
# @param matcher [String, nil] matcher expression, defaults to "//a"
# @param options [Hash] program options; only :verbose is consulted
# @return [Integer] 0 on success, -1 when the template file is missing
def generate_crawler(name, url = nil, matcher = nil, options = @options)
  class_name = name.titleize.gsub(" ", "")
  display_name = class_name.titleize

  puts "Generating new crawler '#{class_name}'" if options[:verbose]

  template_path = RbConfig::CRAWLER_TEMPLATE_PATH
  unless File.exist?(template_path)
    puts "Template file '#{template_path}' does not exists!"
    return -1
  end

  puts "Using template '#{template_path}'" if options[:verbose]

  dest_path = File.join(Dir.pwd, "#{class_name.underscore}.rb")

  placeholders = {
    "CRAWLER_CLASS_NAME" => class_name,
    "CRAWLER_NAME" => display_name,
    "CRAWLER_URL" => url || "http://some-url-here",
    "CRAWLER_MATCHER" => matcher || "//a"
  }

  puts "Generating crawler '#{display_name}', class: '#{class_name}', path: '#{dest_path}'"

  File.open(template_path, 'r') do |tmpl|
    File.open(dest_path, 'w') do |crawler|
      tmpl.each_line do |line|
        # Block form of gsub inserts the value verbatim (no \1-style expansion).
        placeholders.each { |key, value| line = line.gsub(key) { value } }
        crawler.puts line
      end
    end
  end

  0
end

#get_crawlers(args, options = @options) ⇒ Object

Get the crawlers passed on the command line



363
364
365
366
367
368
369
370
371
372
373
374
# File 'lib/apollo_crawler/program/crawler_program.rb', line 363

# Determines which crawlers were requested on the command line.
#
# The first element of +args+ (if any) is removed from +args+ and treated as
# the crawler name. When options[:run_all] is set, the keys of @crawlers are
# returned instead; the first argument is still consumed in that case.
#
# NOTE(review): @crawlers is not assigned anywhere in the code visible here;
# confirm it is populated before the run-all path is used.
#
# @param args [Array] remaining command-line arguments (mutated)
# @param options [Hash] program options; only :run_all is consulted
# @return [Array] requested crawler names
def get_crawlers(args, options = @options)
  requested = args.empty? ? [] : [args.shift]

  options[:run_all] ? @crawlers.keys : requested
end

#get_crawlers_by_name(crawlers, crawler_classes = Apollo::Crawler::BaseCrawler.subclasses) ⇒ Object



315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# File 'lib/apollo_crawler/program/crawler_program.rb', line 315

# Resolves crawler names to crawler classes.
#
# Names are compared case-insensitively against the last "::" segment of
# each class name, with every "crawler" substring removed from the class
# side; a name matches either that short form or the short form followed by
# "crawler". For each name at most one class (the first match in
# +crawler_classes+ order) is returned.
#
# nil entries in either list are skipped. Names that normalise to an empty
# string (for example "" or "::") are skipped as well; previously they
# raised NoMethodError. The name is now normalised once per crawler instead
# of once per candidate class.
#
# @param crawlers [Array<#to_s, nil>] requested crawler names
# @param crawler_classes [Array<Class, nil>] candidate classes
# @return [Array<Class>] matched classes, in the order of +crawlers+
def get_crawlers_by_name(crawlers, crawler_classes = Apollo::Crawler::BaseCrawler.subclasses)
  crawlers.each_with_object([]) do |crawler, matched|
    next if crawler.nil?

    crawler_name = crawler.to_s.split('::').last.to_s.downcase
    next if crawler_name.empty?

    klass = crawler_classes.find do |candidate|
      next false if candidate.nil?

      short_name = candidate.to_s.split('::').last.to_s.downcase.gsub("crawler", "")
      crawler_name == short_name || crawler_name == "#{short_name}crawler"
    end

    matched << klass if klass
  end
end

#init_additional_crawlers(dirs) ⇒ Object



401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
# File 'lib/apollo_crawler/program/crawler_program.rb', line 401

# Requires every "*.rb" file found directly inside each of +dirs+.
#
# With @options[:verbose] set, each directory and each file is announced on
# standard output before it is processed.
#
# @param dirs [Array<String>] directories to scan
# @return [Array<String>] +dirs+
def init_additional_crawlers(dirs)
  dirs.each do |dir|
    puts "Registering additional crawler dir '#{dir}'" if @options[:verbose]

    Dir.glob("#{dir}/*.rb").each do |file|
      puts "Registering crawler '#{file}'" if @options[:verbose]
      require file
    end
  end
end

#init_options ⇒ Object

Initialize command-line options



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/apollo_crawler/program/crawler_program.rb', line 114

# Seeds @options with the program defaults.
#
# Sets the environment from Apollo::ENV, clears the document limit, version
# and generator settings, switches verbose off, and starts each directory
# list (caches, crawlers, formatters) with the matching RbConfig constant.
# Keys that are not listed here are left untouched.
#
# @return [nil]
def init_options
  @options.merge!(
    :env => Apollo::ENV,
    :doc_limit => nil,
    :verbose => false,
    :version => nil,
    :cache_dirs => [RbConfig::CACHES_DIR],
    :crawler_dirs => [RbConfig::CRAWLERS_DIR],
    :formatter_dirs => [RbConfig::FORMATTERS_DIR],
    :generate_crawler => nil
  )

  nil
end

#init_options_parser ⇒ Object



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/apollo_crawler/program/crawler_program.rb', line 136

# Builds the command-line parser and stores it in @optparser.
#
# Each switch only records its value in @options; acting on the values is
# left to the caller. The one exception is -i/--include, which also calls
# init_additional_crawlers for the given path right away.
#
# Switches declared with an optional value ([NAME], [PATH], [NUM]) hand nil
# to their block when the value is omitted. For -e, -i and -n that nil is
# now ignored so the existing setting stays in place. Previously "-i" alone
# appended nil to :crawler_dirs and scanned "/*.rb", "-e" alone wiped the
# environment and "-n" alone set the document limit to 0.
#
# @return [OptionParser] the parser that was stored in @optparser
def init_options_parser
  @optparser = OptionParser.new do |opts|
    opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"

    opts.separator ""
    opts.separator "Specific options:"

    # This displays the help screen, all programs are
    # assumed to have this option.
    opts.on('-h', '--help', 'Display this screen') do
      @options[:show_help] = true
    end

    opts.on('-a', '--all', 'Run all crawlers') do
      @options[:run_all] = true
    end

    opts.on('-e', '--environment [NAME]', "Environment used, default '#{@options[:env]}'") do |name|
      @options[:env] = name unless name.nil?
    end

    opts.on('-f', '--format [NAME]', "Formatter used") do |name|
      @options[:formatter] = name
    end

    opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
      @options[:generate_crawler] = name
    end

    opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
      unless path.nil?
        @options[:crawler_dirs] << path

        init_additional_crawlers([path])
      end
    end

    opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
      @options[:doc_limit] = count.to_i unless count.nil?
    end

    opts.on('-v', '--verbose', 'Enable verbose output') do
      @options[:verbose] = true
    end

    opts.on('-V', '--version', 'Show version info') do
      @options[:version] = true
    end

    opts.on('-l', '--list-crawlers', 'List of crawlers') do
      @options[:list_crawlers] = true
    end

    opts.on(nil, '--list-formatters', 'List of formatters available') do
      @options[:list_formatters] = true
    end

    # opts.on('-q', '--query [QUERY]', 'Query crawler database for phrase') do |query|
    #   @options[:query] = query
    # end

    opts.on('-s', '--silent', 'Silent mode - do not print processed document') do
      @options[:silent] = true
    end
  end
end

#init_program(args) ⇒ Object

Init program



418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
# File 'lib/apollo_crawler/program/crawler_program.rb', line 418

# Prepares the program for a run.
#
# In order: seeds the default options, builds the option parser, registers
# the bundled modules, parses +args+, prepares the program directories,
# loads the config file and finally evaluates the parsed options.
#
# @param args [Array<String>] command-line arguments (options are removed
#   from it by parse_options)
# @return [Object, nil] whatever process_options returns: a result code when
#   an option was fully handled, nil when the program should carry on
def init_program(args)
  init_options
  init_options_parser

  CrawlerProgram.register_modules

  parse_options(args)
  init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES)
  load_config_file

  process_options(args)
end

#init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, options = @options) ⇒ Object



376
377
378
379
380
381
382
383
384
385
386
387
388
# File 'lib/apollo_crawler/program/crawler_program.rb', line 376

# Creates the program directories and installs the user config file.
#
# Every entry of +dirs+ that is not yet a directory is created (including
# missing parents); with options[:verbose] each creation is announced.
# Afterwards init_user_config_file is asked to copy "config_user.trb",
# located next to this source file, to "<base_dir>/config.rb".
#
# @param base_dir [String] directory that receives config.rb
# @param dirs [Array<String>] directories to create
# @param options [Hash] program options; only :verbose is consulted
# @return [Object] the result of init_user_config_file
def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, options = @options)
  dirs.each do |dir|
    next if File.directory?(dir)

    puts "Creating '#{dir}'" if options[:verbose]
    FileUtils.mkpath(dir)
  end

  template_path = File.join(File.dirname(__FILE__), 'config_user.trb')
  target_path = File.join(base_dir, 'config.rb')
  init_user_config_file(template_path, target_path)
end

#init_user_config_file(config_path, dest_path, options = @options) ⇒ Object



390
391
392
393
394
395
396
397
398
399
# File 'lib/apollo_crawler/program/crawler_program.rb', line 390

# Installs the user config file if it is not there yet.
#
# Copies +config_path+ to +dest_path+ only when the source exists and the
# destination does not, so an existing user config is never overwritten.
# Uses File.exist?; File.exists? was removed in Ruby 3.2.
#
# @param config_path [String] path of the config template
# @param dest_path [String] path of the user config file
# @param options [Hash] program options; only :verbose is consulted
# @return [Object, nil] result of FileUtils.cp, or nil when nothing was copied
def init_user_config_file(config_path, dest_path, options = @options)
  return unless File.exist?(config_path) && !File.exist?(dest_path)

  puts "Creating user config file '#{config_path}' => '#{dest_path}'" if options[:verbose]

  FileUtils.cp(config_path, dest_path)
end

#load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH) ⇒ Object

Load global options first, then merge them with local options (if they exist).



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/apollo_crawler/program/crawler_program.rb', line 247

# Loads the program config file, if present.
#
# When +config+ exists it is loaded with +require+; otherwise it is skipped.
# Either outcome is reported on standard output when @options[:verbose] is
# set. Uses File.exist?; File.exists? was removed in Ruby 3.2.
#
# @param config [String] path of the config file
# @return [Boolean, nil] the result of +require+ when the file exists,
#   otherwise nil
def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
  if File.exist?(config)
    puts "Loading config '#{config}'" if @options[:verbose]

    require config
  elsif @options[:verbose]
    # TODO: Add support for initial rake task generation
    #          Something like this:
    #          rake config:init # Initializes config files with
    #            their defaults (if not exists already)
    puts "Default config does not exist, skipping - '#{config}'"
  end
end

#parse_options(args = ARGV) ⇒ Object

Parse the options passed to command-line



202
203
204
205
206
207
208
209
# File 'lib/apollo_crawler/program/crawler_program.rb', line 202

# Parses the command-line options with the parser stored in @optparser
# (built by init_options_parser).
#
# @param args [Array<String>] arguments to parse; modified in place
# @return [Array<String>] whatever OptionParser#parse! returns
def parse_options(args = ARGV)
  # OptionParser has two forms of the parse method: 'parse' leaves the
  # argument list alone, while 'parse!' removes every recognised option
  # (and its parameters) from it. The destructive form is used here so that
  # only the positional arguments - the crawler name and the optional start
  # URL - remain in args afterwards.
  @optparser.parse!(args)
end

#process_docs_handler(docs, options = options, formatter) ⇒ Object



492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
# File 'lib/apollo_crawler/program/crawler_program.rb', line 492

# Prints crawled documents with the given formatter.
#
# A single document is wrapped in an array; nil is passed through
# untouched. Unless options[:silent] is exactly true, every document is
# rendered with +formatter.format+ and written to standard output.
#
# The +options+ default used to be the circular "options = options", which
# evaluates to nil and made the two-argument call crash on options[:silent].
# It now falls back to @options like the sibling methods do.
#
# @param docs [Object, Array, nil] document or documents to print
# @param options [Hash] program options; only :silent is consulted
# @param formatter [#format] object turning a document into a string
# @return [Array, nil] the documents as an array, or nil when +docs+ is nil
def process_docs_handler(docs, options = @options, formatter)
  return docs if docs.nil?

  docs = [docs] unless docs.kind_of?(Array)

  unless options[:silent] == true
    docs.each { |doc| puts formatter.format(doc) }
  end

  docs
end

#process_options(args) ⇒ Object



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/apollo_crawler/program/crawler_program.rb', line 211

# Acts on options that are handled without running any crawler.
#
# Checked in this order, the first one that is set wins:
# * :version          - prints Apollo::VERSION
# * :show_help        - prints the option parser (usage text)
# * :generate_crawler - delegates to generate_crawler, using args[0] as URL
#                       and args[1] as matcher when present
# * :list_formatters  - prints a table of BaseFormatter subclasses
# * :list_crawlers    - prints a table of BaseCrawler subclasses
#
# @param args [Array<String>] positional command-line arguments
# @return [Object, nil] 0 (or the result of generate_crawler) when an option
#   was handled, nil when none of them is set
def process_options(args)
  opts = @options

  if opts[:version] || opts[:show_help]
    puts(opts[:version] ? Apollo::VERSION : @optparser)
    return 0
  end

  crawler_name = opts[:generate_crawler]
  return self.generate_crawler(crawler_name, args[0], args[1]) if crawler_name

  listed_base =
    if opts[:list_formatters]
      Apollo::Formatter::BaseFormatter
    elsif opts[:list_crawlers]
      Apollo::Crawler::BaseCrawler
    end
  return nil unless listed_base

  CrawlerProgram.console_table(['name', 'self.class'], listed_base.subclasses)
  0
end

#request_exit(code = 0) ⇒ Object



482
483
484
485
486
487
488
489
490
# File 'lib/apollo_crawler/program/crawler_program.rb', line 482

# Signals that the program is done and hands the result code back.
#
# +exit+ is always called with status 0 and the resulting SystemExit is
# rescued immediately, so the process is NOT terminated here; the caller
# simply receives +code+.
#
# @param code [Integer] result code to return
# @return [Integer] +code+, unchanged
def request_exit(code = 0)
  exit(0)
rescue SystemExit
  # Deliberately swallowed - see the method comment.
  code
end

#run(args = ARGV) ⇒ Object

Run Program



447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
# File 'lib/apollo_crawler/program/crawler_program.rb', line 447

# Program entry point.
#
# Initialises the program and stops early when initialisation already
# produced a result code. Otherwise it runs a query if @options[:query] is
# set, or else resolves the requested crawlers and runs them. When no
# crawler name was given, or none of the names matches a known crawler, the
# usage text is printed instead.
#
# @param args [Array<String>] command-line arguments
# @return [Object] the result code passed through request_exit
def run(args = ARGV)
  res_code = init_program(args)
  return request_exit(res_code) unless res_code.nil?

  puts "Running environment '#{@options[:env]}'" if @options[:verbose]

  # Look for query
  query = @options[:query]
  return request_exit(run_query(query, @options)) if query

  # Parse remaining arguments as crawlers, then map the names to classes
  crawler_names = get_crawlers(args)
  crawlers =
    if crawler_names.nil? || crawler_names.empty?
      []
    else
      get_crawlers_by_name(crawler_names, Apollo::Crawler::BaseCrawler.subclasses)
    end

  if crawlers.nil? || crawlers.empty?
    puts @optparser
    return request_exit(0)
  end

  request_exit(run_crawlers(crawlers, args, @options))
end

#run_crawlers(crawlers, args, options = @options) ⇒ Object



337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
# File 'lib/apollo_crawler/program/crawler_program.rb', line 337

# Runs every crawler class in +crawlers+.
#
# Each class is instantiated and its +etl+ method is called with the
# command-line arguments, or with the crawler's own +url+ when no arguments
# were given. Documents yielded by +etl+ are handed to process_docs_handler
# together with a JSON formatter.
#
# The fallback URL is now chosen per crawler. Previously +args+ itself was
# overwritten with the first crawler's URL, so every following crawler was
# run against that same URL.
#
# NOTE(review): the formatter is always Apollo::Formatter::JsonFormatter;
# options[:formatter] is not consulted here.
#
# @param crawlers [Array<Class>] crawler classes to run
# @param args [Array, nil] remaining command-line arguments
# @param options [Hash] program options; :verbose and :doc_limit are read
#   here, and the whole hash is forwarded to process_docs_handler
# @return [Integer] always 0
def run_crawlers(crawlers, args, options = @options)
  use_default_url = args.nil? || args.empty?

  crawlers.each do |crawler|
    puts "Running '#{crawler}'" if options[:verbose]

    opts = {
      :doc_limit => options[:doc_limit]
    }

    instance = crawler.new
    crawler_args = use_default_url ? instance.url : args

    instance.etl(crawler_args, opts) do |docs|
      process_docs_handler(docs, options, Apollo::Formatter::JsonFormatter.new)
    end
  end

  0
end

#run_query(query, options = {}) ⇒ Object



438
439
440
441
442
443
444
# File 'lib/apollo_crawler/program/crawler_program.rb', line 438

# Placeholder for querying the crawler database.
#
# Only announces the query when options[:verbose] is set; no lookup is
# performed.
#
# @param query [Object] phrase to look for
# @param options [Hash] program options; only :verbose is consulted
# @return [Integer] always 0
def run_query(query, options = {})
  puts "Investigating query '#{query}'" if options[:verbose]

  0
end