Class: Pupa::Runner

Inherits:
Object
  • Object
show all
Defined in:
lib/pupa/runner.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(processor_class, defaults = {}) ⇒ Runner

Returns a new instance of Runner.

Parameters:

  • processor_class (Pupa::Processor)

    a processor class

  • defaults (Hash) (defaults to: {})

    change any default options



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/pupa/runner.rb', line 10

# Builds a runner for the given processor class.
#
# The built-in option values are merged with `defaults`, with `defaults`
# taking precedence, and the two stock actions (`scrape` and `import`) are
# registered.
#
# @param processor_class [Class] the processor class to run
# @param defaults [Hash] values that replace or extend the built-in options
def initialize(processor_class, defaults = {})
  @processor_class = processor_class

  working_dir = Dir.pwd
  # Key order is significant: it is the order in which the options are
  # enumerated when the OpenStruct is converted back to a Hash.
  baseline = {
    actions:            [],
    tasks:              [],
    output_dir:         File.expand_path('_data', working_dir),
    pipelined:          false,
    cache_dir:          File.expand_path('_cache', working_dir),
    expires_in:         24 * 60 * 60, # 1 day, in seconds
    value_max_bytes:    1024 * 1024, # 1 MB
    memcached_username: nil,
    memcached_password: nil,
    database_url:       'mongodb://localhost:27017/pupa',
    validate:           true,
    level:              'INFO',
    dry_run:            false,
  }
  @options = OpenStruct.new(baseline.merge(defaults))

  stock_actions = [
    ['scrape', 'Scrapes data from online sources'],
    ['import', 'Imports scraped data into a database'],
  ]
  @actions = stock_actions.map do |label, summary|
    OpenStruct.new(name: label, description: summary)
  end
end

Instance Attribute Details

#actions ⇒ Object (readonly)

Returns the value of attribute actions.



6
7
8
# File 'lib/pupa/runner.rb', line 6

# Returns the actions registered on this runner.
#
# @return [Array<OpenStruct>] the registered actions, each responding to
#   `name` and `description`
def actions
  @actions
end

#options ⇒ Object (readonly)

Returns the value of attribute options.



6
7
8
# File 'lib/pupa/runner.rb', line 6

# Returns the runner's current options.
#
# @return [OpenStruct] the options; the object is replaced by a new one each
#   time `run` merges in its overrides
def options
  @options
end

Instance Method Details

#add_action(attributes) ⇒ Object

Parameters:

  • attributes (Hash)

    the action's attributes

Options Hash (attributes):

  • :name (String)

    the action's label

  • :description (String)

    a description of the action



40
41
42
# File 'lib/pupa/runner.rb', line 40

# Registers an additional action on this runner.
#
# @param attributes [Hash] the action's attributes
# @option attributes [String] :name the action's label
# @option attributes [String] :description a description of the action
# @return [Array<OpenStruct>] the list of registered actions, including the new one
def add_action(attributes)
  action = OpenStruct.new(attributes)
  @actions.push(action)
end

#opts ⇒ OptionParser

Returns the command-line option parser.

Returns:

  • (OptionParser)

    the command-line option parser



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/pupa/runner.rb', line 47

# Returns the command-line option parser, building it on first use.
#
# The parser is memoized, so the actions and tasks listed in the help text and
# accepted by `--action` and `--task` are those known when this method is
# first called.
#
# @return [OptionParser] the command-line option parser
def opts
  @opts ||= OptionParser.new do |parser|
    parser.program_name = File.basename($PROGRAM_NAME)
    parser.banner = "Usage: #{parser.program_name}"

    parser.separator ''
    parser.separator 'Actions:'

    names = @actions.map(&:name)
    padding = names.map(&:size).max
    @actions.each do |action|
      parser.separator "  #{action.name.ljust(padding)}  #{action.description}\n"
    end

    parser.separator ''
    parser.separator 'Tasks:'

    tasks = @processor_class.tasks
    tasks.each do |task_name|
      parser.separator "  #{task_name}"
    end

    parser.separator ''
    parser.separator 'Specific options:'
    parser.on('-a', '--action ACTION', names, 'Select an action to run (you may give this switch multiple times)', "  (#{names.join(', ')})") do |v|
      options.actions << v
    end
    parser.on('-t', '--task TASK', tasks, 'Select a scraping task to run (you may give this switch multiple times)', "  (#{tasks.join(', ')})") do |v|
      options.tasks << v
    end
    parser.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379/0) in which to dump JSON documents') do |v|
      options.output_dir = v
    end
    parser.on('--pipelined', 'Dump JSON documents all at once') do |v|
      options.pipelined = v
    end
    parser.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v|
      options.cache_dir = v
    end
    # Coerce numeric switches so that command-line values have the same type
    # (Integer) as the built-in defaults instead of arriving as Strings.
    parser.on('-e', '--expires_in SECONDS', Integer, "The cache's expiration time in seconds") do |v|
      options.expires_in = v
    end
    parser.on('--value_max_bytes BYTES', Integer, "The maximum Memcached item size") do |v|
      options.value_max_bytes = v
    end
    parser.on('--memcached_username USERNAME', "The Memcached username") do |v|
      options.memcached_username = v
    end
    parser.on('--memcached_password PASSWORD', "The Memcached password") do |v|
      options.memcached_password = v
    end
    # The switch must declare an argument placeholder; without one OptionParser
    # treats it as a boolean flag and yields `true` instead of the URL.
    parser.on('-d', '--database_url URL', 'The database URL (e.g. mongodb://USER:PASSWORD@localhost:27017/pupa or postgres://USER:PASSWORD@localhost:5432/pupa)') do |v|
      options.database_url = v
    end
    parser.on('--[no-]validate', 'Validate JSON documents') do |v|
      options.validate = v
    end
    parser.on('-v', '--verbose', 'Show all messages') do
      options.level = 'DEBUG'
    end
    parser.on('-q', '--quiet', 'Show only warning and error messages') do
      options.level = 'WARN'
    end
    parser.on('-s', '--silent', 'Show no messages') do
      options.level = 'UNKNOWN'
    end
    parser.on('-n', '--dry-run', 'Show the plan without running any actions') do
      options.dry_run = true
    end

    parser.separator ''
    parser.separator 'Common options:'
    parser.on_tail('-h', '--help', 'Show this message') do
      puts parser
      exit
    end
    # `-v` already belongs to `--verbose`, which takes precedence over tail
    # options, so `--version` is offered in its long form only.
    parser.on_tail('--version', 'Show version') do
      puts Pupa::VERSION
      exit
    end
  end
end

#run(args, overrides = {}) ⇒ Object

Runs the selected actions.

Examples:

Run from a command-line script


runner.run(ARGV)

Override the command-line options


runner.run(ARGV, expires_in: 3600) # 1 hour

Parameters:

  • args (Array)

    command-line arguments

  • overrides (Hash) (defaults to: {})

    any overridden options



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/pupa/runner.rb', line 141

# Parses the command-line arguments, builds a processor and runs the selected
# actions.
#
# When no actions are selected, `scrape` and `import` are run; when no tasks
# are selected, all of the processor class's tasks are run. At the `DEBUG` and
# `INFO` levels the plan is printed before running and a JSON report is
# printed afterwards.
#
# The method terminates the process if the `dry_run` option is set (after
# printing the plan) or if a selected action is not supported by the processor.
#
# @example Run from a command-line script
#   runner.run(ARGV)
#
# @example Override the command-line options
#   runner.run(ARGV, expires_in: 3600) # 1 hour
#
# @param args [Array] command-line arguments; recognized switches are removed
#   from the array, and the remaining arguments are passed to the processor as
#   key-value pairs
# @param overrides [Hash] any overridden options
# @raise [ArgumentError] if an odd number of non-switch arguments remains,
#   since they cannot be paired into a Hash
def run(args, overrides = {})
  rest = opts.parse!(args)

  @options = OpenStruct.new(options.to_h.merge(overrides))

  if options.actions.empty?
    options.actions = %w(scrape import)
  end
  if options.tasks.empty?
    # Copy the list so that later `--task` switches, which append to
    # `options.tasks`, cannot modify the processor class's own task list.
    options.tasks = @processor_class.tasks.dup
  end

  processor = @processor_class.new(options.output_dir,
    pipelined: options.pipelined,
    cache_dir: options.cache_dir,
    expires_in: options.expires_in,
    value_max_bytes: options.value_max_bytes,
    memcached_username: options.memcached_username,
    memcached_password: options.memcached_password,
    database_url: options.database_url,
    validate: options.validate,
    level: options.level,
    options: Hash[*rest])

  options.actions.each do |action|
    unless action == 'scrape' || processor.respond_to?(action)
      abort %(`#{action}` is not a #{opts.program_name} action. See `#{opts.program_name} --help` for a list of available actions.)
    end
  end

  if %w(DEBUG INFO).include?(options.level)
    puts "processor: #{@processor_class}"
    puts "actions: #{options.actions.join(', ')}"
    puts "tasks: #{options.tasks.join(', ')}"
  end

  if options.level == 'DEBUG'
    %w(output_dir pipelined cache_dir expires_in value_max_bytes memcached_username memcached_password database_url validate level).each do |option|
      puts "#{option}: #{options[option]}"
    end
    unless rest.empty?
      puts "options: #{rest.join(' ')}"
    end
  end

  exit if options.dry_run

  report = {
    plan: {
      processor: @processor_class,
      # Deep copy, so that the recorded plan is a snapshot of the options.
      options: Marshal.load(Marshal.dump(options)).to_h,
      arguments: rest,
    },
    start: Time.now.utc,
  }

  # Work on a copy: removing `scrape` from `options.actions` itself would make
  # a later `run` on the same runner silently skip scraping.
  actions = options.actions.dup

  # Scraping is handled here, per task, rather than by calling a processor
  # method named after the action.
  if actions.delete('scrape')
    processor.store.clear
    report[:scrape] = {}
    options.tasks.each do |task_name|
      report[:scrape][task_name] = processor.dump_scraped_objects(task_name)
    end
  end

  actions.each do |action|
    # Every remaining action passed the `respond_to?` check above, so it names
    # a method the processor exposes publicly.
    processor.public_send(action)
    if processor.report.key?(action.to_sym)
      report.update(action.to_sym => processor.report[action.to_sym])
    end
  end

  if %w(DEBUG INFO).include?(options.level)
    report[:end] = Time.now.utc
    report[:time] = report[:end] - report[:start]
    puts JSON.dump(report)
  end
end