Class: MicroSpider

Inherits:
Object
  • Object
show all
Includes:
Capybara::DSL, SpiderCore::Behavior, SpiderCore::FieldDSL, SpiderCore::FollowDSL, SpiderCore::PaginationDSL
Defined in:
lib/micro_spider.rb

Instance Attribute Summary collapse

Attributes included from SpiderCore::PaginationDSL

#next_page, #skip_pages

Attributes included from SpiderCore::FollowDSL

#skip_followers

Instance Method Summary collapse

Methods included from SpiderCore::PaginationDSL

#keep_eyes_on_next_page

Methods included from SpiderCore::FollowDSL

#follow

Methods included from SpiderCore::FieldDSL

#css_field, #css_fields, #field, #fields, #xpath_field, #xpath_fields

Constructor Details

#initialize(excretion = nil) ⇒ MicroSpider

Returns a new instance of MicroSpider.



27
28
29
30
31
32
33
34
35
# File 'lib/micro_spider.rb', line 27

# Builds a spider with empty work queues and default settings.
#
# @param excretion [Hash, nil] an existing result hash to keep working on;
#   when nil a fresh one is created with status 'inprogress' and no results
def initialize(excretion = nil)
  # Crawl output; a caller-supplied hash is used as-is (same object).
  @excretion = excretion || { status: 'inprogress', results: [] }

  # Work queues: paths still to visit and actions to run on the current page.
  @paths   = []
  @actions = []

  # Bookkeeping for paths that were visited or that failed to load.
  @visited_paths = Set.new
  @broken_paths  = []

  # Time limit handed to Timeout when actions are executed, and the log sink.
  @timeout = 120
  @logger  = Logger.new(STDOUT)
end

Instance Attribute Details

#actions ⇒ Object

Returns the value of attribute actions.



25
26
27
# File 'lib/micro_spider.rb', line 25

# Returns the queue of actions for the current page.
#
# It starts as an empty Array; #click and actions defined through
# #create_action append lambdas to it, and #execute_actions calls them.
#
# @return [Array] the queued actions
def actions
  @actions
end

#broken_paths ⇒ Object (readonly)

Returns the value of attribute broken_paths.



24
25
26
# File 'lib/micro_spider.rb', line 24

# Returns the paths whose visit raised an error during #crawl.
#
# Read-only attribute: #crawl appends a path here when visiting it times out
# or raises.
#
# @return [Object] the collection of failed paths
def broken_paths
  @broken_paths
end

#current_location ⇒ Object (readonly)

Returns the value of attribute current_location.



24
25
26
# File 'lib/micro_spider.rb', line 24

# Returns the result hash of the page currently being processed.
#
# #visit replaces it with `{entrance: path}`; #click stores nested results
# under the :click key. It is nil until the first #visit.
#
# @return [Hash, nil] the current page's result hash
def current_location
  @current_location
end

#delay ⇒ Object

Returns the value of attribute delay.



24
25
26
# File 'lib/micro_spider.rb', line 24

# Returns the configured delay.
#
# The constructor does not set it, so it is nil until assigned.
# NOTE(review): presumably consulted by sleep_or_not (called from #visit) to
# pause between requests — confirm against that helper.
#
# @return [Object, nil] the delay value
def delay
  @delay
end

#excretion ⇒ Object (readonly)

Returns the value of attribute excretion.



24
25
26
# File 'lib/micro_spider.rb', line 24

# Returns the crawl output.
#
# By default this is a Hash with :status ('inprogress' until #crawl marks it
# 'completed') and :results (one entry per crawled page); #site also stores
# the base URL under :site.
#
# @return [Hash] the output hash (or whatever object was given to the constructor)
def excretion
  @excretion
end

#logger ⇒ Object

Returns the value of attribute logger.



25
26
27
# File 'lib/micro_spider.rb', line 25

# Returns the logger used for progress and failure messages.
#
# The constructor installs a Logger that writes to STDOUT.
#
# @return [Logger] the current logger
def logger
  @logger
end

#paths ⇒ Object (readonly)

Returns the value of attribute paths.



24
25
26
# File 'lib/micro_spider.rb', line 24

# Returns the queue of paths still waiting to be crawled.
#
# Read-only attribute: #entrance and #entrance_on_path add to it and #crawl
# shifts paths off the front.
#
# @return [Array] the pending paths
def paths
  @paths
end

#recipe ⇒ Object

Returns the value of attribute recipe.



25
26
27
# File 'lib/micro_spider.rb', line 25

# Returns the recipe most recently given to #learn.
#
# #crawl replays it through #learn whenever the action queue is empty.
#
# @return [Proc, String, nil] the stored recipe, nil before any #learn call
def recipe
  @recipe
end

#skip_set_entrance ⇒ Object

Returns the value of attribute skip_set_entrance.



25
26
27
# File 'lib/micro_spider.rb', line 25

# Returns the flag that disables #entrance and #entrance_on_path.
#
# #crawl sets it to true after processing a path, so replaying the recipe
# does not queue the entrances again; #spawn resets it to false on the child.
#
# @return [Boolean, nil] truthy when entrances are being ignored
def skip_set_entrance
  @skip_set_entrance
end

#timeout ⇒ Object

Returns the value of attribute timeout.



25
26
27
# File 'lib/micro_spider.rb', line 25

# Returns the time limit applied to each action.
#
# #execute_actions passes this value to Timeout; the constructor sets it
# to 120.
#
# @return [Numeric] the limit handed to Timeout
def timeout
  @timeout
end

#visited_paths ⇒ Object (readonly)

Returns the value of attribute visited_paths.



24
25
26
# File 'lib/micro_spider.rb', line 24

# Returns the set of paths that were visited successfully.
#
# Read-only attribute: #crawl adds a path after a visit that did not raise
# and skips queued paths that are already in the set.
#
# @return [Set] the visited paths
def visited_paths
  @visited_paths
end

Instance Method Details

#click(locator, opts = {}, &block) ⇒ Object

Click the link matched by the locator. This triggers a visit and changes the current location.



65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/micro_spider.rb', line 65

# Queues an action that follows the link matched by +locator+.
#
# Nothing happens until the queued action is executed. Without a block the
# spider itself visits the link's href. With a block, a spawned spider uses
# the href as its entrance and the block as its recipe, and the results of
# its crawl are appended to current_location[:click].
#
# @param locator [String] link locator handed to find_link
# @param opts [Hash] options handed to find_link
# @yield optional recipe for the spawned spider
# @return [Array] the action queue
def click(locator, opts = {}, &block)
  follow_link = lambda do
    href = find_link(locator, opts)[:href]

    # No recipe given: plain navigation by this spider.
    next visit(href) unless block

    child = spawn
    child.entrance(href)
    child.learn(&block)
    nested = (current_location[:click] ||= [])
    nested << child.crawl[:results]
  end

  actions << follow_link
end

#completed? ⇒ Boolean

Returns:

  • (Boolean)


245
246
247
# File 'lib/micro_spider.rb', line 245

# Tells whether the crawl has finished.
#
# @return [Boolean] true once the excretion status equals 'completed'
#   (#crawl sets that status when no unvisited path is left)
def completed?
  excretion[:status] == 'completed'
end

#crawl(&block) ⇒ Object



154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# File 'lib/micro_spider.rb', line 154

# Crawls every queued path and returns the accumulated output.
#
# For each unvisited path: the recipe is replayed when the action queue is
# empty, the path is visited, the queued actions run, the optional block
# receives the page's result hash, and that hash is appended to
# excretion[:results]. When no unvisited path is left the status becomes
# 'completed'. Calling it again on a completed spider returns immediately.
#
# Paths are processed in a loop rather than by recursion, so the call-stack
# depth does not grow with the number of pages.
#
# A path whose visit raises a StandardError (including Timeout::Error) is
# logged, recorded in @broken_paths and skipped. An error raised after a
# successful visit (by an action or by the block) does not stop the crawl:
# the remaining paths are still processed and the most recent such error is
# raised once the queue is exhausted. Errors outside StandardError propagate
# immediately.
#
# @yield [location] the result hash of each successfully visited page
# @return [Hash] the excretion
# @raise [StandardError] the last error raised by an action or the block
def crawl(&block)
  deferred_error = nil

  until completed?
    path = next_unvisited_path
    if path.nil?
      excretion[:status] = 'completed'
      break
    end

    # The queue is cleared after every page, so replay the recipe to refill it.
    learn(@recipe) if @actions.empty?

    begin
      crawl_single_path(path, &block)
    rescue StandardError => err
      deferred_error = err
    ensure
      @actions = []
      # From now on replaying the recipe must not queue the entrances again.
      @skip_set_entrance = true
    end
  end

  raise deferred_error if deferred_error

  excretion
end

# Takes paths off the queue until one that has not been visited is found.
# Returns nil when the queue runs out.
def next_unvisited_path
  @paths.compact!
  path = @paths.shift
  path = @paths.shift until path.nil? || !@visited_paths.include?(path)
  path
end

# Visits one path and, when the visit succeeds, runs the actions and records
# the page result. Visit failures are logged and the path is marked broken.
def crawl_single_path(path)
  visit(path)
rescue Timeout::Error => err
  @broken_paths << path
  logger.fatal("Timeout!!! execution expired when visit `#{path}`")
  logger.fatal(err)
rescue SystemExit, Interrupt
  logger.fatal("SystemExit && Interrupt")
  exit!
rescue StandardError => err
  @broken_paths << path
  logger.fatal("Caught exception when visit `#{path}`")
  logger.fatal(err)
else
  @visited_paths << path
  execute_actions
  yield(@current_location) if block_given?
  excretion[:results] << @current_location
end

private :next_unvisited_path, :crawl_single_path

#create_action(name, &block) ⇒ Object

The spider can define custom actions that run while it is crawling.

Examples:

spider = MicroSpider.new

spider.create_action :save do |result|
  SomeClass.save(result)
end

spider.save

Parameters:

  • name (String, Symbol)

    the name of action

  • block (Proc)

    the actions



212
213
214
215
# File 'lib/micro_spider.rb', line 212

# Defines a custom action as a singleton method named +name+ on this spider.
#
# Calling the new method queues a lambda; when that lambda is executed it
# calls +block+ with the current location.
#
# @param name [String, Symbol] the name of the action method
# @yield [location] the body of the action
# @return [Symbol] the name of the defined method
def create_action(name, &block)
  define_singleton_method(name) do
    actions << -> { block.call(current_location) }
  end
end

#entrance(*path_or_paths) ⇒ Object

This will be the first path for the spider to visit. If there is more than one entrance, the spider will crawl them one by one.

Examples:

spider = MicroSpider.new
spider.site('http://google.com')
spider.entrance('/a')
spider.entrance('/b')

Parameters:

  • path_or_paths (String)

    one or more entrances



131
132
133
134
# File 'lib/micro_spider.rb', line 131

# Queues one or more starting paths for the crawl.
#
# Does nothing once @skip_set_entrance is set, which #crawl does after the
# first page so that replaying the recipe does not queue them again.
#
# @param path_or_paths [Array<String>] the paths to queue
# @return [Array, nil] the updated queue, or nil when entrances are skipped
def entrance(*path_or_paths)
  @paths += path_or_paths unless @skip_set_entrance
end

#entrance_on_path(path, pattern, opts = {}, &block) ⇒ Object

Sometimes the entrances are on the page.

Examples:

spider = MicroSpider.new
spider.entrance_on_path('http://google.com', '.links a')

Parameters:

  • path (String)

    path to visit

  • pattern (String, Regexp)

    links pattern



144
145
146
147
148
149
150
151
152
# File 'lib/micro_spider.rb', line 144

# Visits +path+ and queues the links found there as entrances.
#
# Elements are looked up with scan_all using opts[:kind] (:css when absent).
# Each element contributes its :href, or the block's return value when a
# block is given. Does nothing once @skip_set_entrance is set.
#
# @param path [String] the page that lists the entrances
# @param pattern [String, Regexp] the links pattern
# @param opts [Hash] lookup options; :kind selects the selector type
# @yield [element] optional conversion of a matched element into a path
# @return [Array, nil] the updated queue, or nil when entrances are skipped
def entrance_on_path(path, pattern, opts = {}, &block)
  return if @skip_set_entrance

  visit(path)
  matches = scan_all(opts[:kind] || :css, pattern, opts)
  discovered = matches.map { |element| block ? block.call(element) : element[:href] }
  @paths += discovered.to_a
end

#execute_actions ⇒ Object



217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/micro_spider.rb', line 217

# Runs the queued actions in order, each under the @timeout limit.
#
# The queue is walked with delete_if, so an action is removed from the queue
# when its call returns a truthy value and stays queued otherwise. When an
# action times out the failure is logged and the walk stops at that action.
#
# @return [Array, nil] the action queue, or nil when a timeout stopped the walk
def execute_actions
  actions.delete_if do |action|
    begin
      Timeout.timeout(@timeout) { action.call }
    rescue Timeout::Error => timeout_error
      logger.fatal('Timeout!!! execution expired when execute action')
      logger.fatal(timeout_error.message)
      logger.fatal(timeout_error.backtrace.inspect)
      break
    end
  end
end

#learn(recipe = nil, &block) ⇒ Object

Teach the spider behaviors and it will repeat to the end.

Examples:

spider = MicroSpider.new
spider.learn do
  entrance 'http://google.com'
end
spider.crawl
spider.learn("entrance 'http://google.com'")
spider.crawl
recipe = lambda {
  entrance 'http://google.com'
}
spider.learn(recipe)
spider.crawl

Parameters:

  • recipe (String, Proc) (defaults to: nil)

    the recipe to be learned.



101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/micro_spider.rb', line 101

# Teaches the spider a recipe by evaluating it in the spider's own context,
# then remembers it so #crawl can replay it for every page.
#
# A block takes precedence over +recipe+. The recipe is stored only after it
# has been evaluated.
#
# @param recipe [String, Proc, nil] Ruby source or a callable to evaluate
# @yield the recipe, given as a block
# @return [Proc, String, self] the stored recipe, or self when nothing
#   usable was given
def learn(recipe = nil, &block)
  source = block || recipe

  case source
  when Proc
    # instance_eval yields the receiver as a block argument, which a
    # zero-arity lambda rejects with ArgumentError; instance_exec passes none.
    if source.lambda? && source.arity.zero?
      instance_exec(&source)
    else
      instance_eval(&source)
    end
  when String
    # NOTE(review): this evaluates arbitrary Ruby source with full access to
    # the spider; only pass strings from trusted origins.
    instance_eval(source)
  else
    return self
  end

  @recipe = source
end

#metaclass ⇒ Object



249
250
251
# File 'lib/micro_spider.rb', line 249

# Returns this spider's singleton class, where per-instance methods such as
# those created by #create_action are defined.
#
# @return [Class] the singleton class of the receiver
def metaclass
  singleton_class
end

#results ⇒ Object



241
242
243
# File 'lib/micro_spider.rb', line 241

# Returns the results collected so far.
#
# Shortcut for the :results entry of the excretion, to which #crawl appends
# one entry per successfully processed page.
#
# @return [Array] the collected page results
def results
  excretion[:results]
end

#site(url) ⇒ Object



116
117
118
119
# File 'lib/micro_spider.rb', line 116

# Sets the base URL of the site to crawl; only the first call has an effect.
#
# The URL is remembered on the spider, stored in the excretion under :site
# and assigned to Capybara.app_host.
#
# @param url [String] the base URL
# @return [String, nil] the URL, or nil when a site was already set
def site(url)
  unless @site
    @site = url
    @excretion[:site] = url
    Capybara.app_host = url
  end
end

#spawn ⇒ Object



230
231
232
233
234
235
236
237
238
239
# File 'lib/micro_spider.rb', line 230

# Creates a child spider that shares this spider's configuration but starts
# with empty crawl state.
#
# The child is a clone, so instance variables that are not reset below are
# shared with the parent and singleton methods are carried over. Its queues,
# bookkeeping and excretion are replaced with fresh objects of the same
# kinds the constructor creates, and entrances are enabled again.
#
# @return [MicroSpider] the child spider
def spawn
  spider = clone

  fresh_state = {
    :@paths         => [],
    :@actions       => [],
    :@visited_paths => Set.new,
    # An Array, matching what the constructor sets up for broken paths.
    :@broken_paths  => [],
    :@excretion     => { status: 'inprogress', results: [] }
  }
  fresh_state.each { |ivar, value| spider.instance_variable_set(ivar, value) }

  spider.skip_set_entrance = false
  spider
end

#visit(path) ⇒ Object

Visit the path.

Examples:

Visit a path

spider = MicroSpider.new
spider.visit('/example')
spider.visit('http://google.com')

Parameters:

  • path (String)

    the path to visit, can be absolute path or relative path.



54
55
56
57
58
59
60
# File 'lib/micro_spider.rb', line 54

# Visits +path+ and makes it the spider's current location.
#
# Logs before and after the navigation and resets the current location to a
# fresh hash that records the path under :entrance.
#
# @param path [String] the path to visit, absolute or relative
# @return [Object] the result of the final logger.info call
def visit(path)
  # Defined outside this view; presumably pauses between requests — TODO confirm.
  sleep_or_not
  logger.info "Begin to visit #{path}."
  # Hand the navigation to the inherited visit (presumably Capybara::DSL#visit,
  # since that module is included).
  super(path)
  # Start a new result hash for this page; anything stored for the previous
  # page is no longer reachable through @current_location.
  @current_location = {entrance: path}
  logger.info "Current location is #{path}."
end