Class: MicroSpider

Inherits:
Object
  • Object
show all
Includes:
Capybara::DSL, SpiderCore::Behavior, SpiderCore::FieldDSL, SpiderCore::FollowDSL, SpiderCore::PaginationDSL
Defined in:
lib/micro_spider.rb

Instance Attribute Summary collapse

Attributes included from SpiderCore::PaginationDSL

#next_page, #skip_pages

Attributes included from SpiderCore::FollowDSL

#skip_followers

Instance Method Summary collapse

Methods included from SpiderCore::PaginationDSL

#keep_eyes_on_next_page

Methods included from SpiderCore::FollowDSL

#follow

Methods included from SpiderCore::FieldDSL

#field, #fields, #foreach

Constructor Details

#initialize(excretion = nil, selector: :css) ⇒ MicroSpider

Returns a new instance of MicroSpider.



38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/micro_spider.rb', line 38

# Builds a spider in the 'pending' state with empty path queues and no
# learned actions.
#
# @param excretion [Object, nil] container that collects crawl results; a new
#   SpiderCore::Excretion is created when nil (or false) is given
# @param selector [Symbol] stored as the spider's selector (defaults to :css)
def initialize(excretion = nil, selector: :css)
  # Crawl bookkeeping.
  @status        = 'pending'
  @paths         = []
  @visited_paths = Set.new
  @broken_paths  = []

  # State filled in while a recipe is learned and executed.
  @actions          = []
  @setted_variables = {}
  @selector         = selector

  # Runtime configuration; 120 is passed to Timeout.timeout by #execute_actions.
  @timeout = 120
  @logger  = Logger.new(STDOUT)

  @excretion = excretion || SpiderCore::Excretion.new
end

Instance Attribute Details

#actionsObject

Returns the value of attribute actions.



36
37
38
# File 'lib/micro_spider.rb', line 36

# @return [Array] the queued actions (lambdas) stored in @actions
def actions; @actions; end

#broken_pathsObject (readonly)

Returns the value of attribute broken_paths.



35
36
37
# File 'lib/micro_spider.rb', line 35

# @return [Array] paths recorded in @broken_paths (read-only attribute)
def broken_paths; @broken_paths; end

#current_locationObject (readonly)

Returns the value of attribute current_location.



35
36
37
# File 'lib/micro_spider.rb', line 35

# @return [Object, nil] the value of @current_location (read-only attribute)
def current_location; @current_location; end

#delayObject

Returns the value of attribute delay.



35
36
37
# File 'lib/micro_spider.rb', line 35

# @return [Object, nil] the value of @delay (not assigned by the constructor)
def delay; @delay; end

#excretionObject (readonly)

Returns the value of attribute excretion.



35
36
37
# File 'lib/micro_spider.rb', line 35

# @return [Object] the result container stored in @excretion (read-only attribute)
def excretion; @excretion; end

#loggerObject

Returns the value of attribute logger.



36
37
38
# File 'lib/micro_spider.rb', line 36

# @return [Object] the logger stored in @logger
def logger; @logger; end

#pathsObject (readonly)

Returns the value of attribute paths.



35
36
37
# File 'lib/micro_spider.rb', line 35

# @return [Array] the queue of paths stored in @paths (read-only attribute)
def paths; @paths; end

#recipeObject

Returns the value of attribute recipe.



36
37
38
# File 'lib/micro_spider.rb', line 36

# @return [Proc, String, nil] the recipe stored in @recipe
def recipe; @recipe; end

#selectorObject

Returns the value of attribute selector.



36
37
38
# File 'lib/micro_spider.rb', line 36

# @return [Symbol] the selector stored in @selector
def selector; @selector; end

#skip_set_entranceObject

Returns the value of attribute skip_set_entrance.



36
37
38
# File 'lib/micro_spider.rb', line 36

# @return [Boolean, nil] the flag stored in @skip_set_entrance
def skip_set_entrance; @skip_set_entrance; end

#timeoutObject

Returns the value of attribute timeout.



36
37
38
# File 'lib/micro_spider.rb', line 36

# @return [Numeric] the value stored in @timeout
def timeout; @timeout; end

#visited_pathsObject (readonly)

Returns the value of attribute visited_paths.



35
36
37
# File 'lib/micro_spider.rb', line 35

# @return [Set] the collection stored in @visited_paths (read-only attribute)
def visited_paths; @visited_paths; end

Instance Method Details

#click(locator, opts = {}, &block) ⇒ Object

Click the locator. This will trigger visit action and change current location.



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/micro_spider.rb', line 104

# Queues an action that resolves +locator+ to a link and follows its href.
#
# Without a block the spider itself visits the link. With a block a spawned
# spider is sent to the link, taught the block, crawled, and its result is
# stored through #put under the key "click::<href>".
#
# @param locator [String] passed to find_link
# @param opts [Hash] passed to find_link
# @return [Array] the actions queue with the new action appended
# @raise [SpiderCore::ClickPathNotFound] when the action runs and no href
#   could be resolved (any StandardError during lookup counts as "not found")
def click(locator, opts = {}, &block)
  follow_link = lambda do
    href =
      begin
        find_link(locator, opts)[:href]
      rescue StandardError
        nil
      end
    raise SpiderCore::ClickPathNotFound, "#{locator} not found" if href.nil?

    # The value of the branch is the action's result, which #execute_actions
    # passes to delete_if, so both branches must stay the last expression.
    if block.nil?
      visit(href)
    else
      child = spawn
      child.entrance(href)
      child.learn(&block)
      put("click::#{href}", child.crawl)
    end
  end
  actions << follow_link
end

#completed?Boolean

Returns:

  • (Boolean)


312
313
314
# File 'lib/micro_spider.rb', line 312

# @return [Boolean] true when the spider's status equals 'completed'
def completed?
  state = @status
  state == 'completed'
end

#crawl(&block) ⇒ Object



199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# File 'lib/micro_spider.rb', line 199

# Visits the next queued path that has not been visited yet, runs the learned
# actions against it, stores the result in the excretion, and then recurses
# until the queue is empty.
#
# @param block [Proc] only forwarded to the recursive call; the yield that
#   would invoke it is commented out below
# @return [Object] the value of #excretion
def crawl(&block)
  return excretion if completed?

  # Drop nil entries, then shift paths off the queue until one is found that
  # is not in @visited_paths (or the queue runs dry, leaving path nil).
  @paths.compact!
  path = nil
  loop do
    path = @paths.shift
    break if path.nil?
    break unless @visited_paths.include?(path)
  end

  # Nothing left to crawl. `complete` is defined outside this snippet;
  # presumably it marks the spider as completed -- TODO confirm.
  if path.nil?
    complete
    return excretion
  end

  # @actions is emptied in the ensure clause below, so the stored recipe is
  # re-evaluated to queue the actions again for this path.
  learn(@recipe) if @actions.empty?

  begin
    visit(path)
    @status = 'inprogress'
  rescue Timeout::Error => err
    @broken_paths << path
    logger.fatal("Timeout!!! execution expired when visit `#{path}`")
    logger.fatal(err)
  rescue SystemExit, Interrupt
    logger.fatal("SystemExit && Interrupt")
    @status = 'exit'
    # Kernel#exit! terminates the process immediately.
    exit!
  # NOTE(review): rescuing Exception also swallows non-StandardError failures
  # (e.g. NoMemoryError, SignalException other than Interrupt) -- confirm this
  # breadth is intended.
  rescue Exception => err
    @broken_paths << path
    logger.fatal("Caught exception when visit `#{path}`")
    logger.fatal(err)
    logger.fatal(err.backtrace.join("\n"))
  else
    # Runs only when visit succeeded. Exceptions raised here are NOT handled
    # by the rescue clauses above; they propagate after the ensure clause.
    @visited_paths << path
    execute_actions
    #yield(@current_location) if block_given?
    @excretion = @excretion.put(path, @current_location)
  ensure
    @actions = []
    # From now on #entrance and #entrance_on become no-ops, so re-learning the
    # recipe does not queue the entrances again.
    @skip_set_entrance = true
    # NOTE(review): recursing from ensure adds stack frames for every crawled
    # path, so a very long queue may exhaust the stack -- consider a loop.
    crawl(&block)
  end

  excretion
end

#create_action(name, &block) ⇒ Object

Spider can create custom action when it is crawling.

Examples:

spider = MicroSpider.new

spider.create_action :save do |result|
  SomeClass.save(result)
end

spider.save

Parameters:

  • name (String)

    the name of action

  • block (Proc)

    the actions



269
270
271
272
# File 'lib/micro_spider.rb', line 269

# Defines a method called +name+ on this spider instance only. Calling that
# method queues an action which, when executed, invokes +block+ with the
# spider's current @excretion.
#
# @param name [String, Symbol] name of the method to define
# @param block [Proc] receives @excretion when the queued action runs
# @return [Symbol] the name of the defined method
def create_action(name, &block)
  define_singleton_method(name) do
    actions << lambda { block.call(@excretion) }
  end
end

#entrance(*path_or_paths) ⇒ Object

This will be the first path for the spider to visit. If there is more than one entrance, the spider will crawl them one by one.

Examples:

spider = MicroSpider.new
spider.site('http://google.com')
spider.entrance('/a')
spider.entrance('/b')

Parameters:

  • path_or_paths (String)

    one or more entrances



172
173
174
175
# File 'lib/micro_spider.rb', line 172

# Appends one or more entrance paths to the queue, unless entrances have been
# locked via @skip_set_entrance.
#
# @param path_or_paths [Array<String>] one or more entrances
# @return [Array, nil] the new path queue, or nil when entrances are locked
def entrance(*path_or_paths)
  @paths = @paths + path_or_paths unless @skip_set_entrance
end

#entrance_on(pattern, path: '/', attr: :href) ⇒ Object

Sometimes the entrances are on the page.

Examples:

spider = MicroSpider.new
spider.entrance_on('.links a')
spider.entrance_on('.links a', path: '/a')

Parameters:

  • path (String) (defaults to: '/')

    path to visit

  • pattern (String, Regexp)

    links pattern



191
192
193
194
195
196
197
# File 'lib/micro_spider.rb', line 191

# Visits +path+, collects the +attr+ attribute of every element matching
# +pattern+, and appends the collected values to the path queue. Does nothing
# once entrances have been locked via @skip_set_entrance.
#
# @param pattern [String, Regexp] links pattern handed to scan_all
# @param path [String] page to visit before scanning (defaults to '/')
# @param attr [Symbol] attribute read from each matched element (defaults to :href)
# @return [Array, nil] the new path queue, or nil when entrances are locked
def entrance_on(pattern, path: '/', attr: :href)
  unless @skip_set_entrance
    visit(path)
    targets = scan_all(pattern).map { |node| node[attr] }
    @paths = @paths + targets.to_a
  end
end

#execute_actionsObject



274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# File 'lib/micro_spider.rb', line 274

# Runs every queued action, each guarded by a @timeout-second limit.
#
# An action is removed from the queue when it returns a truthy value (this is
# Array#delete_if semantics). When an action times out or raises
# SpiderCore::ClickPathNotFound, the failure is logged, the most recently
# recorded visited path is removed again, and the remaining actions are
# skipped.
#
# @return [Array, nil] the actions queue, or nil when execution was aborted
def execute_actions
  actions.delete_if { |action|
    begin
      Timeout.timeout(@timeout) { action.call }
    rescue Timeout::Error => err
      logger.fatal('Timeout!!! execution expired when execute action')
      logger.fatal(err.message)
      logger.fatal(err.backtrace.inspect)
      # @visited_paths is a Set, which has no #pop; a Set keeps insertion
      # order, so the last element is the path added most recently.
      @visited_paths.delete(@visited_paths.to_a.last)
      break
    rescue SpiderCore::ClickPathNotFound => err
      logger.fatal(err.message)
      logger.fatal(err.backtrace.inspect)
      @visited_paths.delete(@visited_paths.to_a.last)
      break
    end
  }
end

#get(field) ⇒ Object



320
321
322
323
324
325
# File 'lib/micro_spider.rb', line 320

# Looks up +field+ anywhere inside the excretion using Hashie's DeepFind
# extension. The extended excretion is memoized in @_deep_fetch.
#
# @param field [String, Symbol] key to search for (converted with to_s)
# @return [Object, Array, nil] nil when deep_find_all returns nil, the single
#   match when exactly one was found, otherwise the array of matches
def get(field)
  @_deep_fetch ||= excretion.extend(Hashie::Extensions::DeepFind)
  matches = @_deep_fetch.deep_find_all(field.to_s)
  return nil if matches.nil?
  return matches unless matches.length == 1

  matches.pop
end

#learn(recipe = nil, &block) ⇒ Object

Teach the spider behaviors and it will repeat to the end.

Examples:

spider = MicroSpider.new
spider.learn do
  entrance 'http://google.com'
end
spider.crawl
spider.learn("entrance 'http://google.com'")
spider.crawl
recipe = lambda {
  entrance 'http://google.com'
}
spider.learn(recipe)
spider.crawl

Parameters:

  • recipe (String, Proc) (defaults to: nil)

    the recipe be learned.



142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/micro_spider.rb', line 142

# Teaches the spider a recipe by evaluating it in the context of this
# instance, and remembers it in @recipe so it can be evaluated again later.
#
# A block takes precedence over the +recipe+ argument. Anything that is
# neither a Proc nor a String is ignored and leaves @recipe untouched.
#
# NOTE(review): a String recipe is evaluated as Ruby code via instance_eval;
# only pass strings from trusted sources.
#
# @param recipe [String, Proc, nil] the recipe to learn when no block is given
# @param block [Proc] the recipe given as a block
# @return [Proc, String, MicroSpider] the learned recipe, or self when
#   nothing was learned
def learn(recipe = nil, &block)
  source = block || recipe

  case source
  when Proc
    # instance_eval yields self as a block argument, which a zero-argument
    # lambda rejects with ArgumentError; instance_exec passes no arguments.
    if source.lambda? && source.arity.zero?
      instance_exec(&source)
    else
      instance_eval(&source)
    end
  when String
    instance_eval(source)
  else
    return self
  end

  @recipe = source
end

#metaclassObject



316
317
318
# File 'lib/micro_spider.rb', line 316

# @return [Class] this object's singleton class
def metaclass
  singleton_class
end

#pageObject

The default page is Capybara.current_session. Sharing one page between spiders can cause hard-to-debug issues, so each spider gets its own session here.



329
330
331
# File 'lib/micro_spider.rb', line 329

# Returns this spider's own Capybara session, creating it on first use from
# Capybara.mode and Capybara.app.
#
# @return [Capybara::Session] the memoized session
def page
  return @page if @page

  @page = Capybara::Session.new(Capybara.mode, Capybara.app)
end

#resetObject



247
248
249
250
251
252
253
254
# File 'lib/micro_spider.rb', line 247

# Rewinds a completed spider so it can crawl again: the visited paths become
# the new queue, the status returns to 'pending', and collected results are
# discarded. Does nothing unless the spider is completed.
#
# @return [nil]
def reset
  return unless completed?

  @paths            = visited_paths.to_a
  @status           = 'pending'
  # #crawl calls @excretion.put, so start over with an empty container (as the
  # constructor does) rather than nil.
  @excretion        = SpiderCore::Excretion.new
  # Drop the lookup object memoized by #get; it wraps the old excretion.
  @_deep_fetch      = nil
  @visited_paths    = Set.new
  @current_location = nil
end

#set(name, value) ⇒ Object

Set a variable. You can use it later.

Examples:

Set a variable

spider = MicroSpider.new
spider.set :id, '645'
spider.set :table, '.tb a', selector: :css
spider.set :table, '.tb a', selector: :css do |e|
  e['src']
end

Parameters:

  • name (String)

    the variable name

  • value (String)

    the variable value

  • opts (Hash)

    the options. can set selector with css or xpath



90
91
92
# File 'lib/micro_spider.rb', line 90

# Stores +value+ under the string form of +name+ in the spider's variables.
#
# @param name [String, Symbol] the variable name (converted with to_s)
# @param value [Object] the variable value
# @return [Object] the stored value
def set(name, value)
  @setted_variables.store(name.to_s, value)
end

#set_on(name, pattern, &block) ⇒ Object



94
95
96
97
98
99
# File 'lib/micro_spider.rb', line 94

# Queues an action that finds the first element matching +pattern+ and stores
# a value derived from it under the string form of +name+. The value is the
# block's result when a block is given, otherwise handle_element's result.
#
# @param name [String, Symbol] the variable name (converted with to_s)
# @param pattern [String, Regexp] pattern handed to scan_first
# @param block [Proc] optional converter receiving the matched element
# @return [Array] the actions queue with the new action appended
def set_on(name, pattern, &block)
  capture = lambda do
    node  = scan_first(pattern)
    value = block ? block.call(node) : handle_element(node)
    @setted_variables[name.to_s] = value
  end
  actions << capture
end

#site(url) ⇒ Object



157
158
159
160
# File 'lib/micro_spider.rb', line 157

# Sets the base URL for this spider and for Capybara.app_host. Only the first
# call has an effect; once @site is set, later calls are ignored.
#
# @param url [String] the site's base URL
# @return [String, nil] the url when it was applied, nil when a site was already set
def site(url)
  unless @site
    @site = url
    Capybara.app_host = url
  end
end

#spawn(&block) ⇒ Object

Examples:

spider = MicroSpider.new
kid = spider.spawn

or

kid = spider.spawn do
  ...
  ...
end


303
304
305
306
307
308
309
310
# File 'lib/micro_spider.rb', line 303

# Creates a new spider of the same class that shares this spider's logger,
# timeout, and site, optionally teaching it a recipe.
#
# @param block [Proc] optional recipe passed to the child's #learn
# @return [MicroSpider] the new spider
def spawn(&block)
  spider         = self.class.new
  spider.logger  = logger
  spider.timeout = timeout
  # #site assigns Capybara.app_host globally, so only forward a site that was
  # actually set; forwarding nil would reset the global host to nil.
  spider.site(@site) if @site
  spider.learn(&block) if block_given?
  spider
end

#suicideObject

Because the page is not shared, its connection may or may not be killed when the spider is done, which can consume too much memory. Call this to make the spider instance kill its own connection. For now this applies specifically to `capybara-webkit`.



336
337
338
339
340
341
# File 'lib/micro_spider.rb', line 336

# Releases this spider's page. In 'webkit' mode the driver's connection
# process is killed first (via the browser's @connection instance variable).
# Safe to call when no page was ever created.
#
# @return [nil]
def suicide
  # @page is created lazily by #page, so it may still be nil here.
  if @page && Capybara.mode.to_s == 'webkit'
    @page.driver.browser.instance_variable_get(:@connection).send :kill_process
  end
  @page = nil
end

#visit(path) ⇒ Object

Visit the path.

Examples:

Visit a path

spider = MicroSpider.new
spider.visit('/example')
spider.visit('http://google.com')

Parameters:

  • path (String)

    the path to visit, can be absolute path or relative path.

Raises:

  • (ArgumentError)


68
69
70
71
72
73
74
75
# File 'lib/micro_spider.rb', line 68

# Visits +path+ through the inherited visit implementation and records it as
# the current location.
#
# @param path [String] the path to visit, absolute or relative
# @return [Object] the result of the final logger call
# @raise [ArgumentError] when path is nil or empty
def visit(path)
  if path.nil? || path.empty?
    raise ArgumentError, "Path can't be nil or empty"
  end

  sleep_or_not
  logger.info("Begin to visit #{path}.")
  super(path)
  # The current location starts out holding only the visited path.
  @current_location = SpiderCore::Excretion['_path' => path]
  logger.info("Current location is #{path}.")
end

#with(pattern, path:, &block) ⇒ Object



177
178
179
180
# File 'lib/micro_spider.rb', line 177

# Visits +path+ and yields every element matching +pattern+ to the block.
#
# @param pattern [String, Regexp] pattern handed to scan_all
# @param path [String] the page to visit first (required keyword)
# @yield [element] each matched element
# @return [Array] the block's results, one per matched element
def with(pattern, path:, &block)
  visit(path)
  nodes = scan_all(pattern)
  nodes.map { |node| yield node }
end