Class: MicroSpider

Inherits:
Object
  • Object
show all
Includes:
Capybara::DSL, SpiderCore::Behavior, SpiderCore::FieldDSL, SpiderCore::FollowDSL, SpiderCore::PaginationDSL
Defined in:
lib/micro_spider.rb

Instance Attribute Summary collapse

Attributes included from SpiderCore::PaginationDSL

#next_page, #skip_pages

Attributes included from SpiderCore::FollowDSL

#skip_followers

Instance Method Summary collapse

Methods included from SpiderCore::PaginationDSL

#keep_eyes_on_next_page

Methods included from SpiderCore::FollowDSL

#follow

Methods included from SpiderCore::FieldDSL

#css_field, #css_fields, #field, #fields, #foreach, #xpath_field, #xpath_fields

Constructor Details

#initialize(excretion = nil) ⇒ MicroSpider

Returns a new instance of MicroSpider.



37
38
39
40
41
42
43
44
45
46
# File 'lib/micro_spider.rb', line 37

# Builds a spider with empty queues and default settings.
#
# @param excretion [Hash, nil] result container to reuse; when it is nil a
#   fresh one is created with status 'inprogress' and an empty results list
def initialize(excretion = nil)
  excretion ||= { status: 'inprogress', results: [] }

  # Crawl bookkeeping.
  @paths         = []
  @visited_paths = Set.new
  @broken_paths  = []
  @excretion     = excretion

  # Behaviour and settings. @timeout is handed to Timeout::timeout by
  # #execute_actions, so it is expressed in seconds.
  @actions          = []
  @setted_variables = {}
  @timeout          = 120
  @logger           = Logger.new(STDOUT)
end

Instance Attribute Details

#actions ⇒ Object

Returns the value of attribute actions.



35
36
37
# File 'lib/micro_spider.rb', line 35

# Returns the list of queued actions (@actions).
# #initialize sets it to an empty Array; #click, #set and #create_action append
# lambdas to it and #execute_actions calls them.
def actions
  @actions
end

#broken_paths ⇒ Object (readonly)

Returns the value of attribute broken_paths.



34
35
36
# File 'lib/micro_spider.rb', line 34

# Returns the paths that could not be visited (@broken_paths).
# #crawl appends a path here whenever visiting it raises.
def broken_paths
  @broken_paths
end

#current_location ⇒ Object (readonly)

Returns the value of attribute current_location.



34
35
36
# File 'lib/micro_spider.rb', line 34

# Returns the Hash describing the page being processed (@current_location).
# #visit replaces it with { entrance: path }; it is nil before the first visit
# and after #reset.
def current_location
  @current_location
end

#delay ⇒ Object

Returns the value of attribute delay.



34
35
36
# File 'lib/micro_spider.rb', line 34

# Returns the configured delay (@delay).
# NOTE(review): nothing in the visible source assigns or reads @delay;
# presumably it is consumed by sleep_or_not before each visit. TODO confirm.
def delay
  @delay
end

#excretion ⇒ Object (readonly)

Returns the value of attribute excretion.



34
35
36
# File 'lib/micro_spider.rb', line 34

# Returns the crawl output Hash (@excretion).
# It carries :status ('inprogress' or 'completed') and :results; #site also
# stores the host under :site.
def excretion
  @excretion
end

#logger ⇒ Object

Returns the value of attribute logger.



35
36
37
# File 'lib/micro_spider.rb', line 35

# Returns the logger used for progress and error messages (@logger).
# #initialize defaults it to Logger.new(STDOUT).
def logger
  @logger
end

#paths ⇒ Object (readonly)

Returns the value of attribute paths.



34
35
36
# File 'lib/micro_spider.rb', line 34

# Returns the queue of paths still to be visited (@paths).
# #entrance and #entrance_on_path add to it; #crawl shifts paths off the front.
def paths
  @paths
end

#recipe ⇒ Object

Returns the value of attribute recipe.



35
36
37
# File 'lib/micro_spider.rb', line 35

# Returns the last recipe given to #learn (@recipe): a Proc or a String.
# #crawl re-learns it for every page whenever the action queue is empty.
def recipe
  @recipe
end

#skip_set_entrance ⇒ Object

Returns the value of attribute skip_set_entrance.



35
36
37
# File 'lib/micro_spider.rb', line 35

# Returns the flag that disables #entrance and #entrance_on_path
# (@skip_set_entrance). #crawl sets it to true after the first page so that
# re-learning the recipe does not queue the entrances again.
def skip_set_entrance
  @skip_set_entrance
end

#timeout ⇒ Object

Returns the value of attribute timeout.



35
36
37
# File 'lib/micro_spider.rb', line 35

# Returns the per-action time limit (@timeout).
# #execute_actions passes it to Timeout::timeout; #initialize defaults it to 120.
def timeout
  @timeout
end

#visited_paths ⇒ Object (readonly)

Returns the value of attribute visited_paths.



34
35
36
# File 'lib/micro_spider.rb', line 34

# Returns the Set of paths that were visited successfully (@visited_paths).
# #crawl skips queued paths already in this set.
def visited_paths
  @visited_paths
end

Instance Method Details

#click(locator, opts = {}, &block) ⇒ Object

Click the locator. This will trigger visit action and change current location.



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/micro_spider.rb', line 102

# Queues an action that follows the link matching +locator+.
#
# The link is looked up when the action runs, not when #click is called.
# Without a block the spider itself visits the link's href. With a block a
# spawned spider crawls the href using the block as its recipe, and its results
# are appended to current_location[:click].
#
# @param locator [String] passed to find_link
# @param opts [Hash] passed to find_link
# @return [Array] the action queue
# @raise [SpiderCore::ClickPathNotFound] from the queued action, when the link
#   cannot be found or has no href
def click(locator, opts = {}, &block)
  actions << lambda {
    # Any lookup failure is treated as "link not found".
    href =
      begin
        find_link(locator, opts)[:href]
      rescue StandardError
        nil
      end
    raise SpiderCore::ClickPathNotFound, "#{locator} not found" if href.nil?

    next visit(href) if block.nil?

    child = spawn
    child.entrance(href)
    child.learn(&block)
    current_location[:click] ||= []
    current_location[:click] << child.crawl[:results]
  }
end

#completed? ⇒ Boolean

Returns:

  • (Boolean)


295
296
297
# File 'lib/micro_spider.rb', line 295

# Tells whether the crawl has finished.
#
# @return [Boolean] true once #crawl has run out of paths and set
#   excretion[:status] to 'completed'
def completed?
  excretion[:status] == 'completed'
end

#crawl(&block) ⇒ Object



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# File 'lib/micro_spider.rb', line 192

# Visits every queued path once and runs the learned actions on each page.
#
# For each path that has not been visited yet the spider re-learns its recipe
# (when the action queue is empty), visits the path, executes the actions,
# yields the resulting location to the optional block and appends it to
# excretion[:results]. A path whose visit raises is recorded in broken_paths
# and skipped. When no path is left, excretion[:status] becomes 'completed'.
#
# The crawl is a loop rather than a recursive call made from +ensure+, so
# stack depth does not grow with the number of paths. An error raised while
# executing actions, or by the caller's block, propagates immediately once the
# per-page state has been reset; the remaining paths stay queued, so #crawl can
# be called again to continue.
#
# @yieldparam location [Hash] the current location after its actions ran
# @return [Hash] the excretion
def crawl(&block)
  until completed?
    @paths.compact!

    # Take the next queued path that has not been visited yet.
    path = @paths.shift
    path = @paths.shift until path.nil? || !@visited_paths.include?(path)

    if path.nil?
      excretion[:status] = 'completed'
      break
    end

    learn(@recipe) if @actions.empty?

    begin
      visit(path)
    rescue Timeout::Error => err
      @broken_paths << path
      logger.fatal("Timeout!!! execution expired when visit `#{path}`")
      logger.fatal(err)
    rescue SystemExit, Interrupt
      logger.fatal("SystemExit && Interrupt")
      exit!
    rescue StandardError => err
      # Only ordinary errors mark a path as broken. Fatal conditions such as
      # NoMemoryError or other signals are not swallowed.
      @broken_paths << path
      logger.fatal("Caught exception when visit `#{path}`")
      logger.fatal(err)
    else
      @visited_paths << path
      execute_actions
      block.call(@current_location) if block
      excretion[:results] << @current_location
    ensure
      # Reset per-page state. Entrances must not be queued again when the
      # recipe is re-learned for the next page.
      @actions = []
      @skip_set_entrance = true
    end
  end

  excretion
end

#create_action(name, &block) ⇒ Object

Spider can create custom action when it is crawling.

Examples:

spider = MicroSpider.new

spider.create_action :save do |result|
  SomeClass.save(result)
end

spider.save

Parameters:

  • name (String)

    the name of action

  • block (Proc)

    the actions



258
259
260
261
# File 'lib/micro_spider.rb', line 258

# Defines a custom action on this spider instance only.
#
# Calling the generated method queues a lambda; when that lambda is executed
# it calls +block+ with the spider's current location.
#
# @param name [String, Symbol] name of the method to define
# @param block [Proc] called with current_location when the action runs
def create_action(name, &block)
  define_singleton_method(name) do
    actions << -> { block.call(current_location) }
  end
end

#entrance(*path_or_paths) ⇒ Object

This will be the first path for the spider to visit. If there is more than one entrance, the spider will crawl them one by one.

Examples:

spider = MicroSpider.new
spider.site('http://google.com')
spider.entrance('/a')
spider.entrance('/b')

Parameters:

  • path_or_paths (String)

    one or more entrances



169
170
171
172
# File 'lib/micro_spider.rb', line 169

# Queues one or more paths for the spider to visit.
#
# Does nothing once @skip_set_entrance is set, which #crawl does after the
# first page.
#
# @param path_or_paths [Array<String>] one or more entrances
# @return [Array, nil] the new path queue, or nil when entrances are skipped
def entrance(*path_or_paths)
  @paths += path_or_paths unless @skip_set_entrance
end

#entrance_on_path(path, pattern, opts = {}, &block) ⇒ Object

Sometimes the entrances are on the page.

Examples:

spider = MicroSpider.new
spider.entrance_on_path('http://google.com', '.links a')

Parameters:

  • path (String)

    path to visit

  • pattern (String, Regexp)

    links pattern



182
183
184
185
186
187
188
189
190
# File 'lib/micro_spider.rb', line 182

# Collects entrances from links found on a page.
#
# Visits +path+, scans it for elements matching +pattern+ and queues one
# entrance per element: the block's return value when a block is given,
# otherwise the element's :href. Does nothing once @skip_set_entrance is set.
#
# @param path [String] page to visit
# @param pattern [String, Regexp] links pattern
# @param opts [Hash] forwarded to scan_all; :kind selects the selector type
#   and defaults to :css
# @yieldparam element each matched element
# @return [Array, nil] the new path queue, or nil when entrances are skipped
def entrance_on_path(path, pattern, opts = {}, &block)
  unless @skip_set_entrance
    selector_kind = opts[:kind] || :css
    visit(path)
    matches = scan_all(selector_kind, pattern, opts)
    found = matches.map do |element|
      if block_given?
        yield(element)
      else
        element[:href]
      end
    end
    @paths += found.to_a
  end
end

#execute_actions ⇒ Object



263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
# File 'lib/micro_spider.rb', line 263

# Runs the queued actions in order, each limited to @timeout seconds.
#
# Iteration uses delete_if, so an action whose call returns a truthy value is
# removed from the queue. A timeout or a SpiderCore::ClickPathNotFound is
# logged and stops the remaining actions.
#
# @return [Array, nil] the action queue, or nil when execution was aborted
def execute_actions
  # Logs an optional headline, then the error's message and backtrace.
  report = lambda do |error, headline = nil|
    logger.fatal(headline) unless headline.nil?
    logger.fatal(error.message)
    logger.fatal(error.backtrace.inspect)
  end

  actions.delete_if do |action|
    begin
      Timeout.timeout(@timeout) { action.call }
    rescue Timeout::Error => expired
      report.call(expired, 'Timeout!!! execution expired when execute action')
      break
    rescue SpiderCore::ClickPathNotFound => missing
      report.call(missing)
      break
    end
  end
end

#learn(recipe = nil, &block) ⇒ Object

Teach the spider behaviors and it will repeat to the end.

Examples:

spider = MicroSpider.new
spider.learn do
  entrance 'http://google.com'
end
spider.crawl
spider.learn("entrance 'http://google.com'")
spider.crawl
recipe = lambda {
  entrance 'http://google.com'
}
spider.learn(recipe)
spider.crawl

Parameters:

  • recipe (String, Proc) (defaults to: nil)

    the recipe to be learned.



139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/micro_spider.rb', line 139

# Teaches the spider a recipe by evaluating it in the spider's own context,
# and remembers it in @recipe so #crawl can replay it on every page.
#
# A block takes precedence over +recipe+. Anything that is neither a Proc nor
# a String is ignored.
#
# @param recipe [String, Proc, nil] the recipe to learn
# @return [Proc, String, MicroSpider] the learned recipe, or self when there
#   was nothing to learn
def learn(recipe = nil, &block)
  lesson = block || recipe

  case lesson
  when Proc
    if lesson.lambda? && lesson.arity.zero?
      # instance_eval yields the receiver as an argument, which a strict
      # zero-argument lambda rejects with ArgumentError. instance_exec passes
      # no arguments.
      instance_exec(&lesson)
    else
      instance_eval(&lesson)
    end
  when String
    # NOTE(security): the string is evaluated as Ruby code. Only learn
    # recipes that come from a trusted source.
    instance_eval(lesson)
  else
    return self
  end

  @recipe = lesson
end

#metaclass ⇒ Object



299
300
301
# File 'lib/micro_spider.rb', line 299

# Returns this spider's singleton class, where #create_action defines
# per-instance methods.
#
# @return [Class]
def metaclass
  singleton_class
end

#reset ⇒ Object



237
238
239
240
241
242
243
# File 'lib/micro_spider.rb', line 237

# Prepares a completed spider for another run.
#
# The previously visited paths become the new queue, and the results, visited
# set and current location are cleared. Does nothing unless the crawl is
# completed.
#
# @return [nil]
def reset
  if completed?
    revisit = visited_paths.to_a
    @visited_paths    = Set.new
    @excretion        = { status: 'inprogress', results: [] }
    @paths            = revisit
    @current_location = nil
  end
end

#results ⇒ Object



291
292
293
# File 'lib/micro_spider.rb', line 291

# Returns the locations collected so far, i.e. excretion[:results].
# #crawl appends one entry per successfully visited path.
def results
  excretion[:results]
end

#set(name, value, opts = {}, &block) ⇒ Object

Set a variable. You can use it later.

Examples:

Set a variable

spider = MicroSpider.new
spider.set :id, '645'
spider.set :table, '.tb a', selector: :css
spider.set :table, '.tb a', selector: :css do |e|
  e['src']
end

Parameters:

  • name (String)

    the variable name

  • value (String)

    the variable value

  • opts (Hash) (defaults to: {})

    the options. Use :selector (:css or :xpath) to read the value from the page.



87
88
89
90
91
92
93
94
95
96
97
# File 'lib/micro_spider.rb', line 87

# Sets a named variable for later use.
#
# Without a :selector option the value is stored immediately. With one, an
# action is queued that scans the page with +value+ as the pattern and stores
# either the block's result (the block is called with all matched elements) or
# the handled first element.
#
# The caller's +opts+ hash is left untouched; :selector is stripped from a
# copy before the remaining options are forwarded to scan_all.
#
# @param name [String, Symbol] the variable name
# @param value [String] the variable value, or the pattern to scan for
# @param opts [Hash] options; :selector picks :css or :xpath
# @yieldparam elements the elements matched by the pattern
def set(name, value, opts = {}, &block)
  selector = opts[:selector]
  if selector.nil?
    @setted_variables[name.to_s] = value
  else
    scan_opts = opts.reject { |key, _| key == :selector }
    actions << lambda {
      elements = scan_all(selector, value, scan_opts)
      @setted_variables[name.to_s] = block ? block.call(elements) : handle_element(elements.first)
    }
  end
end

#site(url) ⇒ Object



154
155
156
157
# File 'lib/micro_spider.rb', line 154

# Sets the base site once; later calls are ignored.
#
# The URL is remembered in @site, recorded in the excretion under :site and
# installed as Capybara.app_host.
#
# @param url [String] base URL of the site
# @return [String, nil] the URL, or nil when a site was already set
def site(url)
  unless @site
    @site = url
    @excretion[:site] = url
    Capybara.app_host = url
  end
end

#spawn ⇒ Object



280
281
282
283
284
285
286
287
288
289
# File 'lib/micro_spider.rb', line 280

# Creates a child spider that shares this spider's configuration but has its
# own empty crawl state.
#
# The child is a shallow clone, so instance variables that are not reset here
# still refer to the parent's objects. Paths, actions, visited and broken
# paths and the excretion start out empty, and entrances may be set again.
# @broken_paths is an Array, the same type #initialize uses.
#
# @return [MicroSpider] the child spider
def spawn
  spider = clone
  spider.instance_variable_set(:@paths, [])
  spider.instance_variable_set(:@actions, [])
  spider.instance_variable_set(:@visited_paths, Set.new)
  spider.instance_variable_set(:@broken_paths, [])
  spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
  spider.skip_set_entrance = false
  spider
end

#visit(path) ⇒ Object

Visit the path.

Examples:

Visit a path

spider = MicroSpider.new
spider.visit('/example')
spider.visit('http://google.com')

Parameters:

  • path (String)

    the path to visit, can be absolute path or relative path.

Raises:

  • (ArgumentError)


65
66
67
68
69
70
71
72
# File 'lib/micro_spider.rb', line 65

# Visits +path+ and makes it the current location.
#
# Calls sleep_or_not first, delegates the navigation to the inherited #visit,
# then replaces @current_location with { entrance: path }.
#
# @param path [String] the path to visit; absolute or relative
# @raise [ArgumentError] when path is nil or empty
def visit(path)
  if path.nil? || path.empty?
    raise ArgumentError, "Path can't be nil or empty"
  end

  sleep_or_not
  logger.info("Begin to visit #{path}.")
  super
  @current_location = { entrance: path }
  logger.info("Current location is #{path}.")
end