Class: DaimonSkycrawlers::Processor::Spider

Inherits:
Base
  • Object
show all
Defined in:
lib/daimon_skycrawlers/processor/spider.rb

Overview

Web spider class. By default, it extracts all links and follows them.

Examples:

Google search result (2016-11-29)

# Build a spider and set it up inside the block given to #configure.
spider = DaimonSkycrawlers::Processor::Spider.new
spider.configure do |s|
  # Selector for the result links (an XPath or CSS selector can be used).
  s.link_rules = ".g .r a"
  # Read the URL from the "data-href" attribute instead of the default "href".
  s.extract_link do |element|
    element["data-href"]
  end
  # Extra data merged into the message enqueued for each extracted link.
  s.link_message = { next: "detail" }
  # Selector for the "next page" link.
  s.next_page_link_rules = "a#pnnext"
  # Extra data merged into the message enqueued for the next page.
  s.next_page_link_message = { next: "spider" }
end

Instance Attribute Summary collapse

Attributes inherited from Base

#storage

Instance Method Summary collapse

Methods inherited from Base

#process

Methods included from Configurable

#configure

Methods included from Callbacks

#after_process, #before_process, #clear_after_process_callbacks, #clear_before_process_callbacks, #run_after_process_callbacks, #run_before_process_callbacks

Constructor Details

#initialize ⇒ Spider



45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 45

# Set up a spider with its defaults: every "a" element is a link candidate
# and the URL is read from the element's "href" attribute.
def initialize
  super

  # Filters registered through #append_link_filter.
  @link_filters = []

  # Per-page state; @doc is assigned by #call. NOTE(review): @links is
  # presumably filled by the link lookup, which is not shown here - confirm.
  @doc = nil
  @links = nil

  # When true, found links are enqueued.
  @enqueue = true

  # Ordinary links.
  @link_rules = %w[a]
  @extract_link = ->(node) { node["href"] }
  @link_message = {}

  # "Next page" link; no rule is set by default.
  @next_page_link_rules = nil
  @extract_next_page_link = ->(node) { node["href"] }
  @next_page_link_message = {}
end

Instance Attribute Details

#enqueue ⇒ Object

If true, enqueue the links that were found.



34
35
36
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 34

# Returns the current value of @enqueue (set to true by the constructor).
def enqueue
  @enqueue
end

Specify a hash literal to propagate arbitrary data to the next crawler/processor. This is for filtering messages before the crawler/processor processes them.



43
44
45
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 43

# Stores the hash that #call merges into the message built for every
# extracted link.
#
# @param value [Hash] extra message data; because it is merged last, keys
#   given here override the ones #call builds itself (e.g. :depth)
def link_message=(value)
  @link_message = value
end

Same as Nokogiri::XML::DocumentFragment#search. In general, you can set an XPath or CSS selector.



34
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 34

# Defines reader and writer methods for the enqueue, link_rules and
# next_page_link_rules attributes.
attr_accessor :enqueue, :link_rules, :next_page_link_rules

Sets the attribute next_page_link_message



43
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 43

# Defines writer-only methods for link_message and next_page_link_message;
# no reader methods are generated for them.
attr_writer :link_message, :next_page_link_message

Same as Nokogiri::XML::DocumentFragment#search. In general, you can set an XPath or CSS selector.



34
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 34

# Defines reader and writer methods for the enqueue, link_rules and
# next_page_link_rules attributes.
attr_accessor :enqueue, :link_rules, :next_page_link_rules

Instance Method Details

Append a filter to reduce the links found by link_rules.

Yields:

  • (message)

    Similar to Array#select

Yield Parameters:

  • message (Hash)


66
67
68
69
70
71
72
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 66

# Register a link filter, given either as a block or as a callable object.
#
# A block takes precedence: when one is given, +filter+ is ignored. Without
# a block, +filter+ is stored only if it responds to #call; anything else is
# silently skipped.
#
# @param filter [#call, nil] callable used when no block is given
# @yield [message] filter body
# @yieldparam message [Hash]
# @return [Array, nil] the filter list after appending, or nil when nothing
#   was appended
def append_link_filter(filter = nil, &block)
  return @link_filters << block if block
  return unless filter.respond_to?(:call)

  @link_filters << filter
end

#call(message) ⇒ Object



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 101

# Process a crawled page: read it from storage, parse it, then enqueue the
# links it contains and, if there is one, the next-page link.
#
# @param message [Hash] message describing the page. :depth is read from it
#   (2 is used when it is absent), :url and :key are used in the warning
#   log, and the whole hash is handed to storage.read.
# @return [Object, nil] nil when the depth is 1 or less, when the page
#   cannot be read, or when there is no next-page link; otherwise whatever
#   enqueue_url returns for the next-page link
# @raise [ArgumentError, TypeError] when :depth cannot be converted by Integer()
def call(message)
  depth = Integer(message[:depth] || 2)
  # Nothing further is followed once the depth has dropped to 1.
  return if depth <= 1

  unless (page = storage.read(message))
    log.warn("Could not read page: url=#{message[:url]}, key=#{message[:key]}")
    return
  end

  @doc = Nokogiri::HTML(page.body)
  # Every message enqueued from this page carries one level less of depth.
  base_message = { depth: depth - 1 }

  # Configured link data is merged last, so it can override the base keys.
  message_for_links = base_message.merge(@link_message)
  links.each { |url| enqueue_url(url, message_for_links) }

  following_url = next_page_link
  enqueue_url(following_url, base_message.merge(@next_page_link_message)) if following_url
end

Register block to process element found by DaimonSkycrawlers::Processor::Spider#link_rules

Examples:

Default

->(element) { element["href"] }

Yields:

  • (element)

Yield Parameters:

  • element (Object)


82
83
84
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 82

# Register the block used to process each element found by link_rules.
# It replaces the extractor stored before (by default one that reads the
# element's "href").
#
# @yield [element]
# @yieldparam element [Object]
# @return [Proc, nil] the stored block; nil when called without a block,
#   which also clears the stored extractor
def extract_link(&extractor)
  @extract_link = extractor
end

Register block to process element found by DaimonSkycrawlers::Processor::Spider#next_page_link_rules

Examples:

Default

->(element) { element["href"] }

Yields:

  • (element)

Yield Parameters:

  • element (Object)


94
95
96
# File 'lib/daimon_skycrawlers/processor/spider.rb', line 94

# Register the block used to process the element found by
# next_page_link_rules. It replaces the extractor stored before (by default
# one that reads the element's "href").
#
# @yield [element]
# @yieldparam element [Object]
# @return [Proc, nil] the stored block; nil when called without a block,
#   which also clears the stored extractor
def extract_next_page_link(&extractor)
  @extract_next_page_link = extractor
end