Class: Anemone::Core

Inherits:
Object
  • Object
show all
Defined in:
lib/anemone/core.rb

Constant Summary

# Default value for every crawl option.
# NOTE(review): presumably merged with the caller-supplied opts in
# +process_options+ (not visible here) - confirm.
DEFAULT_OPTS =
{
  # number of Tentacle threads started to fetch pages
  :threads => 4,
  # verbose output is off by default
  :verbose => false,
  # keep (don't throw away) the page response body after scanning it for links
  :discard_page_bodies => false,
  # identify self as Anemone/VERSION
  :user_agent => "Anemone/#{Anemone::VERSION}",
  # no delay between requests
  :delay => 0,
  # don't obey the robots exclusion protocol
  :obey_robots_txt => false,
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5,
  # storage engine defaults to Hash in +process_options+ if none specified
  :storage => nil,
  # Hash of cookie name => value to send with HTTP requests
  :cookies => nil,
  # accept cookies from the server and send them back?
  :accept_cookies => false,
  # skip any link with a query string? e.g. http://foo.com/?u=user
  :skip_query_strings => false,
  # proxy server hostname
  :proxy_host => nil,
  # proxy server port number (note: the default is false, not nil)
  :proxy_port => false,
  # HTTP read timeout in seconds
  :read_timeout => nil
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(urls, opts = {}) {|_self| ... } ⇒ Core

Initialize the crawl with starting urls (single URL or Array of URLs) and optional block

Yields:

  • (_self)

Yield Parameters:

  • _self (Anemone::Core)

    the object that the method was called on



72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/anemone/core.rb', line 72

# Initialize the crawl with starting urls (single URL or Array of URLs)
# and an optional block.
#
# urls - a single URL or a (possibly nested) Array of URLs; each entry is
#        either a URI or something Kernel#URI can parse
# opts - Hash of crawl options, stored as given
#
# Starting URLs with an empty path are normalised to '/'. URI objects passed
# in by the caller are copied first, so the caller's objects are never
# modified (and frozen URIs are accepted).
#
# Yields the new instance when a block is given.
def initialize(urls, opts = {})
  @urls = [urls].flatten.map do |url|
    # work on a copy: the path assignment below must not leak to the caller
    uri = url.is_a?(URI) ? url.dup : URI(url)
    uri.path = '/' if uri.path.empty?
    uri
  end

  @tentacles = []
  @on_every_page_blocks = []
  # each pattern gets its own Array of blocks on first access
  @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts

  yield self if block_given?
end

Instance Attribute Details

#opts ⇒ Object (readonly)

Hash of options for the crawl



26
27
28
# File 'lib/anemone/core.rb', line 26

# Hash of options for the crawl (returns the @opts instance variable).
def opts
  @opts
end

#pages ⇒ Object (readonly)

PageStore storing all Page objects encountered during the crawl



24
25
26
# File 'lib/anemone/core.rb', line 24

# PageStore storing all Page objects encountered during the crawl
# (returns the @pages instance variable).
def pages
  @pages
end

Class Method Details

.crawl(urls, opts = {}) ⇒ Object

Convenience method to start a new crawl



89
90
91
92
93
94
# File 'lib/anemone/core.rb', line 89

# Convenience method to start a new crawl.
#
# Builds a new instance from +urls+ and +opts+, hands it to the caller's
# block (if any) so the crawl can be configured, then runs it.
# Returns the new instance.
def self.crawl(urls, opts = {})
  new(urls, opts) do |crawler|
    yield(crawler) if block_given?
    crawler.run
  end
end

Instance Method Details

#after_crawl(&block) ⇒ Object

Add a block to be executed on the PageStore after the crawl is finished



100
101
102
103
# File 'lib/anemone/core.rb', line 100

# Register a block to be executed on the PageStore after the crawl
# is finished. Returns self so registrations can be chained.
def after_crawl(&block)
  @after_crawl_blocks.push(block)
  self
end

#focus_crawl(&block) ⇒ Object

Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.



140
141
142
143
# File 'lib/anemone/core.rb', line 140

# Set the block which selects the links to follow on each page.
# The block should return an Array of URI objects. A later call
# replaces the block stored by an earlier one. Returns self.
def focus_crawl(&link_selector)
  @focus_crawl_block = link_selector
  self
end

#on_every_page(&block) ⇒ Object

Add a block to be executed on every Page as it is encountered during the crawl



118
119
120
121
# File 'lib/anemone/core.rb', line 118

# Register a block to be executed on every Page as it is encountered
# during the crawl. Returns self so registrations can be chained.
def on_every_page(&block)
  @on_every_page_blocks.push(block)
  self
end

#on_pages_like(*patterns, &block) ⇒ Object

Add a block to be executed on Page objects with a URL matching one or more patterns



127
128
129
130
131
132
133
134
# File 'lib/anemone/core.rb', line 127

# Add a block to be executed on Page objects with a URL matching one or
# more patterns.
#
# patterns - one or more patterns; nested Arrays are flattened and nil
#            entries dropped, the same way skip_links_like treats its
#            arguments
# block    - appended to the list kept for each pattern in
#            @on_pages_like_blocks
#
# Returns self so registrations can be chained.
def on_pages_like(*patterns, &block)
  # a splat is always an Array, so no nil guard is needed here
  patterns.flatten.compact.each do |pattern|
    @on_pages_like_blocks[pattern] << block
  end
  self
end

#run ⇒ Object

Perform the crawl



148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/anemone/core.rb', line 148

# Perform the crawl.
#
# Starts @opts[:threads] worker threads, each running a Tentacle over a
# shared link queue and page queue, seeds the link queue with the starting
# URLs, then consumes fetched pages on the calling thread until no work is
# left. Finally joins the workers and runs the after-crawl blocks.
#
# Returns self after a completed crawl. Returns nil, without starting any
# thread or running the after-crawl blocks, when no starting URL passes
# visit_link?.
def run
  # NOTE(review): @pages is not assigned anywhere in the visible code;
  # presumably process_options sets it up and fills in option defaults - confirm
  process_options

  # drop starting URLs that visit_link? rejects
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

  link_queue = Queue.new
  page_queue = Queue.new

  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end

  # seed URLs are enqueued bare; followed links below are enqueued as
  # [link, source page URL, depth] triples
  @urls.each{ |url| link_queue.enq(url) }

  loop do
    # blocks until a worker has put a page on the page queue
    page = page_queue.deq
    @pages.touch_key page.url
    puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
    do_page_blocks page
    page.discard_doc! if @opts[:discard_page_bodies]

    links = links_to_follow page
    links.each do |link|
      # the page URL is dup'ed so the queue entry has its own copy
      link_queue << [link, page.url.dup, page.depth + 1]
    end
    @pages.touch_keys links

    @pages[page.url] = page

    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      # wait until every worker thread is blocked waiting on the link queue
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      # re-check: a page may have been enqueued while we were waiting
      if page_queue.empty?
        # one :END marker per worker; presumably Tentacle#run stops when it
        # dequeues it - confirm
        @tentacles.size.times { link_queue << :END }
        break
      end
    end
  end

  @tentacles.each { |thread| thread.join }
  do_after_crawl_blocks
  self
end

Add one or more Regex patterns for URLs which should not be followed



109
110
111
112
# File 'lib/anemone/core.rb', line 109

# Add one or more Regex patterns for URLs which should not be followed.
# Nested Arrays of patterns are flattened and nil entries are ignored.
# Returns self so calls can be chained.
def skip_links_like(*patterns)
  usable = patterns.flatten.compact
  @skip_link_patterns.concat(usable)
  self
end