Class: Spidr::Agent

Inherits:
Object
  • Object
show all
Includes:
Settings::UserAgent
Defined in:
lib/spidr/agent.rb,
lib/spidr/agent/events.rb,
lib/spidr/agent/robots.rb,
lib/spidr/agent/actions.rb,
lib/spidr/agent/filters.rb,
lib/spidr/agent/sanitizers.rb

Defined Under Namespace

Modules: Actions

Instance Attribute Summary collapse

Attributes included from Settings::UserAgent

#user_agent

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) {|agent| ... } ⇒ Agent

Creates a new Agent object.

Options Hash (options):

  • :open_timeout (Integer) — default: Spidr.open_timeout

    Optional open timeout.

  • :read_timeout (Integer) — default: Spidr.read_timeout

    Optional read timeout.

  • :ssl_timeout (Integer) — default: Spidr.ssl_timeout

    Optional ssl timeout.

  • :continue_timeout (Integer) — default: Spidr.continue_timeout

    Optional continue timeout.

  • :keep_alive_timeout (Integer) — default: Spidr.keep_alive_timeout

    Optional keep_alive timeout.

  • :proxy (Hash) — default: Spidr.proxy

    The proxy information to use.

  • :default_headers (Hash{String => String})

    Default headers to set for every request.

  • :host_header (String)

    The HTTP Host header to use with each request.

  • :host_headers (Hash{String,Regexp => String})

    The HTTP Host headers to use for specific hosts.

  • :user_agent (String) — default: Spidr.user_agent

    The User-Agent string to send with each request.

  • :referer (String)

    The Referer URL to send with each request.

  • :delay (Integer) — default: 0

    The number of seconds to pause between each request.

  • :queue (Set, Array)

    The initial queue of URLs to visit.

  • :history (Set, Array)

    The initial list of visited URLs.

  • :limit (Integer)

    The maximum number of pages to visit.

  • :max_depth (Integer)

    The maximum link depth to follow.

  • :robots (Boolean) — default: Spidr.robots?

    Specifies whether robots.txt should be honored.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent for further configuration.

Yield Parameters:

  • agent (Agent)

    The newly created agent.

See Also:


177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# File 'lib/spidr/agent.rb', line 177

# Creates a new Agent object.
#
# @param options [Hash] agent settings. The keys read directly here are
#   :host_header, :host_headers, :default_headers, :user_agent, :referer,
#   :delay, :limit, :max_depth, :queue, :history and :robots. The whole hash
#   is also passed to SessionCache.new and to the initialize_* helpers.
# @yield [agent] if a block is given it receives the new agent, after all
#   other setup in this method has run
def initialize(options={})
  @host_header  = options[:host_header]
  @host_headers = {}
  @host_headers.merge!(options[:host_headers]) if options[:host_headers]

  @default_headers = {}
  @default_headers.merge!(options[:default_headers]) if options[:default_headers]

  @user_agent = options.fetch(:user_agent,Spidr.user_agent)
  @referer    = options[:referer]

  @sessions   = SessionCache.new(options)
  @cookies    = CookieJar.new
  @authorized = AuthStore.new

  @running  = false
  @delay    = options.fetch(:delay,0)
  @history  = Set.new
  @failures = Set.new
  @queue    = []

  @limit     = options[:limit]
  @levels    = Hash.new(0)
  @max_depth = options[:max_depth]

  # caller-supplied collections are assigned through the queue=/history= writers
  self.queue   = options[:queue]   if options[:queue]
  self.history = options[:history] if options[:history]

  initialize_sanitizers(options)
  initialize_filters(options)
  initialize_actions(options)
  initialize_events(options)

  # robots.txt support is only set up when requested (default: Spidr.robots?)
  initialize_robots if options.fetch(:robots,Spidr.robots?)

  yield self if block_given?
end

Instance Attribute Details

#authorized ⇒ AuthStore

HTTP Authentication credentials


42
43
44
# File 'lib/spidr/agent.rb', line 42

# HTTP authentication credentials.
#
# @return [AuthStore] the value held in @authorized
def authorized
  @authorized
end

#cookies ⇒ CookieJar (readonly)

Cached cookies


79
80
81
# File 'lib/spidr/agent.rb', line 79

# Cached cookies.
#
# @return [CookieJar] the value held in @cookies
def cookies
  @cookies
end

#default_headers ⇒ Hash{String => String} (readonly)

HTTP Headers to use for every request

Since:

  • 0.6.0


37
38
39
# File 'lib/spidr/agent.rb', line 37

# HTTP headers to use for every request.
#
# @return [Hash{String => String}] the value held in @default_headers
def default_headers
  @default_headers
end

#delay ⇒ Integer

Delay in between fetching pages


52
53
54
# File 'lib/spidr/agent.rb', line 52

# Delay, in seconds, between fetching pages.
#
# @return [Integer] the value held in @delay
def delay
  @delay
end

#failures ⇒ Set<URI::HTTP>

List of unreachable URLs


62
63
64
# File 'lib/spidr/agent.rb', line 62

# List of unreachable URLs.
#
# @return [Set<URI::HTTP>] the value held in @failures
def failures
  @failures
end

#history ⇒ Set<URI::HTTP> Also known as: visited_urls

History containing visited URLs


57
58
59
# File 'lib/spidr/agent.rb', line 57

# History containing the visited URLs.
#
# @return [Set<URI::HTTP>] the value held in @history
def history
  @history
end

#host_header ⇒ String

HTTP Host Header to use


25
26
27
# File 'lib/spidr/agent.rb', line 25

# HTTP Host header to use.
#
# @return [String, nil] the value held in @host_header (nil when the
#   :host_header option was not given)
def host_header
  @host_header
end

#host_headers ⇒ Hash{String,Regexp => String} (readonly)

HTTP Host Headers to use for specific hosts


30
31
32
# File 'lib/spidr/agent.rb', line 30

# HTTP Host headers to use for specific hosts.
#
# @return [Hash{String,Regexp => String}] the value held in @host_headers
def host_headers
  @host_headers
end

#levels ⇒ Hash{URI::HTTP => Integer} (readonly)

The visited URLs and their depth within a site


94
95
96
# File 'lib/spidr/agent.rb', line 94

# The visited URLs and their depth within a site.
#
# @return [Hash{URI::HTTP => Integer}] the value held in @levels
def levels
  @levels
end

#limit ⇒ Integer (readonly)

Maximum number of pages to visit.


84
85
86
# File 'lib/spidr/agent.rb', line 84

# Maximum number of pages to visit.
#
# @return [Integer, nil] the value held in @limit (nil when the :limit
#   option was not given)
def limit
  @limit
end

#max_depth ⇒ Integer (readonly)

Maximum depth


89
90
91
# File 'lib/spidr/agent.rb', line 89

# Maximum link depth to follow.
#
# @return [Integer, nil] the value held in @max_depth (nil when the
#   :max_depth option was not given)
def max_depth
  @max_depth
end

#queue ⇒ Array<URI::HTTP> Also known as: pending_urls

Queue of URLs to visit


67
68
69
# File 'lib/spidr/agent.rb', line 67

# Queue of URLs to visit.
#
# @return [Array<URI::HTTP>] the value held in @queue
def queue
  @queue
end

#referer ⇒ String

Referer to use


47
48
49
# File 'lib/spidr/agent.rb', line 47

# Referer to use.
#
# @return [String, nil] the value held in @referer (nil when the :referer
#   option was not given)
def referer
  @referer
end

#schemes ⇒ Object

List of acceptable URL schemes to follow


7
8
9
# File 'lib/spidr/agent/filters.rb', line 7

# List of acceptable URL schemes to follow.
#
# @return the value held in @schemes
def schemes
  @schemes
end

#sessions ⇒ SessionCache (readonly)

The session cache

Since:

  • 0.6.0


74
75
76
# File 'lib/spidr/agent.rb', line 74

# The session cache.
#
# @return [SessionCache] the value held in @sessions
def sessions
  @sessions
end

#strip_fragments ⇒ Object

Specifies whether the Agent will strip URI fragments


7
8
9
# File 'lib/spidr/agent/sanitizers.rb', line 7

# Specifies whether the Agent will strip URI fragments.
#
# @return the value held in @strip_fragments
def strip_fragments
  @strip_fragments
end

#strip_query ⇒ Object

Specifies whether the Agent will strip URI queries


10
11
12
# File 'lib/spidr/agent/sanitizers.rb', line 10

# Specifies whether the Agent will strip URI queries.
#
# @return the value held in @strip_query
def strip_query
  @strip_query
end

Class Method Details

.host(name, options = {}) {|agent| ... } ⇒ Object

Creates a new agent and spiders the given host.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • agent (Agent)

    The newly created agent.

See Also:


295
296
297
298
# File 'lib/spidr/agent.rb', line 295

# Creates a new agent restricted to the given host and starts spidering at
# the root path of that host over HTTP.
#
# @param name [String] the host-name to spider
# @param options [Hash] additional options for the new agent; :host is set
#   to +name+
# @yield [agent] forwarded to the constructor
# @return the result of calling #start_at on the new agent
def self.host(name,options={},&block)
  spider = new(options.merge(host: name),&block)
  root   = URI::HTTP.build(host: name, path: '/')

  spider.start_at(root)
end

.site(url, options = {}) {|agent| ... } ⇒ Object

Creates a new agent and spiders the web-site located at the given URL.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • agent (Agent)

    The newly created agent.

See Also:


270
271
272
273
274
275
# File 'lib/spidr/agent.rb', line 270

# Creates a new agent restricted to the host of the given URL and starts
# spidering at that URL.
#
# @param url [URI::HTTP, String] the URL of the web-site; coerced with URI()
# @param options [Hash] additional options for the new agent; :host is set
#   to the host of +url+
# @yield [agent] forwarded to the constructor
# @return the result of calling #start_at on the new agent
def self.site(url,options={},&block)
  url = URI(url)
  site_options = options.merge(host: url.host)

  new(site_options,&block).start_at(url)
end

.start_at(url, options = {}) {|agent| ... } ⇒ Object

Creates a new agent and begins spidering at the given URL.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • agent (Agent)

    The newly created agent.

See Also:


247
248
249
250
# File 'lib/spidr/agent.rb', line 247

# Creates a new agent and begins spidering at the given URL.
#
# @param url the URL to start spidering at; passed unchanged to #start_at
# @param options [Hash] options for the new agent
# @yield [agent] forwarded to the constructor
# @return the result of calling #start_at on the new agent
def self.start_at(url,options={},&block)
  new(options,&block).start_at(url)
end

Instance Method Details

#all_headers {|headers| ... } ⇒ Object

Pass the headers from every response the agent receives to a given block.

Yields:

  • (headers)

    The block will be passed the headers of every response.

Yield Parameters:

  • headers (Hash)

    The headers from a response.


68
69
70
# File 'lib/spidr/agent/events.rb', line 68

# Pass the headers from every response the agent receives to a given block.
#
# When no block is given the registered callback does nothing, matching the
# other every_* filters; previously the bare yield raised LocalJumpError
# once a page was visited.
#
# @yieldparam headers the value of +page.headers+ for each visited page
# @return the result of #every_page
def all_headers
  every_page { |page| yield page.headers if block_given? }
end

#clear ⇒ Object

Clears the history of the agent.


334
335
336
337
338
339
# File 'lib/spidr/agent.rb', line 334

# Empties the agent's queue, history and failures, in that order.
#
# @return [Agent] self
def clear
  [@queue, @history, @failures].each(&:clear)
  self
end

#continue! {|page| ... } ⇒ Object

Continue spidering.

Yields:

  • (page)

    If a block is given, it will be passed every page visited.

Yield Parameters:

  • page (Page)

    The page to be visited.


40
41
42
43
# File 'lib/spidr/agent/actions.rb', line 40

# Continue spidering: clears the paused flag and runs the agent again.
#
# @yield [page] forwarded to #run
# @return the result of #run
def continue!(&block)
  @paused = false
  return run(&block)
end

#dequeue ⇒ URI::HTTP (protected)

Dequeues a URL that will later be visited.


798
799
800
# File 'lib/spidr/agent.rb', line 798

# Removes and returns the URL at the front of the queue.
#
# @return the first element of @queue, as returned by @queue.shift
def dequeue
  @queue.shift
end

#enqueue(url, level = 0) ⇒ Boolean

Enqueues a given URL for visiting, only if it passes all of the agent's rules for visiting a given URL.


534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
# File 'lib/spidr/agent.rb', line 534

# Enqueues a given URL for visiting, provided it is not already queued and
# passes the agent's rules for visiting a URL.
#
# Before the URL is queued, every `every_url` callback is called with it,
# followed by every `every_url_like` callback whose pattern matches.
#
# @param url the URL to enqueue; sanitized with #sanitize_url first
# @param level [Integer] the depth recorded for the URL in @levels
# @return [Boolean] true if the URL was queued, false otherwise
# @raise [Actions::Paused] re-raised when a callback pauses the agent
def enqueue(url,level=0)
  url = sanitize_url(url)

  return false if queued?(url) || !visit?(url)

  link = url.to_s

  begin
    @every_url_blocks.each { |callback| callback.call(url) }

    @every_url_like_blocks.each do |pattern,callbacks|
      matched = if pattern.kind_of?(Regexp)
                  link =~ pattern
                else
                  (pattern == link) || (pattern == url)
                end

      next unless matched

      callbacks.each { |callback| callback.call(url) }
    end
  rescue Actions::Paused => action
    raise(action)
  rescue Actions::SkipLink
    return false
  rescue Actions::Action
    # any other action raised by a callback is ignored; the URL is still queued
  end

  @queue << url
  @levels[url] = level
  true
end

#every_atom_doc {|doc| ... } ⇒ Object

Pass every Atom document that the agent parses to a given block.

Yields:

  • (doc)

    The block will be passed every Atom document parsed.

Yield Parameters:

  • doc (Nokogiri::XML::Document)

    A parsed XML document.

See Also:


387
388
389
390
391
392
393
394
395
# File 'lib/spidr/agent/events.rb', line 387

# Pass the parsed document of every visited Atom page to a given block.
# Pages whose +doc+ is nil or false are skipped.
#
# @yieldparam doc the value of +page.doc+
# @return the result of #every_page
def every_atom_doc
  every_page do |visited|
    next unless block_given? && visited.atom?

    parsed = visited.doc
    yield parsed if parsed
  end
end

#every_atom_page {|feed| ... } ⇒ Object

Pass every Atom feed that the agent visits to a given block.

Yields:

  • (feed)

    The block will be passed every Atom feed visited.

Yield Parameters:

  • feed (Page)

    A visited page.


451
452
453
454
455
# File 'lib/spidr/agent/events.rb', line 451

# Pass every visited page for which +atom?+ is truthy to a given block.
#
# @yieldparam feed [Page] a visited Atom feed
# @return the result of #every_page
def every_atom_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.atom?
  end
end

#every_bad_request_page {|page| ... } ⇒ Object

Pass every Bad Request page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every Bad Request page visited.

Yield Parameters:

  • page (Page)

    A visited page.


140
141
142
143
144
# File 'lib/spidr/agent/events.rb', line 140

# Pass every visited page for which +bad_request?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_bad_request_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.bad_request?
  end
end

#every_css_page {|page| ... } ⇒ Object

Pass every CSS page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every CSS page visited.

Yield Parameters:

  • page (Page)

    A visited page.


421
422
423
424
425
# File 'lib/spidr/agent/events.rb', line 421

# Pass every visited page for which +css?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_css_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.css?
  end
end

#every_doc {|doc| ... } ⇒ Object

Pass every HTML or XML document that the agent parses to a given block.

Yields:

  • (doc)

    The block will be passed every HTML or XML document parsed.

Yield Parameters:

  • doc (Nokogiri::HTML::Document, Nokogiri::XML::Document)

    A parsed HTML or XML document.

See Also:


281
282
283
284
285
286
287
288
289
# File 'lib/spidr/agent/events.rb', line 281

# Pass the parsed document of every visited page to a given block.
# Pages whose +doc+ is nil or false are skipped.
#
# @yieldparam doc the value of +page.doc+
# @return the result of #every_page
def every_doc
  every_page do |visited|
    next unless block_given?

    parsed = visited.doc
    yield parsed if parsed
  end
end

#every_failed_url {|url| ... } ⇒ Object

Pass each URL that could not be requested to the given block.

Yields:

  • (url)

    The block will be passed every URL that could not be requested.

Yield Parameters:

  • url (URI::HTTP)

    A failed URL.


26
27
28
29
# File 'lib/spidr/agent/events.rb', line 26

# Pass each URL that could not be requested to the given block.
#
# A call without a block registers nothing: storing nil would make #failed
# raise NoMethodError when it calls every registered callback.
#
# @yieldparam url [URI::HTTP] a failed URL
# @return [Agent] self
def every_failed_url(&block)
  @every_failed_url_blocks << block if block
  self
end

#every_forbidden_page {|page| ... } ⇒ Object

Pass every Forbidden page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every Forbidden page visited.

Yield Parameters:

  • page (Page)

    A visited page.


170
171
172
173
174
# File 'lib/spidr/agent/events.rb', line 170

# Pass every visited page for which +forbidden?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_forbidden_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.forbidden?
  end
end

#every_html_doc {|doc| ... } ⇒ Object

Pass every HTML document that the agent parses to a given block.

Yields:

  • (doc)

    The block will be passed every HTML document parsed.

Yield Parameters:

  • doc (Nokogiri::HTML::Document)

    A parsed HTML document.

See Also:


302
303
304
305
306
307
308
309
310
# File 'lib/spidr/agent/events.rb', line 302

# Pass the parsed document of every visited HTML page to a given block.
# Pages whose +doc+ is nil or false are skipped.
#
# @yieldparam doc the value of +page.doc+
# @return the result of #every_page
def every_html_doc
  every_page do |visited|
    next unless block_given? && visited.html?

    parsed = visited.doc
    yield parsed if parsed
  end
end

#every_html_page {|page| ... } ⇒ Object

Pass every HTML page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every HTML page visited.

Yield Parameters:

  • page (Page)

    A visited page.


231
232
233
234
235
# File 'lib/spidr/agent/events.rb', line 231

# Pass every visited page for which +html?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_html_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.html?
  end
end

#every_internal_server_error_page {|page| ... } ⇒ Object

Pass every Internal Server Error page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every Internal Server Error page visited.

Yield Parameters:

  • page (Page)

    A visited page.


201
202
203
204
205
# File 'lib/spidr/agent/events.rb', line 201

# Pass every visited page for which +had_internal_server_error?+ is truthy
# to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_internal_server_error_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.had_internal_server_error?
  end
end

#every_javascript_page {|page| ... } ⇒ Object

Pass every JavaScript page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every JavaScript page visited.

Yield Parameters:

  • page (Page)

    A visited page.


406
407
408
409
410
# File 'lib/spidr/agent/events.rb', line 406

# Pass every visited page for which +javascript?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_javascript_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.javascript?
  end
end

#every_link {|origin, dest| ... } ⇒ Object

Passes every origin and destination URI of each link to a given block.

Yields:

  • (origin, dest)

    The block will be passed every origin and destination URI of each link.

Yield Parameters:

  • origin (URI::HTTP)

    The URI that a link originated from.

  • dest (URI::HTTP)

    The destination URI of a link.


516
517
518
519
# File 'lib/spidr/agent/events.rb', line 516

# Passes every origin and destination URI of each link to a given block.
#
# A call without a block registers nothing, so nil never ends up in the
# callback list (NOTE(review): the code that invokes these callbacks is not
# shown here; presumably it calls each entry, which would fail on nil).
#
# @yieldparam origin [URI::HTTP] the URI that a link originated from
# @yieldparam dest [URI::HTTP] the destination URI of a link
# @return [Agent] self
def every_link(&block)
  @every_link_blocks << block if block
  self
end

#every_missing_page {|page| ... } ⇒ Object

Pass every Missing page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every Missing page visited.

Yield Parameters:

  • page (Page)

    A visited page.


185
186
187
188
189
# File 'lib/spidr/agent/events.rb', line 185

# Pass every visited page for which +missing?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_missing_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.missing?
  end
end

#every_ms_word_page {|page| ... } ⇒ Object

Pass every MS Word page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every MS Word page visited.

Yield Parameters:

  • page (Page)

    A visited page.


466
467
468
469
470
# File 'lib/spidr/agent/events.rb', line 466

# Pass every visited page for which +ms_word?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_ms_word_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.ms_word?
  end
end

#every_ok_page {|page| ... } ⇒ Object

Pass every OK page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every OK page visited.

Yield Parameters:

  • page (Page)

    A visited page.


95
96
97
98
99
# File 'lib/spidr/agent/events.rb', line 95

# Pass every visited page for which +ok?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_ok_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.ok?
  end
end

#every_page {|page| ... } ⇒ Object

Pass every page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every page visited.

Yield Parameters:

  • page (Page)

    A visited page.


81
82
83
84
# File 'lib/spidr/agent/events.rb', line 81

# Pass every page that the agent visits to a given block.
#
# A call without a block registers nothing, so nil never ends up in the
# callback list (NOTE(review): the code that invokes these callbacks is not
# shown here; presumably it calls each entry, which would fail on nil).
#
# @yieldparam page [Page] a visited page
# @return [Agent] self
def every_page(&block)
  @every_page_blocks << block if block
  self
end

#every_pdf_page {|page| ... } ⇒ Object

Pass every PDF page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every PDF page visited.

Yield Parameters:

  • page (Page)

    A visited page.


481
482
483
484
485
# File 'lib/spidr/agent/events.rb', line 481

# Pass every visited page for which +pdf?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_pdf_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.pdf?
  end
end

#every_redirect_page {|page| ... } ⇒ Object

Pass every Redirect page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every Redirect page visited.

Yield Parameters:

  • page (Page)

    A visited page.


110
111
112
113
114
# File 'lib/spidr/agent/events.rb', line 110

# Pass every visited page for which +redirect?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_redirect_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.redirect?
  end
end

#every_rss_doc {|doc| ... } ⇒ Object

Pass every RSS document that the agent parses to a given block.

Yields:

  • (doc)

    The block will be passed every RSS document parsed.

Yield Parameters:

  • doc (Nokogiri::XML::Document)

    A parsed XML document.

See Also:


366
367
368
369
370
371
372
373
374
# File 'lib/spidr/agent/events.rb', line 366

# Pass the parsed document of every visited RSS page to a given block.
# Pages whose +doc+ is nil or false are skipped.
#
# @yieldparam doc the value of +page.doc+
# @return the result of #every_page
def every_rss_doc
  every_page do |visited|
    next unless block_given? && visited.rss?

    parsed = visited.doc
    yield parsed if parsed
  end
end

#every_rss_page {|feed| ... } ⇒ Object

Pass every RSS feed that the agent visits to a given block.

Yields:

  • (feed)

    The block will be passed every RSS feed visited.

Yield Parameters:

  • feed (Page)

    A visited page.


436
437
438
439
440
# File 'lib/spidr/agent/events.rb', line 436

# Pass every visited page for which +rss?+ is truthy to a given block.
#
# @yieldparam feed [Page] a visited RSS feed
# @return the result of #every_page
def every_rss_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.rss?
  end
end

#every_timedout_page {|page| ... } ⇒ Object

Pass every Timeout page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every Timeout page visited.

Yield Parameters:

  • page (Page)

    A visited page.


125
126
127
128
129
# File 'lib/spidr/agent/events.rb', line 125

# Pass every visited page for which +timedout?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_timedout_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.timedout?
  end
end

#every_txt_page {|page| ... } ⇒ Object

Pass every Plain Text page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every Plain Text page visited.

Yield Parameters:

  • page (Page)

    A visited page.


216
217
218
219
220
# File 'lib/spidr/agent/events.rb', line 216

# Pass every visited page for which +txt?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_txt_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.txt?
  end
end

#every_unauthorized_page {|page| ... } ⇒ Object

Pass every Unauthorized page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every Unauthorized page visited.

Yield Parameters:

  • page (Page)

    A visited page.


155
156
157
158
159
# File 'lib/spidr/agent/events.rb', line 155

# Pass every visited page for which +unauthorized?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_unauthorized_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.unauthorized?
  end
end

#every_url {|url| ... } ⇒ Object

Pass each URL from each page visited to the given block.

Yields:

  • (url)

    The block will be passed every URL from every page visited.

Yield Parameters:

  • url (URI::HTTP)

    Each URL from each page visited.


12
13
14
15
# File 'lib/spidr/agent/events.rb', line 12

# Pass each URL from each page visited to the given block.
#
# A call without a block registers nothing: storing nil would make #enqueue
# raise NoMethodError when it calls every registered callback.
#
# @yieldparam url [URI::HTTP] each URL from each page visited
# @return [Agent] self
def every_url(&block)
  @every_url_blocks << block if block
  self
end

#every_url_like(pattern) {|url| ... } ⇒ Object

Pass every URL that the agent visits, and matches a given pattern, to a given block.

Yields:

  • (url)

    The block will be passed every URL that matches the given pattern.

Yield Parameters:

  • url (URI::HTTP)

    A matching URL.

Since:

  • 0.3.2


46
47
48
49
# File 'lib/spidr/agent/events.rb', line 46

# Pass every URL that the agent visits, and matches a given pattern, to a
# given block.
#
# A call without a block registers nothing: storing nil under the pattern
# would make #enqueue raise NoMethodError when a matching URL is enqueued.
#
# @param pattern [Regexp, String, URI] the pattern #enqueue compares URLs to
# @yieldparam url [URI::HTTP] a matching URL
# @return [Agent] self
def every_url_like(pattern,&block)
  @every_url_like_blocks[pattern] << block if block
  self
end

#every_xml_doc {|doc| ... } ⇒ Object

Pass every XML document that the agent parses to a given block.

Yields:

  • (doc)

    The block will be passed every XML document parsed.

Yield Parameters:

  • doc (Nokogiri::XML::Document)

    A parsed XML document.

See Also:


323
324
325
326
327
328
329
330
331
# File 'lib/spidr/agent/events.rb', line 323

# Pass the parsed document of every visited XML page to a given block.
# Pages whose +doc+ is nil or false are skipped.
#
# @yieldparam doc the value of +page.doc+
# @return the result of #every_page
def every_xml_doc
  every_page do |visited|
    next unless block_given? && visited.xml?

    parsed = visited.doc
    yield parsed if parsed
  end
end

#every_xml_page {|page| ... } ⇒ Object

Pass every XML page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every XML page visited.

Yield Parameters:

  • page (Page)

    A visited page.


246
247
248
249
250
# File 'lib/spidr/agent/events.rb', line 246

# Pass every visited page for which +xml?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_xml_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.xml?
  end
end

#every_xsl_doc {|doc| ... } ⇒ Object

Pass every XML Stylesheet (XSL) that the agent parses to a given block.

Yields:

  • (doc)

    The block will be passed every XSL Stylesheet (XSL) parsed.

Yield Parameters:

  • doc (Nokogiri::XML::Document)

    A parsed XML document.

See Also:


345
346
347
348
349
350
351
352
353
# File 'lib/spidr/agent/events.rb', line 345

# Pass the parsed document of every visited XSL page to a given block.
# Pages whose +doc+ is nil or false are skipped.
#
# @yieldparam doc the value of +page.doc+
# @return the result of #every_page
def every_xsl_doc
  every_page do |visited|
    next unless block_given? && visited.xsl?

    parsed = visited.doc
    yield parsed if parsed
  end
end

#every_xsl_page {|page| ... } ⇒ Object

Pass every XML Stylesheet (XSL) page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every XML Stylesheet (XSL) page visited.

Yield Parameters:

  • page (Page)

    A visited page.


262
263
264
265
266
# File 'lib/spidr/agent/events.rb', line 262

# Pass every visited page for which +xsl?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_xsl_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.xsl?
  end
end

#every_zip_page {|page| ... } ⇒ Object

Pass every ZIP page that the agent visits to a given block.

Yields:

  • (page)

    The block will be passed every ZIP page visited.

Yield Parameters:

  • page (Page)

    A visited page.


496
497
498
499
500
# File 'lib/spidr/agent/events.rb', line 496

# Pass every visited page for which +zip?+ is truthy to a given block.
#
# @yieldparam page [Page] a visited page
# @return the result of #every_page
def every_zip_page
  every_page do |visited|
    next unless block_given?

    yield visited if visited.zip?
  end
end

#failed(url) ⇒ Object (protected)

Adds a given URL to the failures list.


839
840
841
842
843
# File 'lib/spidr/agent.rb', line 839

# Records a URL in the failures list, then calls every `every_failed_url`
# callback with it.
#
# @param url the URL that could not be requested
# @return [true]
def failed(url)
  @failures << url

  @every_failed_url_blocks.each do |callback|
    callback.call(url)
  end

  true
end

#failed?(url) ⇒ Boolean

Determines whether a given URL could not be visited.


483
484
485
# File 'lib/spidr/agent.rb', line 483

# Determines whether a given URL could not be visited.
#
# @param url [URI::HTTP, String] the URL to look up; coerced with URI()
# @return [Boolean] whether the failures list includes the URL
def failed?(url)
  target = URI(url)

  @failures.include?(target)
end

#get_page(url) {|page| ... } ⇒ Page?

Requests and creates a new Page object from a given URL.

Yields:

  • (page)

    If a block is given, it will be passed the page that represents the response.

Yield Parameters:

  • page (Page)

    The page for the response.


586
587
588
589
590
591
592
593
594
595
596
597
598
# File 'lib/spidr/agent.rb', line 586

# Requests a URL with GET and wraps the response in a new Page.
#
# @param url [URI::HTTP, String] the URL to request; coerced with URI()
# @yieldparam page [Page] the page for the response
# @return [Page, nil] the new page, or whatever #prepare_request returns
#   when the request block does not complete
def get_page(url)
  url = URI(url)

  prepare_request(url) do |session,path,headers|
    response = session.get(path,headers)
    fetched  = Page.new(url,response)

    # save any new cookies
    @cookies.from_page(fetched)

    yield fetched if block_given?

    # returns from get_page itself, not just from this block
    return fetched
  end
end

#ignore_exts ⇒ Array<String, Regexp, Proc>

Specifies the patterns that match URI path extensions to not visit.


328
329
330
# File 'lib/spidr/agent/filters.rb', line 328

# Specifies the patterns that match URI path extensions to not visit.
#
# @return the reject patterns held by @ext_rules
def ignore_exts
  @ext_rules.reject
end

#ignore_exts_like(pattern = nil) {|ext| ... } ⇒ Object

Adds a given pattern to the #ignore_exts.

Yields:

  • (ext)

    If a block is given, it will be used to filter URI path extensions.

Yield Parameters:

  • ext (String)

    A URI path extension to reject or accept.


344
345
346
347
348
349
350
351
352
# File 'lib/spidr/agent/filters.rb', line 344

# Adds a rule to #ignore_exts. The pattern takes precedence; the block is
# used only when no pattern is given. With neither, nothing is added.
#
# @param pattern the pattern to add
# @yieldparam ext [String] a URI path extension to reject or accept
# @return [Agent] self
def ignore_exts_like(pattern=nil,&block)
  rule = pattern || block
  ignore_exts << rule if rule

  self
end

#ignore_hosts ⇒ Array<String, Regexp, Proc>

Specifies the patterns that match host-names to not visit.


60
61
62
# File 'lib/spidr/agent/filters.rb', line 60

# Specifies the patterns that match host-names to not visit.
#
# @return the reject patterns held by @host_rules
def ignore_hosts
  @host_rules.reject
end

#ignore_hosts_like(pattern = nil) {|host| ... } ⇒ Object

Adds a given pattern to the #ignore_hosts.

Yields:

  • (host)

    If a block is given, it will be used to filter host-names.

Yield Parameters:

  • host (String)

    A host-name to reject or accept.


76
77
78
79
80
81
82
83
84
# File 'lib/spidr/agent/filters.rb', line 76

# Adds a rule to #ignore_hosts. The pattern takes precedence; the block is
# used only when no pattern is given. With neither, nothing is added.
#
# @param pattern the pattern to add
# @yieldparam host [String] a host-name to reject or accept
# @return [Agent] self
def ignore_hosts_like(pattern=nil,&block)
  rule = pattern || block
  ignore_hosts << rule if rule

  self
end

#ignore_links

Specifies the patterns that match links to not visit.


192
193
194
# File 'lib/spidr/agent/filters.rb', line 192

# Specifies the patterns that match links to not visit.
#
# @return the reject patterns held by @link_rules
def ignore_links
  @link_rules.reject
end

#ignore_links_like(pattern = nil) {|link| ... } ⇒ Object

Adds a given pattern to the #ignore_links.

Yields:

  • (link)

    If a block is given, it will be used to filter links.

Yield Parameters:

  • link (String)

    A link to reject or accept.


208
209
210
211
212
213
214
215
216
# File 'lib/spidr/agent/filters.rb', line 208

# Adds a rule to #ignore_links. The pattern takes precedence; the block is
# used only when no pattern is given. With neither, nothing is added.
#
# @param pattern the pattern to add
# @yieldparam link [String] a link to reject or accept
# @return [Agent] self
def ignore_links_like(pattern=nil,&block)
  rule = pattern || block
  ignore_links << rule if rule

  self
end

#ignore_ports ⇒ Array<Integer, Regexp, Proc>

Specifies the patterns that match ports to not visit.


124
125
126
# File 'lib/spidr/agent/filters.rb', line 124

# Specifies the patterns that match ports to not visit.
#
# @return the reject patterns held by @port_rules
def ignore_ports
  @port_rules.reject
end

#ignore_ports_like(pattern = nil) {|port| ... } ⇒ Object

Adds a given pattern to the #ignore_ports.

Yields:

  • (port)

    If a block is given, it will be used to filter ports.

Yield Parameters:

  • port (Integer)

    A port to reject or accept.


140
141
142
143
144
145
146
147
148
# File 'lib/spidr/agent/filters.rb', line 140

# Adds a rule to #ignore_ports. The pattern takes precedence; the block is
# used only when no pattern is given. With neither, nothing is added.
#
# @param pattern the pattern to add
# @yieldparam port [Integer] a port to reject or accept
# @return [Agent] self
def ignore_ports_like(pattern=nil,&block)
  rule = pattern || block
  ignore_ports << rule if rule

  self
end

#ignore_urls ⇒ Array<String, Regexp, Proc>

Specifies the patterns that match URLs to not visit.

Since:

  • 0.2.4


262
263
264
# File 'lib/spidr/agent/filters.rb', line 262

# Specifies the patterns that match URLs to not visit.
#
# @return the reject patterns held by @url_rules
def ignore_urls
  @url_rules.reject
end

#ignore_urls_like(pattern = nil) {|url| ... } ⇒ Object

Adds a given pattern to the #ignore_urls.

Yields:

  • (url)

    If a block is given, it will be used to filter URLs.

Yield Parameters:

  • url (URI::HTTP, URI::HTTPS)

    A URL to reject or accept.

Since:

  • 0.2.4


280
281
282
283
284
285
286
287
288
# File 'lib/spidr/agent/filters.rb', line 280

# Adds a rule to #ignore_urls. The pattern takes precedence; the block is
# used only when no pattern is given. With neither, nothing is added.
#
# @param pattern the pattern to add
# @yieldparam url [URI::HTTP, URI::HTTPS] a URL to reject or accept
# @return [Agent] self
def ignore_urls_like(pattern=nil,&block)
  rule = pattern || block
  ignore_urls << rule if rule

  self
end

#initialize_actions(options = {}) ⇒ Object (protected)


99
100
101
# File 'lib/spidr/agent/actions.rb', line 99

# Initializes the action state: the agent starts out not paused.
#
# @param options [Hash] accepted for symmetry with the other initialize_*
#   helpers; not read here
def initialize_actions(options={})
  @paused = false
end

#initialize_events(options = {}) ⇒ Object (protected)


523
524
525
526
527
528
529
530
# File 'lib/spidr/agent/events.rb', line 523

# Resets every callback registry to empty.
#
# The `every_url_like` registry is a Hash that creates an empty Array for a
# pattern on first access, so callbacks can be appended without a prior check.
#
# @param options [Hash] not read here
def initialize_events(options={})
  @every_url_like_blocks = Hash.new { |registry,pattern| registry[pattern] = [] }

  @every_url_blocks, @every_failed_url_blocks = [], []

  @every_page_blocks = []
  @every_link_blocks = []
end

#initialize_filters(options = {}) ⇒ Object (protected)

Initializes filtering rules.

Options Hash (options):

  • :schemes (Array) — default: ['http', 'https']

    The list of acceptable URI schemes to visit. The https scheme will be ignored if net/https cannot be loaded.

  • :host (String)

    The host-name to visit.

  • :hosts (Array<String, Regexp, Proc>)

    The patterns which match the host-names to visit.

  • :ignore_hosts (Array<String, Regexp, Proc>)

    The patterns which match the host-names to not visit.

  • :ports (Array<Integer, Regexp, Proc>)

    The patterns which match the ports to visit.

  • :ignore_ports (Array<Integer, Regexp, Proc>)

    The patterns which match the ports to not visit.

  • :links (Array<String, Regexp, Proc>)

    The patterns which match the links to visit.

  • :ignore_links (Array<String, Regexp, Proc>)

    The patterns which match the links to not visit.

  • :urls (Array<String, Regexp, Proc>)

    The patterns which match the URLs to visit.

  • :ignore_urls (Array<String, Regexp, Proc>)

    The patterns which match the URLs to not visit.

  • :exts (Array<String, Regexp, Proc>)

    The patterns which match the URI path extensions to visit.

  • :ignore_exts (Array<String, Regexp, Proc>)

    The patterns which match the URI path extensions to not visit.


399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
# File 'lib/spidr/agent/filters.rb', line 399

# Initializes filtering rules.
#
# Schemes come from the :schemes option when given. Otherwise 'http' is
# always accepted and 'https' is added only if 'net/https' can be required.
# One Rules object is built per filter kind (hosts, ports, links, urls, exts)
# from the matching accept/reject option pair, and a :host option is
# registered through #visit_hosts_like.
#
# @param options [Hash] filter options (:schemes, :host, :hosts,
#   :ignore_hosts, :ports, :ignore_ports, :links, :ignore_links, :urls,
#   :ignore_urls, :exts, :ignore_exts)
# @raise [Gem::LoadError] re-raised if requiring 'net/https' fails with it
def initialize_filters(options={})
  @schemes = []

  if (custom_schemes = options[:schemes])
    self.schemes = custom_schemes
  else
    @schemes << 'http'

    begin
      require 'net/https'

      @schemes << 'https'
    rescue Gem::LoadError => error
      # must be rescued before ::LoadError, which would otherwise swallow it
      raise(error)
    rescue ::LoadError
      warn "Warning: cannot load 'net/https', https support disabled"
    end
  end

  @host_rules = Rules.new(accept: options[:hosts], reject: options[:ignore_hosts])
  @port_rules = Rules.new(accept: options[:ports], reject: options[:ignore_ports])
  @link_rules = Rules.new(accept: options[:links], reject: options[:ignore_links])
  @url_rules  = Rules.new(accept: options[:urls],  reject: options[:ignore_urls])
  @ext_rules  = Rules.new(accept: options[:exts],  reject: options[:ignore_exts])

  only_host = options[:host]
  visit_hosts_like(only_host) if only_host
end

#initialize_robots ⇒ Object

Initializes the robots filter.


11
12
13
14
15
16
17
# File 'lib/spidr/agent/robots.rb', line 11

# Initializes the robots filter for the agent's User-Agent string.
#
# @raise [ArgumentError] if the top-level Robots constant is not defined
# @return the new Robots object, also stored in @robots
def initialize_robots
  robots_loaded = Object.const_defined?(:Robots)
  raise(ArgumentError,":robots option given but unable to require 'robots' gem") unless robots_loaded

  @robots = Robots.new(@user_agent)
end

#initialize_sanitizers(options = {}) ⇒ Object (protected)

Initializes the Sanitizer rules.

Options Hash (options):

  • :strip_fragments (Boolean) — default: true

    Specifies whether or not to strip the fragment component from URLs.

  • :strip_query (Boolean) — default: false

    Specifies whether or not to strip the query component from URLs.

Since:

  • 0.2.2


48
49
50
51
# File 'lib/spidr/agent/sanitizers.rb', line 48

# Initializes the Sanitizer rules.
#
# @param options [Hash] :strip_fragments (default true) controls stripping
#   of the URL fragment; :strip_query (default false) controls stripping of
#   the URL query
def initialize_sanitizers(options={})
  @strip_fragments = options.fetch(:strip_fragments) { true }
  @strip_query     = options.fetch(:strip_query) { false }
end

#limit_reached? ⇒ Boolean (protected)

Determines if the maximum limit has been reached.

Since:

  • 0.6.0


809
810
811
# File 'lib/spidr/agent.rb', line 809

# Determines if the maximum limit has been reached.
#
# @return [Boolean, nil] whether the history holds at least @limit entries;
#   the falsy @limit itself (e.g. nil) when no limit is set
def limit_reached?
  @limit && @history.length >= @limit
end

#pause! ⇒ Object

Pauses the agent, causing spidering to temporarily stop.

Raises:

  • (Paused)

    Indicates to the agent, that it should pause spidering.


61
62
63
64
# File 'lib/spidr/agent/actions.rb', line 61

# Pauses the agent, causing spidering to temporarily stop.
#
# Sets the paused flag first, then raises so the caller stops.
#
# @raise [Actions::Paused] always
def pause!
  @paused = true
  raise(Actions::Paused)
end

#pause=(state) ⇒ Object

Sets the pause state of the agent.


51
52
53
# File 'lib/spidr/agent/actions.rb', line 51

# Sets the pause state of the agent.
#
# @param state the new value for @paused; stored as given
def pause=(state)
  @paused = state
end

#paused? ⇒ Boolean

Determines whether the agent is paused.


72
73
74
# File 'lib/spidr/agent/actions.rb', line 72

# Determines whether the agent is paused.
#
# @return [Boolean]
#   `true` only when the stored pause state equals `true`; any other
#   value (including other truthy values) yields `false`.
def paused?
  @paused == true
end

#post_page(url, post_data = '') {|page| ... } ⇒ Page?

Posts supplied form data and creates a new Page object from a given URL.

Yields:

  • (page)

    If a block is given, it will be passed the page that represents the response.

Yield Parameters:

  • page (Page)

    The page for the response.

Since:

  • 0.2.2


621
622
623
624
625
626
627
628
629
630
631
632
633
# File 'lib/spidr/agent.rb', line 621

# Posts form data to a URL and wraps the response in a new Page.
#
# @param [URI::HTTP, String] url
#   The URL to post to.
#
# @param [String] post_data
#   The form data to send.
#
# @yield [page]
#   If a block is given, it will be passed the page for the response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or whatever `prepare_request` returns
#   when it does not invoke the request block.
def post_page(url,post_data='')
  url = URI(url)

  prepare_request(url) do |session,path,headers|
    response = session.post(path,post_data,headers)
    page     = Page.new(url,response)

    # remember any cookies set by the response
    @cookies.from_page(page)

    yield page if block_given?

    # returns from post_page itself, not just from this block
    return page
  end
end

#prepare_request(url) {|request| ... } ⇒ Object (protected)

Normalizes the request path and grabs a session to handle page get and post requests.

Yields:

  • (request)

    A block whose purpose is to make a page request.

Yield Parameters:

  • session (Net::HTTP)

    An HTTP session object.

  • path (String)

    Normalized URL string.

  • headers (Hash)

    A Hash of request header options.

Since:

  • 0.2.2


761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
# File 'lib/spidr/agent.rb', line 761

# Builds the request path and headers for a URL, looks up a session for
# it, and hands all three to the given block.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [session, path, headers]
#   A block whose purpose is to make the page request.
#
# @yieldparam [Net::HTTP] session
#   An HTTP session object.
#
# @yieldparam [String] path
#   The request path ('/' when the URL path is empty), with the query
#   string appended when the URL has one.
#
# @yieldparam [Hash] headers
#   The request headers.
#
# @return [Object, nil]
#   The block's result, or `nil` when one of the rescued network errors
#   occurred.
def prepare_request(url,&block)
  path = url.path.empty? ? '/' : url.path

  # append the URL query to the path
  path = "#{path}?#{url.query}" if url.query

  headers = prepare_request_headers(url)

  begin
    sleep(@delay) if @delay > 0

    yield @sessions[url], path, headers
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         IOError,
         OpenSSL::SSL::SSLError,
         Net::HTTPBadResponse,
         Zlib::Error

    # drop the session for this URL and record the failure
    @sessions.kill!(url)
    failed(url)

    nil
  end
end

#prepare_request_headers(url) ⇒ Hash{String => String} (protected)

Prepares request headers for the given URL.

Since:

  • 0.6.0


712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
# File 'lib/spidr/agent.rb', line 712

# Prepares the request headers for the given URL.
#
# Starts from a copy of the default headers, then adds the Host,
# User-Agent, Referer, Authorization and Cookie headers that apply.
#
# @param [URI::HTTP] url
#   The URL the request is for.
#
# @return [Hash{String => String}]
#   The request headers.
def prepare_request_headers(url)
  # work on a copy so the defaults are never modified
  headers = @default_headers.dup

  # the first host pattern that matches decides the Host header
  host_match = @host_headers.find { |pattern,_| url.host.match(pattern) }
  headers['Host'] = host_match.last if host_match

  # the general Host header only fills in when nothing is set yet
  headers['Host']       = @host_header if @host_header && !headers['Host']
  headers['User-Agent'] = @user_agent if @user_agent
  headers['Referer']    = @referer if @referer

  authorization = @authorized.for_url(url)
  headers['Authorization'] = "Basic #{authorization}" if authorization

  header_cookies = @cookies.for_host(url.host)
  headers['Cookie'] = header_cookies if header_cookies

  headers
end

#proxy ⇒ Proxy

The proxy information the agent uses.

See Also:

Since:

  • 0.2.2


310
311
312
# File 'lib/spidr/agent.rb', line 310

# The proxy information the agent uses.
#
# @return [Proxy]
#   The proxy currently held by the agent's session cache.
def proxy
  @sessions.proxy
end

#proxy=(new_proxy) ⇒ Hash

Sets the proxy information that the agent uses.

See Also:

Since:

  • 0.2.2


327
328
329
# File 'lib/spidr/agent.rb', line 327

# Sets the proxy information that the agent uses.
#
# @param [Hash] new_proxy
#   The new proxy information; it is handed to the agent's session cache.
def proxy=(new_proxy)
  @sessions.proxy = new_proxy
end

#queued?(url) ⇒ Boolean

Determines whether a given URL has been enqueued.


520
521
522
# File 'lib/spidr/agent.rb', line 520

# Determines whether a given URL has been enqueued.
#
# @param url
#   The URL to look for; it is compared against the queue as given,
#   without conversion.
#
# @return [Boolean]
#   Whether the queue includes the URL.
def queued?(url)
  @queue.include?(url)
end

#robot_allowed?(url) ⇒ Boolean

Determines whether a URL is allowed by the robot policy.


28
29
30
31
32
33
34
# File 'lib/spidr/agent/robots.rb', line 28

# Determines whether a URL is allowed by the robot policy.
#
# @param [String] url
#   The URL to check.
#
# @return [Boolean]
#   `true` when no robots filter is set up; otherwise the filter's verdict.
def robot_allowed?(url)
  return true unless @robots

  @robots.allowed?(url)
end

#run {|page| ... } ⇒ Object

Start spidering until the queue becomes empty or the agent is paused.

Yields:

  • (page)

    If a block is given, it will be passed every page visited.

Yield Parameters:

  • page (Page)

    A page which has been visited.


368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
# File 'lib/spidr/agent.rb', line 368

# Start spidering until the queue becomes empty, the agent is paused,
# or the page limit has been reached.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return [Agent]
#   The agent itself.
def run(&block)
  @running = true

  until (@queue.empty? || paused? || limit_reached?)
    begin
      visit_page(dequeue,&block)
    rescue Actions::Paused
      # leave immediately: @running stays true and the sessions are kept
      return self
    rescue Actions::Action
      # any other action only ends the current visit; keep looping
    end
  end

  # reached only when the loop condition ended the run
  @running = false
  @sessions.clear
  return self
end

#running? ⇒ Boolean

Determines if the agent is running.


391
392
393
# File 'lib/spidr/agent.rb', line 391

# Determines if the agent is running.
#
# @return [Boolean]
#   `true` only when the running flag equals `true`; `false` otherwise,
#   including when the flag has never been set.
def running?
  @running == true
end

#sanitize_url(url) ⇒ URI::HTTP, URI::HTTPS

Sanitizes a URL based on filtering options.

Since:

  • 0.2.2


23
24
25
26
27
28
29
30
# File 'lib/spidr/agent/sanitizers.rb', line 23

# Sanitizes a URL based on the agent's filtering options.
#
# The given URL is never modified; when a URI object is passed in, a
# copy is sanitized and returned instead.
#
# @param [URI::HTTP, URI::HTTPS, String] url
#   The URL to be sanitized.
#
# @return [URI::HTTP, URI::HTTPS]
#   The sanitized URL.
def sanitize_url(url)
  sanitized = URI(url)

  # URI() hands back the very same object when it is given a URI, so
  # copy it before stripping components to leave the caller's URL intact
  sanitized = sanitized.dup if sanitized.equal?(url)

  sanitized.fragment = nil if @strip_fragments
  sanitized.query    = nil if @strip_query

  return sanitized
end

#skip_link! ⇒ Object

Causes the agent to skip the link being enqueued.

Raises:

  • (SkipLink)

    Indicates to the agent, that the current link should be skipped, and not enqueued or visited.


83
84
85
# File 'lib/spidr/agent/actions.rb', line 83

# Causes the agent to skip the link being enqueued.
#
# This method never returns normally.
#
# @raise [Actions::SkipLink]
#   Indicates to the agent that the current link should be skipped,
#   and not enqueued or visited.
def skip_link!
  raise(Actions::SkipLink)
end

#skip_page! ⇒ Object

Causes the agent to skip the page being visited.

Raises:

  • (SkipPage)

    Indicates to the agent, that the current page should be skipped.


93
94
95
# File 'lib/spidr/agent/actions.rb', line 93

# Causes the agent to skip the page being visited.
#
# This method never returns normally.
#
# @raise [Actions::SkipPage]
#   Indicates to the agent that the current page should be skipped.
def skip_page!
  raise(Actions::SkipPage)
end

#start_at(url) {|page| ... } ⇒ Object

Start spidering at a given URL.

Yields:

  • (page)

    If a block is given, it will be passed every page visited.

Yield Parameters:

  • page (Page)

    A page which has been visited.


353
354
355
356
# File 'lib/spidr/agent.rb', line 353

# Enqueues the given URL and then starts spidering.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return
#   The result of {#run}.
def start_at(url,&block)
  enqueue(url)

  run(&block)
end

#to_hash ⇒ Hash

Converts the agent into a Hash.


695
696
697
# File 'lib/spidr/agent.rb', line 695

# Converts the agent into a Hash.
#
# @return [Hash]
#   The agent's history under `:history` and its queue under `:queue`.
#   Both are the agent's own objects, not copies.
def to_hash
  {
    history: @history,
    queue:   @queue
  }
end

#urls_like(pattern, &block) ⇒ Object

See Also:


54
55
56
# File 'lib/spidr/agent/events.rb', line 54

# Forwards the pattern and block to `every_url_like` and returns its
# result.
#
# @param pattern
#   The pattern handed on to `every_url_like`.
def urls_like(pattern,&block)
  every_url_like(pattern,&block)
end

#visit?(url) ⇒ Boolean (protected)

Determines if a given URL should be visited.


822
823
824
825
826
827
828
829
830
831
# File 'lib/spidr/agent.rb', line 822

# Determines if a given URL should be visited.
#
# A URL qualifies when it has not been visited yet and passes the
# scheme, host, port, link, URL, extension and robots checks. The
# checks stop at the first one that fails.
#
# @param [URI::HTTP, URI::HTTPS] url
#   The URL in question.
#
# @return [Boolean]
#   Whether the URL should be visited.
def visit?(url)
  return false if visited?(url)

  location_ok = visit_scheme?(url.scheme) &&
                visit_host?(url.host) &&
                visit_port?(url.port)

  location_ok &&
    visit_link?(url.to_s) &&
    visit_url?(url) &&
    visit_ext?(url.path) &&
    robot_allowed?(url.to_s)
end

#visit_ext?(path) ⇒ Boolean (protected)

Determines if a given URI path extension should be visited.


524
525
526
# File 'lib/spidr/agent/filters.rb', line 524

# Determines if a given URI path extension should be visited.
#
# @param [String] path
#   The path that contains the extension.
#
# @return [Boolean]
#   Whether the extension rules accept the path's extension.
def visit_ext?(path)
  # drop the leading dot; a path without an extension yields nil
  extension = File.extname(path)[1..-1]

  @ext_rules.accept?(extension)
end

#visit_exts ⇒ Array<String, Regexp, Proc>

Specifies the patterns that match the URI path extensions to visit.


296
297
298
# File 'lib/spidr/agent/filters.rb', line 296

# Specifies the patterns that match the URI path extensions to visit.
#
# @return [Array<String, Regexp, Proc>]
#   The accept list of the extension rules.
def visit_exts
  @ext_rules.accept
end

#visit_exts_like(pattern = nil) {|ext| ... } ⇒ Object

Adds a given pattern to the #visit_exts.

Yields:

  • (ext)

    If a block is given, it will be used to filter URI path extensions.

Yield Parameters:

  • ext (String)

    A URI path extension to accept or reject.


312
313
314
315
316
317
318
319
320
# File 'lib/spidr/agent/filters.rb', line 312

# Adds a given pattern to the {#visit_exts}.
#
# @param [String, Regexp] pattern
#   The pattern to match URI path extensions with. When given, it takes
#   precedence over the block.
#
# @yield [ext]
#   If a block is given, it will be used to filter URI path extensions.
#
# @yieldparam [String] ext
#   A URI path extension to accept or reject.
#
# @return [Agent]
#   The agent itself.
def visit_exts_like(pattern=nil,&block)
  filter = pattern || block
  visit_exts << filter if filter

  self
end

#visit_host?(host) ⇒ Boolean (protected)

Determines if a given host-name should be visited.


470
471
472
# File 'lib/spidr/agent/filters.rb', line 470

# Determines if a given host-name should be visited.
#
# @param [String] host
#   The host-name.
#
# @return [Boolean]
#   Whether the host rules accept the host-name.
def visit_host?(host)
  @host_rules.accept?(host)
end

#visit_hosts ⇒ Array<String, Regexp, Proc>

Specifies the patterns that match host-names to visit.


28
29
30
# File 'lib/spidr/agent/filters.rb', line 28

# Specifies the patterns that match host-names to visit.
#
# @return [Array<String, Regexp, Proc>]
#   The accept list of the host rules.
def visit_hosts
  @host_rules.accept
end

#visit_hosts_like(pattern = nil) {|host| ... } ⇒ Object

Adds a given pattern to the #visit_hosts.

Yields:

  • (host)

    If a block is given, it will be used to filter host-names.

Yield Parameters:

  • host (String)

    A host-name to accept or reject.


44
45
46
47
48
49
50
51
52
# File 'lib/spidr/agent/filters.rb', line 44

# Adds a given pattern to the {#visit_hosts}.
#
# @param [String, Regexp] pattern
#   The pattern to match host-names with. When given, it takes
#   precedence over the block.
#
# @yield [host]
#   If a block is given, it will be used to filter host-names.
#
# @yieldparam [String] host
#   A host-name to accept or reject.
#
# @return [Agent]
#   The agent itself.
def visit_hosts_like(pattern=nil,&block)
  filter = pattern || block
  visit_hosts << filter if filter

  self
end

#visit_link?(link) ⇒ Boolean (protected)

Determines if a given link should be visited.


496
497
498
# File 'lib/spidr/agent/filters.rb', line 496

# Determines if a given link should be visited.
#
# @param [String] link
#   The link.
#
# @return [Boolean]
#   Whether the link rules accept the link.
def visit_link?(link)
  @link_rules.accept?(link)
end

Specifies the patterns that match the links to visit.

Since:

  • 0.2.4


158
159
160
# File 'lib/spidr/agent/filters.rb', line 158

# Specifies the patterns that match the links to visit.
#
# @return
#   The accept list of the link rules.
#
# @since 0.2.4
def visit_links
  @link_rules.accept
end

Adds a given pattern to the #visit_links

Yields:

  • (link)

    If a block is given, it will be used to filter links.

Yield Parameters:

  • link (String)

    A link to accept or reject.

Since:

  • 0.2.4


176
177
178
179
180
181
182
183
184
# File 'lib/spidr/agent/filters.rb', line 176

# Adds a given pattern to the {#visit_links}.
#
# @param [String, Regexp] pattern
#   The pattern to match links with. When given, it takes precedence
#   over the block.
#
# @yield [link]
#   If a block is given, it will be used to filter links.
#
# @yieldparam [String] link
#   A link to accept or reject.
#
# @return [Agent]
#   The agent itself.
#
# @since 0.2.4
def visit_links_like(pattern=nil,&block)
  filter = pattern || block
  visit_links << filter if filter

  self
end

#visit_page(url) {|page| ... } ⇒ Page?

Visits a given URL, and enqueues the links recovered from the URL to be visited later.

Yields:

  • (page)

    If a block is given, it will be passed the page which was visited.

Yield Parameters:

  • page (Page)

    The page which was visited.


652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
# File 'lib/spidr/agent.rb', line 652

# Visits a given URL and enqueues the URLs found on the resulting page
# so they can be visited later.
#
# @param [URI::HTTP, String] url
#   The URL to visit; it is sanitized before being requested.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @yieldparam [Page] page
#   The page which was visited.
#
# @return [Page, nil]
#   The result of `get_page`, or `nil` when a callback raised
#   `Actions::SkipPage`.
def visit_page(url)
  url = sanitize_url(url)

  get_page(url) do |page|
    # record the page before any callback runs, so it counts as visited
    # even if a callback skips or pauses
    @history << page.url

    begin
      @every_page_blocks.each { |page_block| page_block.call(page) }

      yield page if block_given?
    rescue Actions::Paused => action
      # let the pause propagate to the caller
      raise(action)
    rescue Actions::SkipPage
      # leaves visit_page entirely; none of the page's links are enqueued
      return nil
    rescue Actions::Action
      # any other action is ignored and link processing continues
    end

    page.each_url do |next_url|
      begin
        @every_link_blocks.each do |link_block|
          link_block.call(page.url,next_url)
        end
      rescue Actions::Paused => action
        # let the pause propagate to the caller
        raise(action)
      rescue Actions::SkipLink
        # this link is neither enqueued nor visited
        next
      rescue Actions::Action
        # any other action is ignored and the link is still considered
      end

      # the depth is looked up with the sanitized request URL, not page.url;
      # a nil @max_depth means there is no depth limit
      if (@max_depth.nil? || @max_depth > @levels[url])
        enqueue(next_url,@levels[url] + 1)
      end
    end
  end
end

#visit_port?(port) ⇒ Boolean (protected)

Determines if a given port should be visited.


483
484
485
# File 'lib/spidr/agent/filters.rb', line 483

# Determines if a given port should be visited.
#
# @param [Integer] port
#   The port number.
#
# @return [Boolean]
#   Whether the port rules accept the port.
def visit_port?(port)
  @port_rules.accept?(port)
end

#visit_ports ⇒ Array<Integer, Regexp, Proc>

Specifies the patterns that match the ports to visit.


92
93
94
# File 'lib/spidr/agent/filters.rb', line 92

# Specifies the patterns that match the ports to visit.
#
# @return [Array<Integer, Regexp, Proc>]
#   The accept list of the port rules.
def visit_ports
  @port_rules.accept
end

#visit_ports_like(pattern = nil) {|port| ... } ⇒ Object

Adds a given pattern to the #visit_ports.

Yields:

  • (port)

    If a block is given, it will be used to filter ports.

Yield Parameters:

  • port (Integer)

    A port to accept or reject.


108
109
110
111
112
113
114
115
116
# File 'lib/spidr/agent/filters.rb', line 108

# Adds a given pattern to the {#visit_ports}.
#
# @param [Integer, Regexp] pattern
#   The pattern to match ports with. When given, it takes precedence
#   over the block.
#
# @yield [port]
#   If a block is given, it will be used to filter ports.
#
# @yieldparam [Integer] port
#   A port to accept or reject.
#
# @return [Agent]
#   The agent itself.
def visit_ports_like(pattern=nil,&block)
  filter = pattern || block
  visit_ports << filter if filter

  self
end

#visit_scheme?(scheme) ⇒ Boolean (protected)

Determines if a given URI scheme should be visited.


453
454
455
456
457
458
459
# File 'lib/spidr/agent/filters.rb', line 453

# Determines if a given URI scheme should be visited.
#
# @param [String, nil] scheme
#   The URI scheme.
#
# @return [Boolean]
#   `true` when no scheme is given; otherwise whether the agent's
#   schemes include it.
def visit_scheme?(scheme)
  !scheme || @schemes.include?(scheme)
end

#visit_url?(link) ⇒ Boolean (protected)

Determines if a given URL should be visited.

Since:

  • 0.2.4


511
512
513
# File 'lib/spidr/agent/filters.rb', line 511

# Determines if a given URL should be visited.
#
# @param [URI::HTTP, URI::HTTPS] link
#   The URL.
#
# @return [Boolean]
#   Whether the URL rules accept the URL.
#
# @since 0.2.4
def visit_url?(link)
  @url_rules.accept?(link)
end

#visit_urls ⇒ Array<String, Regexp, Proc>

Specifies the patterns that match the URLs to visit.

Since:

  • 0.2.4


226
227
228
# File 'lib/spidr/agent/filters.rb', line 226

# Specifies the patterns that match the URLs to visit.
#
# @return [Array<String, Regexp, Proc>]
#   The accept list of the URL rules.
#
# @since 0.2.4
def visit_urls
  @url_rules.accept
end

#visit_urls_like(pattern = nil) {|url| ... } ⇒ Object

Adds a given pattern to the #visit_urls

Yields:

  • (url)

    If a block is given, it will be used to filter URLs.

Yield Parameters:

  • url (URI::HTTP, URI::HTTPS)

    A URL to accept or reject.

Since:

  • 0.2.4


244
245
246
247
248
249
250
251
252
# File 'lib/spidr/agent/filters.rb', line 244

# Adds a given pattern to the {#visit_urls}.
#
# @param [String, Regexp] pattern
#   The pattern to match URLs with. When given, it takes precedence
#   over the block.
#
# @yield [url]
#   If a block is given, it will be used to filter URLs.
#
# @yieldparam [URI::HTTP, URI::HTTPS] url
#   A URL to accept or reject.
#
# @return [Agent]
#   The agent itself.
#
# @since 0.2.4
def visit_urls_like(pattern=nil,&block)
  filter = pattern || block
  visit_urls << filter if filter

  self
end

#visited?(url) ⇒ Boolean

Determines whether a URL was visited or not.


448
449
450
# File 'lib/spidr/agent.rb', line 448

# Determines whether a URL was visited or not.
#
# @param [URI::HTTP, String] url
#   The URL to search for; it is converted to a URI before the lookup.
#
# @return [Boolean]
#   Whether the history includes the URL.
def visited?(url)
  uri = URI(url)

  @history.include?(uri)
end

#visited_hosts ⇒ Array<String>

Specifies all hosts that were visited.


435
436
437
# File 'lib/spidr/agent.rb', line 435

# Specifies all hosts that were visited.
#
# @return [Array<String>]
#   The distinct hosts of the visited URLs.
def visited_hosts
  hosts = visited_urls.map { |url| url.host }

  hosts.uniq
end

Specifies the links which have been visited.


425
426
427
# File 'lib/spidr/agent.rb', line 425

# Specifies the links which have been visited.
#
# @return [Array<String>]
#   The String form of every entry in the history.
def visited_links
  @history.map { |url| url.to_s }
end