Class: RubyCrawl::SiteCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/rubycrawl/site_crawler.rb

Overview

BFS crawler that follows links with deduplication.

Defined Under Namespace

Classes: PageResult

Instance Method Summary collapse

Constructor Details

#initialize(client, options = {}) ⇒ SiteCrawler

Returns a new instance of SiteCrawler.



32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/rubycrawl/site_crawler.rb', line 32

# Builds a crawler around an existing client.
#
# @param client [Object] transport used to create/destroy crawl sessions
# @param options [Hash] crawl tuning knobs; unknown keys are ignored
# @option options [Integer] :max_pages (50) page budget for a crawl
# @option options [Integer] :max_depth (3) link-follow depth limit
# @option options [Boolean] :same_host_only (true) restrict to start host
# @option options [Object, nil] :wait_until page-load condition, if any
# @option options [Object, nil] :block_resources resource-blocking rule
# @option options [Integer, nil] :max_attempts per-page retry cap
def initialize(client, options = {})
  @client = client

  # Fold caller-supplied options over the defaults; an explicitly passed
  # value (including nil) wins, exactly as fetch-with-default would behave.
  settings = {
    max_pages: 50,
    max_depth: 3,
    same_host_only: true,
    wait_until: nil,
    block_resources: nil,
    max_attempts: nil
  }.merge(options)

  @max_pages       = settings[:max_pages]
  @max_depth       = settings[:max_depth]
  @same_host_only  = settings[:same_host_only]
  @wait_until      = settings[:wait_until]
  @block_resources = settings[:block_resources]
  @max_attempts    = settings[:max_attempts]

  # Crawl bookkeeping: URLs already seen, the BFS frontier, and the
  # client session handle (created lazily in #crawl).
  @visited = Set.new
  @queue = []
  @session_id = nil
end

Instance Method Details

#crawl(start_url, &block) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/rubycrawl/site_crawler.rb', line 45

# Runs a BFS crawl starting from +start_url+, yielding each page result
# to the required block. A client session is created for the crawl and
# torn down afterwards, even if the crawl raises.
#
# @param start_url [String] the URL to begin crawling from
# @raise [ArgumentError] when no block is supplied
# @raise [ConfigurationError] when +start_url+ cannot be normalized
def crawl(start_url, &block)
  raise ArgumentError, 'Block required for site crawl' unless block_given?

  root = UrlNormalizer.normalize(start_url)
  raise ConfigurationError, "Invalid start URL: #{start_url}" unless root

  @base_url = root
  @session_id = @client.create_session

  # Seed the frontier at depth 0, then drain it page by page.
  enqueue(root, 0)
  process_queue(&block)
ensure
  # Session may be nil if normalization failed before creation.
  @client.destroy_session(@session_id) if @session_id
end