Class: RubyCrawl::SiteCrawler

Inherits: Object
Defined in:
lib/rubycrawl/site_crawler.rb

Overview

BFS crawler that follows links with deduplication.

Defined Under Namespace

Classes: PageResult

Instance Method Summary collapse

Constructor Details

#initialize(client, options = {}) ⇒ SiteCrawler

Returns a new instance of SiteCrawler.



43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/rubycrawl/site_crawler.rb', line 43

# Builds a crawler around +client+ with bounded breadth-first behaviour.
#
# @param client [Object] page-fetching client driven for each request
# @param options [Hash] crawl tuning knobs
# @option options [Integer] :max_pages (50) hard cap on pages visited
# @option options [Integer] :max_depth (3) maximum link depth from the start URL
# @option options [Boolean] :same_host_only (true) restrict followed links to the start host
# @option options [Object, nil] :wait_until (nil) page-load condition (semantics owned by the client — not visible here)
# @option options [Object, nil] :block_resources (nil) resources to skip while loading (presumably; verify against client)
# @option options [Integer, nil] :max_attempts (nil) per-page retry budget, nil for client default
# @option options [Boolean] :respect_robots_txt (false) honour robots.txt rules during the crawl
def initialize(client, options = {})
  @client = client

  # Option name => default; each becomes an ivar of the same name.
  { max_pages: 50,
    max_depth: 3,
    same_host_only: true,
    wait_until: nil,
    block_resources: nil,
    max_attempts: nil,
    respect_robots_txt: false }.each do |key, fallback|
    instance_variable_set("@#{key}", options.fetch(key, fallback))
  end

  # Dedup set and BFS work queue start empty.
  @visited = Set.new
  @queue   = []
end

Instance Method Details

#crawl(start_url, &block) ⇒ Object

Raises:

  • (ArgumentError)


56
57
58
59
60
61
62
63
64
65
66
# File 'lib/rubycrawl/site_crawler.rb', line 56

# Runs a breadth-first crawl from +start_url+, yielding results to the
# required block via the queue processor.
#
# @param start_url [String] URL the crawl begins from
# @raise [ArgumentError] when no block is supplied
# @raise [ConfigurationError] when +start_url+ cannot be normalized
# @return [Object] whatever process_queue returns
def crawl(start_url, &block)
  raise ArgumentError, 'Block required for site crawl' unless block_given?

  start = UrlNormalizer.normalize(start_url)
  raise ConfigurationError, "Invalid start URL: #{start_url}" if start.nil?

  @base_url = start
  # robots.txt is only fetched when the crawler was configured to honour it.
  @robots = if @respect_robots_txt
              RobotsParser.fetch(@base_url)
            else
              nil
            end
  enqueue(start, 0)
  process_queue(&block)
end