Class: RubyCrawl

Inherits:
Object
Includes:
Helpers
Defined in:
lib/rubycrawl.rb,
lib/rubycrawl/errors.rb,
lib/rubycrawl/result.rb,
lib/rubycrawl/helpers.rb,
lib/rubycrawl/railtie.rb,
lib/rubycrawl/version.rb,
lib/rubycrawl/site_crawler.rb,
lib/rubycrawl/service_client.rb,
lib/rubycrawl/url_normalizer.rb,
lib/rubycrawl/markdown_converter.rb

Overview

RubyCrawl provides a simple interface for crawling pages via a local Playwright service.
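A minimal quick-start sketch; it assumes the local Playwright service is installed and reachable at DEFAULT_HOST:DEFAULT_PORT, and uses only the Result accessors (url, html) that also appear in the crawl_site example below:

require 'rubycrawl'

# One-off crawl through the shared client (RubyCrawl.crawl delegates to .client).
result = RubyCrawl.crawl('https://example.com')
result.url  # the crawled URL
result.html # the raw page HTML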

Defined Under Namespace

Modules: Helpers, MarkdownConverter, UrlNormalizer
Classes: ConfigurationError, Error, NavigationError, Railtie, Result, ServiceClient, ServiceError, SiteCrawler, TimeoutError

Constant Summary

DEFAULT_HOST = '127.0.0.1'
DEFAULT_PORT = 3344
VERSION = '0.1.3'

Class Method Summary

  • .client ⇒ Object
  • .configure(**options) ⇒ Object
  • .crawl(url, **options) ⇒ Object
  • .crawl_site(url, **options) {|page| ... } ⇒ Integer

Instance Method Summary

  • #initialize(**options) ⇒ RubyCrawl (constructor)
  • #crawl(url, wait_until: @wait_until, block_resources: @block_resources, retries: @max_retries) ⇒ Object
  • #crawl_site(url, **options, &block) ⇒ Object

Constructor Details

#initialize(**options) ⇒ RubyCrawl



# File 'lib/rubycrawl.rb', line 53

def initialize(**options)
  load_options(options)
  build_service_client
end
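A hedged construction sketch. The option names below are inferred, not confirmed: host/port from the DEFAULT_HOST and DEFAULT_PORT constants, and wait_until/block_resources/max_retries from the instance variables #crawl uses as defaults:

client = RubyCrawl.new(
  host: '127.0.0.1',     # assumed option; mirrors DEFAULT_HOST
  port: 3344,            # assumed option; mirrors DEFAULT_PORT
  wait_until: 'load',    # assumed option; seeds @wait_until
  block_resources: true, # assumed option; seeds @block_resources
  max_retries: 2         # assumed option; seeds @max_retries
)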

Class Method Details

.client ⇒ Object



# File 'lib/rubycrawl.rb', line 21

def client
  @client ||= new
end
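Because the shared client is memoized with @client ||= new, repeated calls return the same instance:

RubyCrawl.client.equal?(RubyCrawl.client) # => true, same memoized instance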

.configure(**options) ⇒ Object



# File 'lib/rubycrawl.rb', line 48

def configure(**options)
  @client = new(**options)
end
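configure builds a new client from the given options and installs it as the shared one, so later class-level calls go through it. A sketch, reusing the assumed option names from #initialize above:

RubyCrawl.configure(port: 4000)        # 'port:' is an assumed option name
RubyCrawl.crawl('https://example.com') # routed through the reconfigured client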

.crawl(url, **options) ⇒ Object



# File 'lib/rubycrawl.rb', line 25

def crawl(url, **options)
  client.crawl(url, **options)
end

.crawl_site(url, **options) {|page| ... } ⇒ Integer

Crawl multiple pages starting from a URL, following links. Yields each page result to the block as it is crawled.

Examples:

Save pages to database

RubyCrawl.crawl_site("https://example.com", max_pages: 100) do |page|
  Page.create!(url: page.url, html: page.html, depth: page.depth)
end

Yields:

  • (page): yields each page result as it is crawled

Yield Parameters:

  • page (Result): the result object for each crawled page

# File 'lib/rubycrawl.rb', line 44

def crawl_site(url, ...)
  client.crawl_site(url, ...)
end

Instance Method Details

#crawl(url, wait_until: @wait_until, block_resources: @block_resources, retries: @max_retries) ⇒ Object



# File 'lib/rubycrawl.rb', line 58

def crawl(url, wait_until: @wait_until, block_resources: @block_resources, retries: @max_retries)
  validate_url!(url)
  @service_client.ensure_running
  with_retries(retries) do
    payload = build_payload(url, wait_until, block_resources)
    response = @service_client.post_json('/crawl', payload)
    raise_node_error!(response)
    build_result(response)
  end
end
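Each keyword falls back to the value captured at construction time, so options can be overridden per call. A usage sketch; 'networkidle' is a standard Playwright wait_until value and is assumed, not confirmed, to be accepted here:

client = RubyCrawl.new
client.crawl(
  'https://example.com',
  wait_until: 'networkidle', # assumed Playwright-style value
  block_resources: false,    # disable resource blocking for this call
  retries: 0                 # no retries for this call
)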

#crawl_site(url, **options, &block) ⇒ Object

Crawl multiple pages starting from a URL, following links.

See Also:

  • .crawl_site

# File 'lib/rubycrawl.rb', line 71

def crawl_site(url, **options, &block)
  @service_client.ensure_running
  crawler_options = build_crawler_options(options)
  crawler = SiteCrawler.new(self, crawler_options)
  crawler.crawl(url, &block)
end
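A usage sketch mirroring the class-level example above; max_pages: comes from that example, and depth/url are the Result accessors it shows:

client = RubyCrawl.new
client.crawl_site('https://example.com', max_pages: 50) do |page|
  puts "#{page.depth} #{page.url}" # accessors per the class-level example
end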