Class: LoaderRuby::Loaders::Web

Inherits:
Base
  • Object
show all
Includes:
EncodingDetector, HtmlExtractor
Defined in:
lib/loader_ruby/loaders/web.rb

Constant Summary collapse

DEFAULT_MAX_REDIRECTS = 5

Constants included from EncodingDetector

EncodingDetector::BOM_MAP

Constants included from HtmlExtractor

HtmlExtractor::REMOVE_SELECTORS

Instance Method Summary collapse

Instance Method Details

#crawl(start_url, max_pages: 10, max_redirects: DEFAULT_MAX_REDIRECTS) ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/loader_ruby/loaders/web.rb', line 38

# Loads pages beginning at +start_url+ and collects the resulting documents.
#
# URLs are consumed from a FIFO queue seeded with +start_url+, and each
# distinct URL is attempted at most once. Nothing in this method appends
# further URLs to the queue, so as written only +start_url+ is ever loaded.
#
# @param start_url the URL handed to #load first
# @param max_pages [Integer] stop once this many documents have been collected
# @param max_redirects [Integer] forwarded unchanged to #load
# @return [Array] whatever #load returned, in load order; a URL whose load
#   raised a StandardError contributes nothing
def crawl(start_url, max_pages: 10, max_redirects: DEFAULT_MAX_REDIRECTS)
  seen = Set.new
  pending = [start_url]
  results = []

  # `none?` (not `empty?`) so a queue holding only nil/false ends the loop.
  until pending.none? || results.size >= max_pages
    current = pending.shift
    # Set#add? returns nil when the URL was already recorded.
    next unless seen.add?(current)

    begin
      results << load(current, max_redirects: max_redirects)
    rescue StandardError
      # Best effort: a page that fails to load is dropped without reporting.
      next
    end
  end

  results
end

#load(url, max_redirects: DEFAULT_MAX_REDIRECTS, **opts) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/loader_ruby/loaders/web.rb', line 15

# Fetches +url+, converts the body to UTF-8 when an encoding can be detected,
# parses it as HTML and wraps the extracted title and text in a Document.
#
# @param url the address to fetch; checked by validate_url! before any work
# @param max_redirects [Integer] forwarded to #fetch
# @param opts [Hash] extra keyword arguments; accepted but not referenced
#   anywhere in this method body
# @return [Document] built from the extracted text plus metadata
def load(url, max_redirects: DEFAULT_MAX_REDIRECTS, **opts)
  validate_url!(url)
  require_nokogiri!

  # fetch yields a pair: the response body and its content type.
  html, content_type = fetch(url, max_redirects: max_redirects)

  # Prefer the encoding derived from the content type; only if that yields
  # nothing, inspect a binary (ASCII-8BIT) copy of the body for a BOM.
  detected = detect_encoding_from_content_type(content_type) ||
             detect_encoding_from_bom(html.b)
  # When neither source yields an encoding, the body is parsed as received.
  html = transcode_to_utf8(html, detected) if detected

  doc = parse_html(html)
  title = extract_title(doc)
  content = extract_text(doc)

  # NOTE(review): as shown here, the `metadata:` value below is a bare
  # parenthesized argument list with no method name in front of it, which is
  # not valid Ruby. Presumably a helper call name was lost when this listing
  # was extracted -- confirm against lib/loader_ruby/loaders/web.rb.
  Document.new(
    content: content,
    metadata: (url,
      format: :web,
      title: title
    )
  )
end