Class: LoaderRuby::Loaders::Web
Constant Summary
collapse
- DEFAULT_MAX_REDIRECTS = 5
EncodingDetector::BOM_MAP
HtmlExtractor::REMOVE_SELECTORS
Instance Method Summary
collapse
Instance Method Details
#crawl(start_url, max_pages: 10, max_redirects: DEFAULT_MAX_REDIRECTS) ⇒ Object
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
# File 'lib/loader_ruby/loaders/web.rb', line 38
# Loads pages starting at +start_url+ and collects the resulting documents.
#
# URLs are taken from a FIFO queue seeded with +start_url+; each URL is
# attempted at most once. A URL whose #load raises a StandardError is
# skipped and does not count towards +max_pages+.
#
# NOTE(review): nothing in this method adds further URLs to the queue, so as
# written at most +start_url+ itself is loaded — confirm whether link
# discovery is meant to happen here.
#
# @param start_url [Object] the first URL handed to #load
# @param max_pages [Integer] stop once this many documents were collected
# @param max_redirects [Integer] forwarded unchanged to #load
# @return [Array] the values returned by the successful #load calls, in order
def crawl(start_url, max_pages: 10, max_redirects: DEFAULT_MAX_REDIRECTS)
  pending = [start_url]
  seen = Set.new
  results = []

  until pending.none? || results.size >= max_pages
    current = pending.shift
    # Set#add? returns nil when the URL has been recorded before.
    next unless seen.add?(current)

    begin
      results << load(current, max_redirects: max_redirects)
    rescue StandardError
      # Best effort: a page that fails to load is simply left out.
    end
  end

  results
end
|
#load(url, max_redirects: DEFAULT_MAX_REDIRECTS, **opts) ⇒ Object
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
# File 'lib/loader_ruby/loaders/web.rb', line 15
# Fetches +url+, converts the body to UTF-8 when an encoding could be
# detected, parses it as HTML and wraps the result in a Document.
#
# @param url [Object] passed to validate_url!, fetch and build_metadata
# @param max_redirects [Integer] forwarded to fetch
# @param opts [Hash] accepted but not referenced anywhere in this body
# @return [Document] built with +content:+ and +metadata:+ (format :web, title)
def load(url, max_redirects: DEFAULT_MAX_REDIRECTS, **opts)
# Presumably raises for unacceptable URLs — helper not visible here; confirm.
validate_url!(url)
require_nokogiri!
# fetch hands back the body together with its content type.
html, content_type = fetch(url, max_redirects: max_redirects)
# Prefer the encoding derived from the content type; otherwise fall back to
# BOM detection on a binary (ASCII-8BIT) copy of the body.
detected = detect_encoding_from_content_type(content_type) ||
detect_encoding_from_bom(html.b)
# Without a detected encoding the body is passed on untouched.
html = transcode_to_utf8(html, detected) if detected
doc = parse_html(html)
# NOTE(review): as written, both assignments below just alias the parsed
# document. This looks like the helper names in front of `(doc)` were lost
# (e.g. a title / content extraction call) — verify against the real source.
title = (doc)
content = (doc)
Document.new(
content: content,
metadata: build_metadata(url,
format: :web,
title: title
)
)
end
|