Class: LoaderRuby::Loaders::Html

Inherits:
Base
  • Object
show all
Includes:
EncodingDetector, HtmlExtractor
Defined in:
lib/loader_ruby/loaders/html.rb

Constant Summary collapse

EXTENSIONS =
%w[.html .htm].freeze

Constants included from EncodingDetector

EncodingDetector::BOM_MAP

Constants included from HtmlExtractor

HtmlExtractor::REMOVE_SELECTORS

Instance Method Summary collapse

Instance Method Details

#load(path, **opts) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/loader_ruby/loaders/html.rb', line 11

# Loads an HTML file from disk and wraps its extracted text in a Document.
#
# The file is read as raw bytes. If detect_encoding_from_bom reports an
# encoding, that encoding is used; otherwise "UTF-8" is assumed. The bytes
# are then passed through transcode_to_utf8 before being parsed.
#
# The guard helpers (check_file_exists!, check_file_size!,
# require_nokogiri!) run before any file I/O; presumably they raise on
# failure -- confirm against their definitions.
#
# @param path [String] path of the HTML file to load
# @param opts [Hash] accepted for interface compatibility; not read here
# @return [Document] document holding the extracted text as content and
#   metadata built from path, format: :html and the extracted title
def load(path, **opts)
  check_file_exists!(path)
  check_file_size!(path)
  require_nokogiri!

  # Read bytes, not text, so the BOM is still present for detection.
  raw = File.binread(path)
  source_encoding = detect_encoding_from_bom(raw) || "UTF-8"
  html = transcode_to_utf8(raw, source_encoding)

  doc = parse_html(html)
  title = extract_title(doc)
  content = extract_text(doc)

  # NOTE(review): the listing this was taken from had a bare
  # `(path, format: ..., title: ...)` here, which does not parse. The
  # helper name `build_metadata` is assumed -- confirm against Base.
  metadata = build_metadata(path, format: :html, title: title)

  Document.new(content: content, metadata: metadata)
end