Module: AlgoliaHTMLExtractor

Defined in:
lib/algolia_html_extractor.rb

Overview

Extract content from an HTML page in the form of items with associated headings data

Class Method Summary collapse

Class Method Details

.css(input, selector) ⇒ Object

Getting a list of HTML nodes from an input and a CSS selector



17
18
19
# File 'lib/algolia_html_extractor.rb', line 17

# Parse the input as HTML and return the nodes matching a CSS selector.
#
# @param input [String] HTML source to parse
# @param selector [String] CSS selector to look up
# @return [Object] whatever the parsed document's #css returns for the selector
def self.css(input, selector)
  document = Nokogiri::HTML(input)
  document.css(selector)
end

.default_options(options) ⇒ Object

Extractor options, applying default options when none set



8
9
10
11
12
13
14
# File 'lib/algolia_html_extractor.rb', line 8

# Build the extractor options, filling in defaults for any key the caller
# did not provide. Caller-supplied values win over the defaults.
#
# @param options [Hash] user-supplied options
# @return [Hash] a new hash containing the defaults merged with +options+
def self.default_options(options)
  {
    css_selector: 'p',
    heading_selector: 'h1,h2,h3,h4,h5,h6'
  }.merge(options)
end

.extract_anchor(node) ⇒ Object

Returns the anchor to the node

eg. <h1 name="anchor">Foo</h1> => anchor; <h1 id="anchor">Foo</h1> => anchor; <h1><a name="anchor">Foo</a></h1> => anchor



113
114
115
116
117
118
119
120
121
122
# File 'lib/algolia_html_extractor.rb', line 113

# Find the anchor attached to a node.
#
# The node's own `name` attribute is preferred, then its `id`. When the node
# has neither, the first result of node.css('[name],[id]') is inspected the
# same way.
#
# @param node [#attr, #css] node to inspect
# @return [String, nil] the anchor, or nil when none is found
def self.extract_anchor(node)
  direct = node.attr('name') || node.attr('id')
  return direct if direct

  # Nothing on the node itself: fall back to the first candidate found by css
  candidates = node.css('[name],[id]')
  return nil if candidates.empty?

  extract_anchor(candidates[0])
end

.extract_html(node) ⇒ Object

Returns the outer HTML of a given node

eg. <p>foo</p> => <p>foo</p>



87
88
89
# File 'lib/algolia_html_extractor.rb', line 87

# Return the string form of a node with surrounding whitespace removed.
#
# @param node [#to_s] node to serialize
# @return [String] the stripped result of node.to_s
def self.extract_html(node)
  markup = node.to_s
  markup.strip
end

.extract_tag_name(node) ⇒ Object

Returns the tag name of a given node

eg <p>foo</p> => p



103
104
105
# File 'lib/algolia_html_extractor.rb', line 103

# Return the lowercased name of a node.
#
# @param node [#name] node to inspect
# @return [String] node.name converted to lowercase
def self.extract_tag_name(node)
  tag = node.name
  tag.downcase
end

.extract_text(node) ⇒ Object

Returns the text content of a given node

eg. <p>foo</p> => foo



95
96
97
# File 'lib/algolia_html_extractor.rb', line 95

# Return the content of a node, as reported by the node itself.
#
# @param node [#content] node to read
# @return [Object] the value of node.content
def self.extract_text(node)
  text = node.content
  text
end

.heading_weight(heading_level) ⇒ Object

Get a relative numeric value of the importance of the heading: 100 when there is no heading, then 10 less per heading level (90 for a level-0 heading, 80 for level 1, and so on)



148
149
150
151
152
# File 'lib/algolia_html_extractor.rb', line 148

# Convert a heading level into a relative weight.
#
# Without a heading level the weight is 100; otherwise it is 100 minus 10 for
# each level counted from one (level 0 => 90, level 1 => 80, ...).
#
# @param heading_level [Integer, nil] zero-based heading level, or nil
# @return [Integer] the weight
def self.heading_weight(heading_level)
  return 100 if heading_level.nil?

  penalty = 10 * (heading_level + 1)
  100 - penalty
end

.run(input, options: {}) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/algolia_html_extractor.rb', line 21

# Extract the nodes matching the configured CSS selector from an HTML input,
# each one annotated with the headings that precede it in document order.
#
# @param input [String] HTML source, passed to css for parsing
# @param options [Hash] overrides for :css_selector and :heading_selector,
#   completed by default_options
# @return [Array<Hash>] one item per matching node with non-empty content.
#   Each item has the keys :html, :content, :headings, :anchor, :node,
#   :custom_ranking (with :position and :heading) and :objectID
def self.run(input, options: {})
  options = default_options(options)
  heading_selector = options[:heading_selector]
  css_selector = options[:css_selector]

  items = []
  current_hierarchy = {
    lvl0: nil,
    lvl1: nil,
    lvl2: nil,
    lvl3: nil,
    lvl4: nil,
    lvl5: nil
  }
  current_position = 0 # Index of the next extracted item, in document order
  current_lvl = nil # Current closest headings level
  current_anchor = nil # Current closest anchor

  # We select all nodes that match either the headings or the elements to
  # extract. This will allow us to loop over it in order it appears in the DOM
  css(input, "#{heading_selector},#{css_selector}").each do |node|
    # If it's a heading, we update our current hierarchy
    if node.matches?(heading_selector)
      # Which level heading is it? (h1 => 0, h2 => 1, ...)
      current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
      # Update this level, and set all the following ones to nil
      current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
      # The hierarchy stops at lvl5, so the reset range stops there as well
      # instead of creating an extra lvl6 entry
      (current_lvl + 1..5).each do |lvl|
        current_hierarchy["lvl#{lvl}".to_sym] = nil
      end
      # Update the anchor, if the new heading has one. A heading without an
      # anchor leaves the previous anchor in place.
      new_anchor = extract_anchor(node)
      current_anchor = new_anchor if new_anchor
    end

    # Stop if node is not to be extracted
    next unless node.matches?(css_selector)

    # Stop if node is empty
    content = extract_text(node)
    next if content.empty?

    item = {
      html: extract_html(node),
      content: content,
      headings: current_hierarchy.values.compact,
      anchor: current_anchor,
      node: node,
      custom_ranking: {
        position: current_position,
        heading: heading_weight(current_lvl)
      }
    }
    item[:objectID] = uuid(item)
    items << item

    # Only extracted items advance the position counter
    current_position += 1
  end

  items
end

.uuid(item) ⇒ Object

Generate a unique identifier for the item



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/algolia_html_extractor.rb', line 126

# Generate a deterministic identifier for an item.
#
# The identifier is the MD5 digest of the item's "key=value" pairs, joined
# with commas in ascending key order. Values that are hashes are replaced by
# their own uuid. Any :objectID key is ignored, so an item that already
# carries its identifier hashes to the same value.
#
# @param item [Hash] record to identify; it is left unmodified
# @return [String] hexadecimal MD5 digest
def self.uuid(item)
  pairs = item
          # The objectID is not part of the hash algorithm; it is filtered
          # out of a copy rather than deleted from the caller's hash
          .reject { |key, _value| key == :objectID }
          # Sorting by key makes the digest independent of insertion order
          .sort_by { |key, _value| key }
          .map do |key, value|
            # Nested hashes are reduced to their own digest
            value = uuid(value) if value.is_a?(Hash)
            "#{key}=#{value}"
          end

  Digest::MD5.hexdigest(pairs.join(','))
end