Class: HTMLHierarchyExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/html-hierarchy-extractor.rb

Overview

Extract content from an HTML page in the form of items with associated hierarchy data

Instance Method Summary collapse

Constructor Details

#initialize(input, options: {}) ⇒ HTMLHierarchyExtractor

Returns a new instance of HTMLHierarchyExtractor.



7
8
9
10
11
12
13
14
15
16
17
# File 'lib/html-hierarchy-extractor.rb', line 7

# Parses the given HTML and stores the extraction options.
#
# @param input [String] HTML source handed to Nokogiri::HTML
# @param options [Hash] overrides for the defaults; the only default
#   defined here is css_selector: 'p'
def initialize(input, options: {})
  @dom = Nokogiri::HTML(input)
  # Caller-supplied values win over the built-in defaults
  @options = { css_selector: 'p' }.merge(options)

  # Printed on every instantiation: this gem has been renamed
  deprecation_notice =
    '[DEPRECATION] The gem html-hierarchy-extractor has been renamed ' \
    'to algolia_html_extractor and will no longer be supported. ' \
    'Please switch to algolia_html_extractor as soon as possible.'
  warn deprecation_notice
end

Instance Method Details

#extract ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/html-hierarchy-extractor.rb', line 87

# Builds the list of extracted items.
#
# Every heading (h1 to h6) and every node matching the configured
# :css_selector is visited in a single pass. Headings update the running
# hierarchy and anchor; matching nodes with non-empty text become items.
#
# Each item is a Hash with the keys :html, :text, :tag_name, :hierarchy,
# :anchor, :node, :weight (itself holding :position and :heading) and :uuid.
#
# @return [Array<Hash>] the extracted items, in iteration order
def extract
  heading_selector = 'h1,h2,h3,h4,h5,h6'
  # We select all nodes that match either the headings or the elements to
  # extract. This will allow us to loop over it in order it appears in the DOM
  all_selector = "#{heading_selector},#{@options[:css_selector]}"

  items = []
  current_hierarchy = {
    lvl0: nil,
    lvl1: nil,
    lvl2: nil,
    lvl3: nil,
    lvl4: nil,
    lvl5: nil
  }
  # Index of the deepest level declared above (lvl5)
  deepest_lvl = current_hierarchy.size - 1
  current_position = 0 # Position of the DOM node in the tree
  current_lvl = nil # Current closest hierarchy level
  current_anchor = nil # Current closest anchor

  @dom.css(all_selector).each do |node|
    # If it's a heading, we update our current hierarchy
    if node.matches?(heading_selector)
      # Which level heading is it? (h1 => 0, ..., h6 => 5)
      current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
      # Update this level, and set all the following ones to nil.
      # Only the declared levels are reset: an upper bound of 6 used to add
      # a stray :lvl6 key, so items found after a heading carried one more
      # hierarchy key than items found before it. Because uuid hashes every
      # hierarchy key, removing the stray key changes the uuid of those items.
      current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
      (current_lvl + 1..deepest_lvl).each do |lvl|
        current_hierarchy["lvl#{lvl}".to_sym] = nil
      end
      # Update the anchor, if the new heading has one; otherwise the
      # previous anchor is kept
      new_anchor = extract_anchor(node)
      current_anchor = new_anchor if new_anchor
    end

    # Stop if node is not to be extracted
    next unless node.matches?(@options[:css_selector])

    # Stop if node is empty
    text = extract_text(node)
    next if text.empty?

    item = {
      html: extract_html(node),
      text: text,
      tag_name: extract_tag_name(node),
      # Snapshot: the running hierarchy keeps being mutated by later headings
      hierarchy: current_hierarchy.clone,
      anchor: current_anchor,
      node: node,
      weight: {
        position: current_position,
        heading: heading_weight(current_lvl)
      }
    }
    # The uuid is computed from all the keys set above, then added to the item
    item[:uuid] = uuid(item)
    items << item

    # Only extracted items advance the position counter
    current_position += 1
  end

  items
end

#extract_anchor(node) ⇒ Object

Returns the anchor to the node

eg. <h1 name="anchor">Foo</h1> => anchor; <h1 id="anchor">Foo</h1> => anchor; <h1><a name="anchor">Foo</a></h1> => anchor



49
50
51
52
53
54
55
56
57
58
# File 'lib/html-hierarchy-extractor.rb', line 49

# Finds the anchor attached to a node: its own `name` attribute, else its
# own `id` attribute, else the anchor of the first descendant that carries
# either attribute.
#
# @param node [#attr, #css] the node to inspect
# @return [Object, nil] the anchor value, or nil when none is found
def extract_anchor(node)
  own_anchor = node.attr('name') || node.attr('id')
  return own_anchor if own_anchor

  # Nothing on the node itself: fall back to the first matching descendant
  candidates = node.css('[name],[id]')
  return nil if candidates.empty?

  extract_anchor(candidates[0])
end

#extract_html(node) ⇒ Object

Returns the outer HTML of a given node

eg. <p>foo</p> => <p>foo</p>



23
24
25
# File 'lib/html-hierarchy-extractor.rb', line 23

# Serializes the node with to_s and trims surrounding whitespace.
#
# @param node [#to_s] the node to serialize
# @return [String] the stripped string form of the node
def extract_html(node)
  markup = node.to_s
  markup.strip
end

#extract_tag_name(node) ⇒ Object

Returns the tag name of a given node

eg. <p>foo</p> => p



39
40
41
# File 'lib/html-hierarchy-extractor.rb', line 39

# Gives the lowercased name of the node.
#
# @param node [#name] the node to inspect
# @return [String] node.name in lowercase
def extract_tag_name(node)
  tag = node.name
  tag.downcase
end

#extract_text(node) ⇒ Object

Returns the text content of a given node

eg. <p>foo</p> => foo



31
32
33
# File 'lib/html-hierarchy-extractor.rb', line 31

# Gives the content of the node, exactly as reported by node.content
# (no trimming is applied here).
#
# @param node [#content] the node to read
# @return [Object] whatever node.content returns
def extract_text(node)
  content = node.content
  content
end

#heading_weight(heading_level) ⇒ Object

Get a relative numeric value of the importance of the heading: 100 when there is no heading, then 10 less for each heading level (h1 => 90, h2 => 80, ..., h6 => 40)



81
82
83
84
85
# File 'lib/html-hierarchy-extractor.rb', line 81

# Converts a heading level into a weight: 100 when no level is given,
# otherwise 100 minus 10 for each level counted from one (level 0 => 90).
#
# @param heading_level [Integer, nil] zero-based heading level, or nil
# @return [Integer] the weight for that level
def heading_weight(heading_level)
  top_weight = 100
  penalty = heading_level.nil? ? 0 : (heading_level + 1) * 10
  top_weight - penalty
end

#uuid(item) ⇒ Object

Generate a unique identifier for the item



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/html-hierarchy-extractor.rb', line 62

# Computes an MD5 hex digest identifying the given hash.
#
# The keys are sorted, each one is rendered as "key=value", the pairs are
# joined with commas and the resulting string is hashed. Hash values are
# replaced by their own digest first, so nesting is handled recursively.
#
# @param item [Hash] the hash to fingerprint
# @return [String] the MD5 hex digest
def uuid(item)
  pairs = item.keys.sort.map do |key|
    raw = item[key]
    # Nested hashes contribute their own digest rather than their inspect form
    rendered = raw.is_a?(Hash) ? uuid(raw) : raw
    "#{key}=#{rendered}"
  end

  Digest::MD5.hexdigest(pairs.join(','))
end