Class: HTMLHierarchyExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/html-hierarchy-extractor.rb

Overview

Extract content from an HTML page in the form of items with associated hierarchy data

Instance Method Summary collapse

Constructor Details

#initialize(input, options: {}) ⇒ HTMLHierarchyExtractor

Returns a new instance of HTMLHierarchyExtractor.



7
8
9
10
11
12
13
# File 'lib/html-hierarchy-extractor.rb', line 7

# Parse the HTML input and store the extraction options.
#
# input   - the HTML source handed to Nokogiri::HTML
# options - Hash of settings merged over the defaults; the only default is
#           css_selector: 'p'
def initialize(input, options: {})
  @dom = Nokogiri::HTML(input)
  # Caller-supplied values take precedence over the built-in defaults
  @options = { css_selector: 'p' }.merge(options)
end

Instance Method Details

#extract ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/html-hierarchy-extractor.rb', line 83

# Walk the headings and the nodes matching @options[:css_selector] in the
# order the DOM query returns them, and build one item per non-empty matching
# node.
#
# Returns an Array of Hashes with the keys :html, :text, :tag_name,
# :hierarchy (lvl0..lvl5), :anchor, :node, :weight (:position and :heading)
# and :uuid.
def extract
  heading_selector = 'h1,h2,h3,h4,h5,h6'
  # We select all nodes that match either the headings or the elements to
  # extract. This will allow us to loop over it in order it appears in the DOM
  all_selector = "#{heading_selector},#{@options[:css_selector]}"

  items = []
  current_hierarchy = {
    lvl0: nil,
    lvl1: nil,
    lvl2: nil,
    lvl3: nil,
    lvl4: nil,
    lvl5: nil
  }
  # Index of the deepest level stored in the hierarchy (lvl5 for h6)
  deepest_lvl = current_hierarchy.size - 1
  current_position = 0 # Position of the extracted node among the items
  current_lvl = nil # Current closest hierarchy level
  current_anchor = nil # Current closest anchor

  @dom.css(all_selector).each do |node|
    # If it's a heading, we update our current hierarchy
    if node.matches?(heading_selector)
      # Which level heading is it? (h1 => 0, ..., h6 => 5)
      current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
      # Update this level, and set all the deeper ones to nil. The range stops
      # at the deepest existing level so that no extra key is ever added and
      # every item gets a hierarchy with the same six keys.
      current_hierarchy[:"lvl#{current_lvl}"] = extract_text(node)
      (current_lvl + 1..deepest_lvl).each do |lvl|
        current_hierarchy[:"lvl#{lvl}"] = nil
      end
      # Update the anchor, if the new heading has one; otherwise the previous
      # anchor is kept
      new_anchor = extract_anchor(node)
      current_anchor = new_anchor if new_anchor
    end

    # Skip nodes that are not to be extracted
    next unless node.matches?(@options[:css_selector])

    # Skip nodes with no text
    text = extract_text(node)
    next if text.empty?

    item = {
      html: extract_html(node),
      text: text,
      tag_name: extract_tag_name(node),
      # Snapshot: the hierarchy hash keeps being mutated as we iterate
      hierarchy: current_hierarchy.clone,
      anchor: current_anchor,
      node: node,
      weight: {
        position: current_position,
        heading: heading_weight(current_lvl)
      }
    }
    # The uuid is computed from every key set above, so it must come last
    item[:uuid] = uuid(item)
    items << item

    current_position += 1
  end

  items
end

#extract_anchor(node) ⇒ Object

Returns the anchor to the node

eg. <h1 name="anchor">Foo</h1> => anchor; <h1 id="anchor">Foo</h1> => anchor; <h1><a name="anchor">Foo</a></h1> => anchor



45
46
47
48
49
50
51
52
53
54
# File 'lib/html-hierarchy-extractor.rb', line 45

# Returns the anchor of the node: its own `name` attribute, else its own `id`
# attribute, else the anchor of the first descendant carrying a `name` or `id`
# attribute. Returns nil when none is found.
def extract_anchor(node)
  own_anchor = node.attr('name') || node.attr('id')
  return own_anchor if own_anchor

  # Nothing on the node itself, so look at the elements nested inside it
  candidates = node.css('[name],[id]')
  candidates.empty? ? nil : extract_anchor(candidates[0])
end

#extract_html(node) ⇒ Object

Returns the outer HTML of a given node

eg. <p>foo</p> => <p>foo</p>



19
20
21
# File 'lib/html-hierarchy-extractor.rb', line 19

# Returns the node serialized with `to_s`, with leading and trailing
# whitespace removed.
def extract_html(node)
  serialized = node.to_s
  serialized.strip
end

#extract_tag_name(node) ⇒ Object

Returns the tag name of a given node

eg. <p>foo</p> => p



35
36
37
# File 'lib/html-hierarchy-extractor.rb', line 35

# Returns the lowercased name of the node (eg. "p" for a paragraph).
def extract_tag_name(node)
  tag = node.name
  tag.downcase
end

#extract_text(node) ⇒ Object

Returns the text content of a given node

eg. <p>foo</p> => foo



27
28
29
# File 'lib/html-hierarchy-extractor.rb', line 27

# Returns whatever the node reports as its `content` (eg. "foo" for
# <p>foo</p>).
def extract_text(node)
  text = node.content
  text
end

#heading_weight(heading_level) ⇒ Object

Get a relative numeric value of the importance of the heading: 100 when there is no heading, then 10 less for each heading level (level 0 => 90, level 1 => 80, ...)



77
78
79
80
81
# File 'lib/html-hierarchy-extractor.rb', line 77

# Returns a relative weight for a heading level: 100 when heading_level is
# nil, otherwise 100 minus 10 for each level counted from 1 (level 0 => 90,
# level 1 => 80, and so on).
def heading_weight(heading_level)
  top_weight = 100
  if heading_level.nil?
    top_weight
  else
    top_weight - (10 * (heading_level + 1))
  end
end

#uuid(item) ⇒ Object

Generate a unique identifier for the item



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/html-hierarchy-extractor.rb', line 58

# Builds an MD5 hex digest identifying the item from its contents.
#
# Keys are visited in sorted order, each rendered as "key=value" and joined
# with commas. A value that is itself a Hash is first replaced by its own
# digest, computed the same way.
def uuid(item)
  serialized = item.keys.sort.map do |name|
    raw = item[name]
    rendered = raw.is_a?(Hash) ? uuid(raw) : raw
    "#{name}=#{rendered}"
  end.join(',')

  Digest::MD5.hexdigest(serialized)
end