Class: HTMLHierarchyExtractor
- Inherits:
-
Object
- Object
- HTMLHierarchyExtractor
- Defined in:
- lib/html-hierarchy-extractor.rb
Overview
Extract content from an HTML page in the form of items with associated hierarchy data
Instance Method Summary collapse
- #extract ⇒ Object
-
#extract_anchor(node) ⇒ Object
Returns the anchor to the node.
-
#extract_html(node) ⇒ Object
Returns the outer HTML of a given node.
-
#extract_tag_name(node) ⇒ Object
Returns the tag name of a given node.
-
#extract_text(node) ⇒ Object
Returns the text content of a given node.
-
#heading_weight(heading_level) ⇒ Object
Get a relative numeric value of the importance of the heading: 100 when there is no heading, then 10 less per heading level (h1 => 90, h2 => 80, ...).
-
#initialize(input, options: {}) ⇒ HTMLHierarchyExtractor
constructor
A new instance of HTMLHierarchyExtractor.
-
#uuid(item) ⇒ Object
Generate a unique identifier for the item.
Constructor Details
#initialize(input, options: {}) ⇒ HTMLHierarchyExtractor
Returns a new instance of HTMLHierarchyExtractor.
7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/html-hierarchy-extractor.rb', line 7

# Builds an extractor around the given HTML input and emits a deprecation
# warning, since the gem has been renamed to algolia_html_extractor.
#
# @param input HTML source, passed as-is to Nokogiri::HTML for parsing
# @param options [Hash] overrides merged on top of the defaults; the only
#   default defined here is :css_selector ('p')
# @return [HTMLHierarchyExtractor] a new instance of HTMLHierarchyExtractor
def initialize(input, options: {})
  @dom = Nokogiri::HTML(input)
  default_options = {
    css_selector: 'p'
  }
  # Caller-supplied values take precedence over the defaults
  @options = default_options.merge(options)

  warn '[DEPRECATION] The gem html-hierarchy-extractor has been renamed '\
       'to algolia_html_extractor and will no longer be supported. '\
       'Please switch to algolia_html_extractor as soon as possible.'
end
Instance Method Details
#extract ⇒ Object
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# File 'lib/html-hierarchy-extractor.rb', line 87

# Walks the DOM in document order and returns one item per non-empty element
# matching @options[:css_selector], each tagged with the headings (h1..h6)
# encountered before it.
#
# @return [Array<Hash>] one hash per extracted element, with the keys :html,
#   :text, :tag_name, :hierarchy (lvl0..lvl5), :anchor, :node,
#   :weight ({ position:, heading: }) and :uuid
def extract
  heading_selector = 'h1,h2,h3,h4,h5,h6'
  # We select all nodes that match either the headings or the elements to
  # extract. This will allow us to loop over it in order it appears in the DOM
  all_selector = "#{heading_selector},#{@options[:css_selector]}"

  items = []
  current_hierarchy = {
    lvl0: nil,
    lvl1: nil,
    lvl2: nil,
    lvl3: nil,
    lvl4: nil,
    lvl5: nil
  }
  current_position = 0 # Position of the DOM node in the tree
  current_lvl = nil # Current closest hierarchy level
  current_anchor = nil # Current closest anchor

  @dom.css(all_selector).each do |node|
    # If it's a heading, we update our current hierarchy
    if node.matches?(heading_selector)
      # Which level heading is it? (h1 => 0 ... h6 => 5)
      current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
      # Update this level, and set all the following ones to nil.
      # The deepest level is lvl5 (h6); stopping there keeps the hierarchy
      # limited to the six keys it was initialised with.
      current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
      (current_lvl + 1..5).each do |lvl|
        current_hierarchy["lvl#{lvl}".to_sym] = nil
      end
      # Update the anchor, if the new heading has one
      new_anchor = extract_anchor(node)
      current_anchor = new_anchor if new_anchor
    end

    # Stop if node is not to be extracted
    next unless node.matches?(@options[:css_selector])

    # Stop if node is empty
    text = extract_text(node)
    next if text.empty?

    item = {
      html: extract_html(node),
      text: text,
      tag_name: extract_tag_name(node),
      # Snapshot: the running hierarchy keeps being mutated by later headings
      hierarchy: current_hierarchy.clone,
      anchor: current_anchor,
      node: node,
      weight: {
        position: current_position,
        heading: heading_weight(current_lvl)
      }
    }
    # The identifier is computed before :uuid is added, so it only depends
    # on the other keys
    item[:uuid] = uuid(item)
    items << item

    current_position += 1
  end

  items
end
#extract_anchor(node) ⇒ Object
Returns the anchor to the node
eg. <h1 name="anchor">Foo</h1> => anchor; <h1 id="anchor">Foo</h1> => anchor; <h1><a name="anchor">Foo</a></h1> => anchor
49 50 51 52 53 54 55 56 57 58 |
# File 'lib/html-hierarchy-extractor.rb', line 49

# Returns the anchor of a node: its own `name` attribute, otherwise its own
# `id` attribute, otherwise the anchor of the first element returned by
# node.css('[name],[id]').
#
# @param node an object responding to #attr and #css
# @return [String, nil] the anchor, or nil when none is found
def extract_anchor(node)
  anchor = node.attr('name') || node.attr('id')
  return anchor unless anchor.nil?

  # No anchor found directly in the header, search on children
  subelement = node.css('[name],[id]')
  return extract_anchor(subelement[0]) unless subelement.empty?

  nil
end
#extract_html(node) ⇒ Object
Returns the outer HTML of a given node
eg. <p>foo</p> => <p>foo</p>
23 24 25 |
# File 'lib/html-hierarchy-extractor.rb', line 23

# Returns the string form of a node (its outer HTML, per the surrounding
# documentation), with leading and trailing whitespace removed.
#
# @param node an object whose #to_s yields its markup
# @return [String] the stripped markup
def extract_html(node)
  node.to_s.strip
end
#extract_tag_name(node) ⇒ Object
Returns the tag name of a given node
eg <p>foo</p> => p
39 40 41 |
# File 'lib/html-hierarchy-extractor.rb', line 39

# Returns the lowercased tag name of a node.
# eg. <p>foo</p> => p
#
# @param node an object responding to #name
# @return [String] the node name in lowercase
def extract_tag_name(node)
  node.name.downcase
end
#extract_text(node) ⇒ Object
Returns the text content of a given node
eg. <p>foo</p> => foo
31 32 33 |
# File 'lib/html-hierarchy-extractor.rb', line 31

# Returns the content of a node, as reported by node.content.
# eg. <p>foo</p> => foo
#
# @param node an object responding to #content
# @return the value of node.content (a String for the nodes handled by
#   #extract, which calls #empty? on the result)
def extract_text(node)
  node.content
end
#heading_weight(heading_level) ⇒ Object
Get a relative numeric value of the importance of the heading: 100 when there is no heading, then 10 less per heading level (h1 => 90, h2 => 80, ...)
81 82 83 84 85 |
# File 'lib/html-hierarchy-extractor.rb', line 81

# Get a relative numeric value of the importance of a heading level.
#
# @param heading_level [Integer, nil] zero-based heading level, or nil when
#   there is no heading
# @return [Integer] 100 when heading_level is nil, otherwise
#   100 - (heading_level + 1) * 10 (level 0 => 90, level 1 => 80, ...)
def heading_weight(heading_level)
  weight = 100
  return weight if heading_level.nil?

  weight - ((heading_level + 1) * 10)
end
#uuid(item) ⇒ Object
Generate a unique identifier for the item
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/html-hierarchy-extractor.rb', line 62

# Generate an identifier for the item: the MD5 hex digest of its sorted
# "key=value" pairs, so two hashes with the same content get the same
# identifier regardless of key insertion order.
#
# @param item [Hash] the item to identify; nested Hash values are replaced
#   by their own identifier, other values are interpolated with #to_s
# @return [String] MD5 hex digest
def uuid(item)
  # We first get all the keys of the object, sorted alphabetically...
  ordered_keys = item.keys.sort

  # ...then we build a huge array of "key=value" pairs...
  ordered_array = ordered_keys.map do |key|
    value = item[key]
    # We apply the method recursively on other hashes
    value = uuid(value) if value.is_a?(Hash)
    "#{key}=#{value}"
  end

  # ...then we build a unique md5 hash of it
  Digest::MD5.hexdigest(ordered_array.join(','))
end