Class: HTMLHierarchyExtractor
- Inherits:
-
Object
- Object
- HTMLHierarchyExtractor
- Defined in:
- lib/html-hierarchy-extractor.rb
Overview
Extract content from an HTML page in the form of items with associated hierarchy data
Instance Method Summary collapse
- #extract ⇒ Object
-
#extract_anchor(node) ⇒ Object
Returns the anchor to the node.
-
#extract_html(node) ⇒ Object
Returns the outer HTML of a given node.
-
#extract_tag_name(node) ⇒ Object
Returns the tag name of a given node.
-
#extract_text(node) ⇒ Object
Returns the text content of a given node.
-
#heading_weight(heading_level) ⇒ Object
Get a relative numeric value of the importance of the heading: 100 when there is no heading, then 10 less for each heading level (90 for h1, 80 for h2, down to 40 for h6).
-
#initialize(input, options: {}) ⇒ HTMLHierarchyExtractor
constructor
A new instance of HTMLHierarchyExtractor.
-
#uuid(item) ⇒ Object
Generate a unique identifier for the item.
Constructor Details
#initialize(input, options: {}) ⇒ HTMLHierarchyExtractor
Returns a new instance of HTMLHierarchyExtractor.
7 8 9 10 11 12 13 |
# File 'lib/html-hierarchy-extractor.rb', line 7
#
# Parses the HTML input and stores the extraction options.
#
# @param input the HTML to parse; handed unchanged to Nokogiri::HTML
# @param options [Hash] values overriding the defaults (css_selector: 'p')
# @return [HTMLHierarchyExtractor] a new instance of HTMLHierarchyExtractor
def initialize(input, options: {})
  @dom = Nokogiri::HTML(input)
  default_options = {
    css_selector: 'p'
  }
  # Caller-supplied values win over the defaults.
  @options = default_options.merge(options)
end
Instance Method Details
#extract ⇒ Object
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
# File 'lib/html-hierarchy-extractor.rb', line 83
#
# Walks the parsed DOM in document order and builds one item per non-empty
# node matching @options[:css_selector]. Each item carries the headings
# (lvl0..lvl5, for h1..h6) and the closest anchor seen before it.
#
# @return [Array<Hash>] items with the keys :html, :text, :tag_name,
#   :hierarchy, :anchor, :node, :weight and :uuid
def extract
  heading_selector = 'h1,h2,h3,h4,h5,h6'
  # We select all nodes that match either the headings or the elements to
  # extract. This will allow us to loop over it in order it appears in the DOM
  all_selector = "#{heading_selector},#{@options[:css_selector]}"

  items = []
  current_hierarchy = {
    lvl0: nil,
    lvl1: nil,
    lvl2: nil,
    lvl3: nil,
    lvl4: nil,
    lvl5: nil
  }
  current_position = 0 # Position of the DOM node in the tree
  current_lvl = nil # Current closest hierarchy level
  current_anchor = nil # Current closest anchor

  @dom.css(all_selector).each do |node|
    # If it's a heading, we update our current hierarchy
    if node.matches?(heading_selector)
      # Which level heading is it? (h1 => 0 ... h6 => 5)
      current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
      # Update this level, and set all the following ones to nil.
      # The range stops at 5 (h6) so that no extra lvl6 key is ever added
      # to the hierarchy.
      current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
      ((current_lvl + 1)..5).each do |lvl|
        current_hierarchy["lvl#{lvl}".to_sym] = nil
      end
      # Update the anchor, if the new heading has one
      new_anchor = extract_anchor(node)
      current_anchor = new_anchor if new_anchor
    end

    # Stop if node is not to be extracted
    next unless node.matches?(@options[:css_selector])

    # Stop if node is empty
    text = extract_text(node)
    next if text.empty?

    item = {
      html: extract_html(node),
      text: text,
      tag_name: extract_tag_name(node),
      # Snapshot of the hierarchy, so later headings do not alter this item
      hierarchy: current_hierarchy.clone,
      anchor: current_anchor,
      node: node,
      weight: {
        position: current_position,
        heading: heading_weight(current_lvl)
      }
    }
    # The identifier is computed from every other field of the item
    item[:uuid] = uuid(item)
    items << item

    # Only extracted items advance the position counter
    current_position += 1
  end

  items
end
#extract_anchor(node) ⇒ Object
Returns the anchor to the node
eg. <h1 name="anchor">Foo</h1> => anchor; <h1 id="anchor">Foo</h1> => anchor; <h1><a name="anchor">Foo</a></h1> => anchor
45 46 47 48 49 50 51 52 53 54 |
# File 'lib/html-hierarchy-extractor.rb', line 45
#
# Returns the anchor of a node: its own `name`, else its own `id`, else the
# first non-empty `name`/`id` found on an element returned by
# node.css('[name],[id]').
#
# Empty attribute values (eg. id="") are treated as missing, so that a usable
# anchor further down is not hidden by a blank one.
#
# @param node an element responding to #attr and #css
# @return [String, nil] the anchor, or nil when none is found
def extract_anchor(node)
  # `name` takes precedence over `id` on any given element
  read_anchor = lambda do |element|
    %w[name id].map { |attribute| element.attr(attribute) }
               .find { |value| value && !value.empty? }
  end

  own_anchor = read_anchor.call(node)
  return own_anchor if own_anchor

  # No anchor found directly on the node, search the elements below it
  node.css('[name],[id]').each do |child|
    child_anchor = read_anchor.call(child)
    return child_anchor if child_anchor
  end

  nil
end
#extract_html(node) ⇒ Object
Returns the outer HTML of a given node
eg. <p>foo</p> => <p>foo</p>
19 20 21 |
# File 'lib/html-hierarchy-extractor.rb', line 19
#
# Serializes the node with #to_s and trims surrounding whitespace.
#
# @param node any object whose #to_s yields its HTML
# @return [String] the stripped string form of the node
def extract_html(node)
  outer_html = node.to_s
  outer_html.strip
end
#extract_tag_name(node) ⇒ Object
Returns the tag name of a given node
eg. <p>foo</p> => p
35 36 37 |
# File 'lib/html-hierarchy-extractor.rb', line 35
#
# Gives the lowercased tag name of a node.
#
# @param node an object responding to #name
# @return [String] node.name in lowercase
def extract_tag_name(node)
  tag_name = node.name
  tag_name.downcase
end
#extract_text(node) ⇒ Object
Returns the text content of a given node
eg. <p>foo</p> => foo
27 28 29 |
# File 'lib/html-hierarchy-extractor.rb', line 27
#
# Gives the content of a node, exactly as reported by node.content.
#
# @param node an object responding to #content
# @return the value of node.content (eg. <p>foo</p> => foo)
def extract_text(node)
  text = node.content
  text
end
#heading_weight(heading_level) ⇒ Object
Get a relative numeric value of the importance of the heading: 100 when there is no heading, then 10 less for each heading level (90 for h1, 80 for h2, down to 40 for h6)
77 78 79 80 81 |
# File 'lib/html-hierarchy-extractor.rb', line 77
#
# Converts a heading level into a relative importance score.
#
# @param heading_level [Integer, nil] zero-based heading level, or nil when
#   there is no heading
# @return [Integer] 100 for nil, otherwise 100 minus 10 per level
#   (level 0 => 90, level 1 => 80, ...)
def heading_weight(heading_level)
  # Without a heading, the full weight is kept.
  return 100 if heading_level.nil?

  penalty = (heading_level + 1) * 10
  100 - penalty
end
#uuid(item) ⇒ Object
Generate a unique identifier for the item
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/html-hierarchy-extractor.rb', line 58
#
# Builds an MD5 identifier from the content of a hash.
#
# Entries are ordered by key, rendered as "key=value" and joined with commas
# before hashing. A value that is itself a Hash is first replaced by its own
# identifier, computed the same way.
#
# @param item [Hash] the hash to identify; its keys must be sortable
# @return [String] hexadecimal MD5 digest of the serialized hash
def uuid(item)
  serialized = item.sort_by { |key, _value| key }.map do |key, value|
    # Nested hashes are reduced to their own digest
    rendered = value.is_a?(Hash) ? uuid(value) : value
    "#{key}=#{rendered}"
  end

  Digest::MD5.hexdigest(serialized.join(','))
end