Class: AlgoliaSearchRecordExtractor
- Inherits:
-
Object
- Object
- AlgoliaSearchRecordExtractor
- Defined in:
- lib/record_extractor.rb
Overview
Given an HTML file as input, will return an array of records to index
Instance Attribute Summary collapse
-
#file ⇒ Object
readonly
Returns the value of attribute file.
Instance Method Summary collapse
-
#custom_hook_all(items) ⇒ Object
Hook to modify all records after extracting.
-
#custom_hook_each(item, _node) ⇒ Object
Hook to modify a record after extracting.
- #extract ⇒ Object
-
#html_nodes ⇒ Object
Get the list of all HTML nodes to index.
-
#initialize(file) ⇒ AlgoliaSearchRecordExtractor
constructor
A new instance of AlgoliaSearchRecordExtractor.
-
#metadata ⇒ Object
Returns metadata from the current file.
-
#node_css_selector(node) ⇒ Object
Returns a CSS selector for the node.
-
#node_heading?(node) ⇒ Boolean
Check if node is a heading.
-
#node_heading_parent(node, level = 'h7') ⇒ Object
Get the closest heading parent.
-
#node_hierarchy(node, state = { level: 7 }) ⇒ Object
Get all the parent headings of the specified node. If the node itself is a heading, we include it.
-
#node_raw_html(node) ⇒ Object
Return the raw HTML of the element to index.
-
#node_text(node) ⇒ Object
Return the text of the element, sanitized to be displayed.
-
#slug ⇒ Object
Returns the slug of the document.
-
#tags ⇒ Object
Extract a list of tags.
-
#unique_hierarchy(data) ⇒ Object
Returns a unique string of hierarchy from title to h6, used for distinct.
-
#weight(item, index) ⇒ Object
Returns an object of all weights.
-
#weight_heading_relevance(data) ⇒ Object
The more words are in common between this node and its parent heading, the higher the score.
-
#weight_tag_name(item) ⇒ Object
Returns a weight based on the tag_name.
Constructor Details
#initialize(file) ⇒ AlgoliaSearchRecordExtractor
Returns a new instance of AlgoliaSearchRecordExtractor.
9 10 11 12 13 14 15 16 |
# File 'lib/record_extractor.rb', line 9
#
# Stores the file to index and builds the extractor configuration: the
# `algolia` section of the site configuration merged over the defaults
# (user-supplied keys win).
#
# @param file [Object] the file to extract records from; must respond to
#   `site.config` (presumably a Jekyll page, post or document - confirm)
def initialize(file)
  @file = file
  default_config = {
    'record_css_selector' => 'p'
  }
  # A site without any `algolia` section must fall back to the defaults
  # rather than failing on `merge(nil)`.
  user_config = file.site.config['algolia'] || {}
  @config = default_config.merge(user_config)
end
Instance Attribute Details
#file ⇒ Object (readonly)
Returns the value of attribute file.
7 8 9 |
# File 'lib/record_extractor.rb', line 7
#
# Read-only accessor for the file handed to the constructor.
#
# @return [Object] the current value of `@file`
def file
  @file
end
Instance Method Details
#custom_hook_all(items) ⇒ Object
Hook to modify all records after extracting
24 25 26 |
# File 'lib/record_extractor.rb', line 24
#
# Extension point applied to the complete list of records; `extract` returns
# whatever this method returns. This default implementation hands the list
# back untouched.
#
# @param items [Array] every record extracted from the file
# @return [Array] the same `items` object
def custom_hook_all(items)
  items
end
#custom_hook_each(item, _node) ⇒ Object
Hook to modify a record after extracting
19 20 21 |
# File 'lib/record_extractor.rb', line 19
#
# Extension point applied to each record right after it is built. `extract`
# drops the record when this method returns nil. This default implementation
# hands the record back untouched and ignores the node.
#
# @param item [Hash] the record that was just extracted
# @param _node [Object] the HTML node the record was built from (unused here)
# @return [Hash] the same `item` object
def custom_hook_each(item, _node)
  item
end
#extract ⇒ Object
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 |
# File 'lib/record_extractor.rb', line 190
#
# Builds the list of records to index for the current file.
#
# One record is created per node returned by `html_nodes`, skipping nodes
# whose text is empty. Each record starts as a copy of the file `metadata`,
# is enriched with the node's heading hierarchy, tag name, raw HTML, text,
# CSS selectors and weight, and is then passed through `custom_hook_each`
# (a nil result discards the record). The final list goes through
# `custom_hook_all`.
#
# @return [Object] whatever `custom_hook_all` returns for the record list
def extract
  items = []
  html_nodes.each_with_index do |node, index|
    # Nodes without any text produce no record
    next if node.text.empty?

    # Every record starts from its own copy of the file-level metadata
    item = metadata.clone
    item.merge!(node_hierarchy(node))
    item[:tag_name] = node.name
    item[:raw_html] = node_raw_html(node)
    item[:text] = node_text(node)
    item[:unique_hierarchy] = unique_hierarchy(item)
    item[:css_selector] = node_css_selector(node)
    item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
    # `index` is the position among all selected nodes, skipped ones included
    item[:weight] = weight(item, index)

    # We pass item through the user defined custom hook
    item = custom_hook_each(item, node)
    next if item.nil?

    items << item
  end
  custom_hook_all(items)
end
#html_nodes ⇒ Object
Get the list of all HTML nodes to index
75 76 77 78 |
# File 'lib/record_extractor.rb', line 75
#
# Parses the content of the current file as HTML and selects the nodes to
# index, using the `record_css_selector` entry of the configuration.
#
# @return [Object] the result of Nokogiri's `css` query on the parsed document
def html_nodes
  Nokogiri::HTML(@file.content).css(@config['record_css_selector'])
end
#metadata ⇒ Object
Returns metadata from the current file
29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/record_extractor.rb', line 29
#
# Collects the file-level attributes shared by every record of the file.
#
# The result contains every entry of `@file.data` (keys converted to
# symbols), plus `:type`, `:url`, `:slug`, `:tags` and, when the file
# responds to `date`, `:posted_at` as a Unix timestamp.
#
# @return [Hash{Symbol => Object}] the metadata of the current file
def metadata
  meta = {}
  @file.data.each { |key, value| meta[key.to_sym] = value }

  # Second segment of the class name, lowercased (presumably
  # `Jekyll::Page` => "page" - confirm against the classes actually passed)
  meta[:type] = @file.class.name.split('::')[1].downcase
  meta[:url] = @file.url
  meta[:slug] = slug
  meta[:posted_at] = @file.date.to_time.to_i if @file.respond_to? :date
  meta[:tags] = tags

  meta
end
#node_css_selector(node) ⇒ Object
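Returns a CSS selector for the node: its `#id` when it has one, otherwise its Nokogiri CSS path with the leading `html > body > ` removed. Returns nil when the node is nil. It is called once for the node itself, and once for its closest heading parent.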
146 147 148 149 150 151 152 153 154 |
# File 'lib/record_extractor.rb', line 146
#
# Builds a CSS selector pointing at the given node.
#
# @param node [Object, nil] a node responding to `[]` and `css_path`
# @return [String, nil] `#<id>` when the node has an `id` attribute,
#   otherwise its `css_path` stripped of the `html > body > ` prefix;
#   nil when no node is given
def node_css_selector(node)
  return if node.nil?

  id = node['id']
  # An explicit id wins over the generated path
  id ? "##{id}" : node.css_path.gsub('html > body > ', '')
end
#node_heading?(node) ⇒ Boolean
Check if node is a heading
81 82 83 |
# File 'lib/record_extractor.rb', line 81
#
# Tells whether the node is an HTML heading.
#
# @param node [Object] a node responding to `name`
# @return [Boolean] true when the tag name is one of h1 to h6
def node_heading?(node)
  case node.name
  when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' then true
  else false
  end
end
#node_heading_parent(node, level = 'h7') ⇒ Object
Get the closest heading parent
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/record_extractor.rb', line 86
#
# Finds the closest heading that precedes the node in the document.
#
# The search walks backwards through previous sibling elements; when an
# element has no previous sibling it moves up to its parent, and it gives up
# once that parent is `body`. A heading is accepted only when its tag name
# sorts before `level` (string comparison, so "h2" < "h3").
#
# @param node [Object] node responding to `name`, `previous_element`, `parent`
# @param level [String] upper bound for accepted headings; the default 'h7'
#   accepts any heading, unless the search starts on a heading, in which case
#   only stronger headings are accepted
# @return [Object, nil] the heading node, or nil when none is found
def node_heading_parent(node, level = 'h7')
  current = node
  loop do
    # While no bound is set yet, a heading met on the way becomes the bound
    level = current.name if level == 'h7' && node_heading?(current)

    sibling = current.previous_element
    if sibling
      return sibling if node_heading?(sibling) && sibling.name < level

      current = sibling
    else
      # No previous element, we go up to the parent
      container = current.parent
      # No more parent, then no heading found
      return nil if container.name == 'body'

      current = container
    end
  end
end
#node_hierarchy(node, state = { level: 7 }) ⇒ Object
Get all the parent headings of the specified node. If the node itself is a heading, we include it.
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/record_extractor.rb', line 108
#
# Collects the text of the headings above the node, following
# `node_heading_parent` from heading to heading. The node itself is included
# when it is a heading.
#
# @param node [Object] the node to start from; must respond to `name`
# @param state [Hash] accumulator, filled in place; `:level` holds the
#   strongest heading level recorded so far and is removed before returning
# @return [Hash{Symbol => String}] heading tag (`:h1` .. `:h6`) => heading text
def node_hierarchy(node, state = { level: 7 })
  current = node
  loop do
    tag = current.name
    depth = tag.delete('h').to_i
    # Only record headings stronger than the ones already seen
    if node_heading?(current) && depth < state[:level]
      state[tag.to_sym] = node_text(current)
      state[:level] = depth
    end

    current = node_heading_parent(current)
    # No previous heading, the walk is over
    break unless current
  end

  state.delete(:level)
  state
end
#node_raw_html(node) ⇒ Object
Return the raw HTML of the element to index
129 130 131 |
# File 'lib/record_extractor.rb', line 129
#
# Gives the markup of the element to index, as produced by the node's own
# string conversion.
#
# @param node [Object] the node to serialise
# @return [String] the result of `node.to_s`
def node_raw_html(node)
  node.to_s
end
#node_text(node) ⇒ Object
Return the text of the element, sanitized to be displayed
134 135 136 |
# File 'lib/record_extractor.rb', line 134
#
# Gives the text content of the element, with angle brackets escaped so the
# text can be displayed inside HTML without being read as markup.
#
# @param node [Object] a node responding to `content`
# @return [String] the content with `<` and `>` replaced by `&lt;` and `&gt;`
def node_text(node)
  # The replacement strings must be the entities: replacing a bracket with
  # itself would leave the text unescaped.
  node.content.gsub('<', '&lt;').gsub('>', '&gt;')
end
#slug ⇒ Object
Returns the slug of the document
45 46 47 48 49 50 51 52 53 54 |
# File 'lib/record_extractor.rb', line 45
#
# Determines the slug of the current file, trying in order: the `slug` entry
# of the file data (Jekyll v3), the file's own `slug` method (Jekyll v2),
# and finally the file name without its extension.
#
# @return [Object] the slug (a String when derived from the file name)
def slug
  if @file.data.key?('slug')
    # Jekyll v3 has it in data
    @file.data['slug']
  elsif @file.respond_to?(:slug)
    # Old Jekyll v2 has it at the root
    @file.slug
  else
    # Otherwise, we guess it from the filename
    filename = File.basename(@file.path)
    File.basename(filename, File.extname(filename))
  end
end
#tags ⇒ Object
Extract a list of tags
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/record_extractor.rb', line 57
#
# Extracts the list of tags of the current file.
#
# Jekyll v3 keeps the tags in the file data while v2 exposes them on the file
# itself, so the data is checked first and the `tags` method second.
#
# @return [Array<String>, nil] the tags converted to strings, or nil when the
#   file has none
def tags
  raw_tags =
    if @file.data.key?('tags')
      @file.data['tags']
    elsif @file.respond_to?(:tags)
      @file.tags
    end
  return if raw_tags.nil?

  # Anyway, we force cast it to string as some plugins will extend the tags to
  # full featured objects
  raw_tags.map(&:to_s)
end
#unique_hierarchy(data) ⇒ Object
Returns a unique string of hierarchy from title to h6, used for distinct
139 140 141 142 |
# File 'lib/record_extractor.rb', line 139
#
# Builds a single string describing where a record sits in the document, from
# the title down to h6. Levels with no value are left out.
#
# @param data [Hash] record with optional `:title`, `:h1` .. `:h6` entries
# @return [String] the present levels joined with ' > '
def unique_hierarchy(data)
  levels = %i[title h1 h2 h3 h4 h5 h6]
  present = levels.map { |level| data[level] }.compact
  present.join(' > ')
end
#weight(item, index) ⇒ Object
Returns an object of all weights
182 183 184 185 186 187 188 |
# File 'lib/record_extractor.rb', line 182
#
# Gathers every weight of a record into one hash.
#
# @param item [Hash] the record being built
# @param index [Integer] position of the node among the selected nodes
# @return [Hash] `:tag_name` (from `weight_tag_name`), `:heading_relevance`
#   (from `weight_heading_relevance`) and `:position` (the given index)
def weight(item, index)
  tag_score = weight_tag_name(item)
  heading_score = weight_heading_relevance(item)

  { tag_name: tag_score, heading_relevance: heading_score, position: index }
end
#weight_heading_relevance(data) ⇒ Object
The more words are in common between this node and its parent heading, the higher the score
158 159 160 161 162 163 164 165 166 167 168 169 170 |
# File 'lib/record_extractor.rb', line 158
#
# Scores how much the text of a record has in common with its headings: the
# number of distinct words (case-insensitive) found both in the text and in
# any of the title / h1..h6 entries.
#
# @param data [Hash] record with a `:text` String and optional `:title`,
#   `:h1` .. `:h6` entries
# @return [Integer] count of words shared by the text and the headings
def weight_heading_relevance(data)
  # Words are extracted with `scan` rather than `split(/\W+/)`: splitting a
  # string that starts with a non-word character yields an empty first token,
  # which would then be counted as a word shared by headings and text.
  heading_words = %i[title h1 h2 h3 h4 h5 h6]
                  .select { |key| data.key?(key) }
                  .flat_map { |key| data[key].to_s.scan(/\w+/) }
                  .map(&:downcase)
                  .uniq

  # Intersect words in headings with words in text
  text_words = data[:text].downcase.scan(/\w+/)
  (heading_words & text_words).size
end
#weight_tag_name(item) ⇒ Object
Returns a weight based on the tag_name
173 174 175 176 177 178 179 |
# File 'lib/record_extractor.rb', line 173
#
# Scores a record according to its tag: headings get 100 for h1 down to 50
# for h6, in steps of 10; any other tag gets 0.
#
# @param item [Hash] record with a `:tag_name` entry
# @return [Integer] the tag weight
def weight_tag_name(item)
  rank = %w[h1 h2 h3 h4 h5 h6].index(item[:tag_name])
  # Not a heading, no weight
  return 0 if rank.nil?

  # h1: 100, h2: 90, ..., h6: 50
  100 - rank * 10
end