Class: AlgoliaSearchRecordExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/record_extractor.rb

Overview

Given an HTML file as input, will return an array of records to index

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file) ⇒ AlgoliaSearchRecordExtractor

Returns a new instance of AlgoliaSearchRecordExtractor.



9
10
11
12
13
14
15
16
# File 'lib/record_extractor.rb', line 9

def initialize(file)
  @file = file
  @config = file.site.config
  default_config = {
    'record_css_selector' => 'p'
  }
  @config = default_config.merge(file.site.config['algolia'])
end

Instance Attribute Details

#file ⇒ Object (readonly)

Returns the value of attribute file.



7
8
9
# File 'lib/record_extractor.rb', line 7

# Reader for the `@file` instance variable.
def file
  @file
end

Instance Method Details

#custom_hook_all(items) ⇒ Object

Hook to modify all records after extracting



24
25
26
# File 'lib/record_extractor.rb', line 24

# Hook for post-processing the complete list of records.
# This default implementation returns the list unchanged.
#
# @param items [Array] the extracted records
# @return [Array] the same `items` object
def custom_hook_all(items)
  items
end

#custom_hook_each(item, _node) ⇒ Object

Hook to modify a record after extracting



19
20
21
# File 'lib/record_extractor.rb', line 19

# Hook for post-processing a single record.
# This default implementation ignores the node and returns the record
# unchanged.
#
# @param item [Object] the record built for the node
# @param _node [Object] the node the record was built from (unused here)
# @return [Object] the same `item` object
def custom_hook_each(item, _node)
  item
end

#extract ⇒ Object



190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/record_extractor.rb', line 190

# Builds the list of records for the file, one per non-empty HTML node.
#
# Each record starts as a copy of the file metadata and is then enriched
# with the node hierarchy, tag name, raw HTML, text, CSS selectors and
# weights. Every record goes through `custom_hook_each` (a nil result drops
# the record) and the final list goes through `custom_hook_all`.
#
# @return [Object] whatever `custom_hook_all` returns for the record list
def extract
  items = []
  html_nodes.each_with_index do |node, index|
    next if node.text.empty?

    # Copy the metadata so that per-node keys do not leak between records
    item = metadata.clone
    item.merge!(node_hierarchy(node))
    item[:tag_name] = node.name
    item[:raw_html] = node_raw_html(node)
    item[:text] = node_text(node)
    item[:unique_hierarchy] = unique_hierarchy(item)
    item[:css_selector] = node_css_selector(node)
    item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
    item[:weight] = weight(item, index)

    # We pass item through the user defined custom hook
    item = custom_hook_each(item, node)
    next if item.nil?

    items << item
  end
  custom_hook_all(items)
end

#html_nodes ⇒ Object

Get the list of all HTML nodes to index



75
76
77
78
# File 'lib/record_extractor.rb', line 75

# Parses the file content as HTML and returns every node matching the
# configured `record_css_selector`.
def html_nodes
  Nokogiri::HTML(@file.content).css(@config['record_css_selector'])
end

#metadata ⇒ Object

Returns metadata from the current file



29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/record_extractor.rb', line 29

def 
   = {}
  @file.data.each { |key, value| [key.to_sym] = value }

  [:type] = @file.class.name.split('::')[1].downcase
  [:url] = @file.url

  [:slug] = slug

  [:posted_at] = @file.date.to_time.to_i if @file.respond_to? :date
  [:tags] = tags

  
end

#node_css_selector(node) ⇒ Object

Returns a CSS selector for the node: its id selector if an id is set, otherwise its Nokogiri CSS path relative to the body. Returns nil for a nil node.



146
147
148
149
150
151
152
153
154
# File 'lib/record_extractor.rb', line 146

# Builds a CSS selector string for the given node.
#
# @param node [Object, nil] the node to describe
# @return [String, nil] "#<id>" when the node has an id, otherwise its CSS
#   path without the "html > body > " prefix; nil when node is nil
def node_css_selector(node)
  return nil if node.nil?

  node_id = node['id']
  if node_id
    # An id is the shortest selector available
    "##{node_id}"
  else
    # Fall back to the path computed by Nokogiri
    node.css_path.gsub('html > body > ', '')
  end
end

#node_heading?(node) ⇒ Boolean

Check if node is a heading

Returns:

  • (Boolean)


81
82
83
# File 'lib/record_extractor.rb', line 81

# Tells whether the node is one of the heading tags h1 to h6.
#
# @param node [Object] any object responding to `name`
# @return [Boolean]
def node_heading?(node)
  heading_tags = (1..6).map { |rank| "h#{rank}" }
  heading_tags.include?(node.name)
end

#node_heading_parent(node, level = 'h7') ⇒ Object

Get the closest heading parent



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/record_extractor.rb', line 86

# Finds the closest heading that precedes the node in the document.
#
# The search walks backwards through previous siblings and, when there are
# none left, moves up to the parent. It stops with nil once the parent is
# the body (or there is no parent at all). It is written as a loop rather
# than recursion so that a node with thousands of preceding siblings cannot
# exhaust the call stack.
#
# @param node [Object] the node to start from
# @param level [String] only headings whose tag name sorts before this
#   value are accepted; the default 'h7' accepts any heading
# @return [Object, nil] the heading node, or nil when none is found
def node_heading_parent(node, level = 'h7')
  current = node

  loop do
    # While no level is fixed yet, a heading we are standing on sets it:
    # from there on we only accept stronger headings
    level = current.name if level == 'h7' && node_heading?(current)

    previous = current.previous_element

    if previous
      # This is a heading, we return it
      return previous if node_heading?(previous) && previous.name < level

      current = previous
    else
      # No previous element, we go up to the parent
      parent = current.parent
      # No more parent, then no heading found
      return nil if parent.nil? || parent.name == 'body'

      current = parent
    end
  end
end

#node_hierarchy(node, state = { level: 7 }) ⇒ Object

Get all the parent headings of the specified node. If the node itself is a heading, it is included.



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/record_extractor.rb', line 108

# Collects the text of the headings above the node, keyed by tag name
# (:h1, :h2, ...). A heading passed as the starting node is included.
#
# @param node [Object] the node to start from
# @param state [Hash] accumulator, mutated in place; its :level entry holds
#   the strongest heading level recorded so far and is removed before
#   returning
# @return [Hash] the accumulator, without its :level entry
def node_hierarchy(node, state = { level: 7 })
  current = node

  while current
    name = current.name
    rank = name.delete('h').to_i

    # Only record headings stronger than the ones already seen
    if node_heading?(current) && rank < state[:level]
      state[name.to_sym] = node_text(current)
      state[:level] = rank
    end

    # Move on to the next heading up; nil ends the walk
    current = node_heading_parent(current)
  end

  state.delete(:level)
  state
end

#node_raw_html(node) ⇒ Object

Return the raw HTML of the element to index



129
130
131
# File 'lib/record_extractor.rb', line 129

# Returns the string representation of the node, as given by `to_s`.
#
# @param node [Object] the node to serialize
# @return [String]
def node_raw_html(node)
  node.to_s
end

#node_text(node) ⇒ Object

Return the text of the element, sanitized to be displayed



134
135
136
# File 'lib/record_extractor.rb', line 134

# Returns the node content with angle brackets replaced by HTML entities.
#
# @param node [Object] any object responding to `content`
# @return [String] the content with "<" and ">" escaped
def node_text(node)
  entities = { '<' => '&lt;', '>' => '&gt;' }
  node.content.gsub(/[<>]/, entities)
end

#slug ⇒ Object

Returns the slug of the document



45
46
47
48
49
50
51
52
53
54
# File 'lib/record_extractor.rb', line 45

# Resolves the slug of the file, trying in order: the 'slug' key of the
# file data (Jekyll v3), the file's own `slug` method (Jekyll v2), and
# finally the file name without its extension.
#
# @return [Object] the slug
def slug
  data = @file.data

  if data.key?('slug')
    data['slug']
  elsif @file.respond_to?(:slug)
    @file.slug
  else
    # Nothing explicit: derive it from the file name
    filename = File.basename(@file.path)
    File.basename(filename, File.extname(filename))
  end
end

#tags ⇒ Object

Extract a list of tags



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/record_extractor.rb', line 57

# Returns the tags of the file as strings.
#
# The tags are read from the 'tags' key of the file data when present
# (Jekyll v3), otherwise from the file's own `tags` method (Jekyll v2).
#
# @return [Array<String>, nil] the tags, or nil when none are available
def tags
  raw_tags =
    if @file.data.key?('tags')
      @file.data['tags']
    elsif @file.respond_to?(:tags)
      @file.tags
    end

  return nil if raw_tags.nil?

  # Some plugins turn tags into full objects; index their string form
  raw_tags.map { |tag| tag.to_s }
end

#unique_hierarchy(data) ⇒ Object

Returns a unique string of hierarchy from title to h6, used for distinct



139
140
141
142
# File 'lib/record_extractor.rb', line 139

# Joins the title and heading values of a record, from :title down to :h6,
# into a single " > "-separated string. Missing levels are skipped.
#
# @param data [Hash] a record with optional :title and :h1..:h6 keys
# @return [String]
def unique_hierarchy(data)
  levels = data.values_at(:title, :h1, :h2, :h3, :h4, :h5, :h6)
  levels.compact.join(' > ')
end

#weight(item, index) ⇒ Object

Returns an object of all weights



182
183
184
185
186
187
188
# File 'lib/record_extractor.rb', line 182

# Assembles the weights of a record.
#
# @param item [Hash] the record being built
# @param index [Object] the position of the node, stored as-is
# @return [Hash] with keys :tag_name, :heading_relevance and :position
def weight(item, index)
  tag_score = weight_tag_name(item)
  relevance_score = weight_heading_relevance(item)

  { tag_name: tag_score, heading_relevance: relevance_score, position: index }
end

#weight_heading_relevance(data) ⇒ Object

The more words are in common between this node and its parent heading, the higher the score



158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/record_extractor.rb', line 158

# Counts the distinct words shared by the record text and its headings.
#
# Words are compared case-insensitively and split on runs of non-word
# characters. Empty tokens are discarded: splitting a string that starts
# with punctuation yields a leading "", which would otherwise be counted
# as a word in common.
#
# @param data [Hash] a record with a :text key and optional :title and
#   :h1..:h6 keys
# @return [Integer] the number of distinct words in common
def weight_heading_relevance(data)
  # Get list of unique words in headings
  heading_words = %i(title h1 h2 h3 h4 h5 h6)
                  .select { |key| data.key?(key) }
                  .flat_map { |key| data[key].to_s.downcase.split(/\W+/) }
                  .reject(&:empty?)
                  .uniq
  # Intersect words in headings with words in text; a missing text is
  # treated as empty
  text_words = data[:text].to_s.downcase.split(/\W+/)
  (heading_words & text_words).size
end

#weight_tag_name(item) ⇒ Object

Returns a weight based on the tag_name



173
174
175
176
177
178
179
# File 'lib/record_extractor.rb', line 173

# Scores a record by its tag name: h1 is worth 100, each weaker heading
# level 10 less (down to 50 for h6), and anything else 0.
#
# @param item [Hash] a record with a :tag_name key
# @return [Integer]
def weight_tag_name(item)
  rank = %w(h1 h2 h3 h4 h5 h6).index(item[:tag_name])
  # Not a heading, no weight
  return 0 if rank.nil?

  100 - rank * 10
end