Class: Boilerpipe::Filters::DocumentTitleMatchClassifier

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/filters/document_title_match_classifier.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(title) ⇒ DocumentTitleMatchClassifier

Returns a new instance of DocumentTitleMatchClassifier.



12
13
14
15
# File 'lib/boilerpipe/filters/document_title_match_classifier.rb', line 12

# Creates a classifier for the given document title.
#
# Starts from an empty Set of candidate titles and then hands +title+ to
# generate_potential_titles (defined elsewhere in this class).
# NOTE(review): that helper presumably fills @potential_titles with
# variants of the title — confirm against its definition.
#
# @param title forwarded unchanged to generate_potential_titles
def initialize(title)
  @potential_titles = Set[]
  generate_potential_titles title
end

Instance Attribute Details

#potential_titles ⇒ Object (readonly)

Returns the value of attribute potential_titles.



10
11
12
# File 'lib/boilerpipe/filters/document_title_match_classifier.rb', line 10

# Read-only accessor for the candidate titles.
#
# @return the object stored in @potential_titles (the constructor assigns a
#   Set there)
def potential_titles
  @potential_titles
end

Instance Method Details

#process(doc) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/boilerpipe/filters/document_title_match_classifier.rb', line 17

# Labels the first text block whose text matches one of the potential titles.
#
# Each block's text is normalised: non-breaking spaces become plain spaces,
# apostrophes are dropped, surrounding whitespace is stripped and the text is
# lower-cased. The result is looked up in @potential_titles. If that misses,
# the characters ? ! . - : are removed, the text is stripped again and the
# lookup is retried. The first block that matches either way receives the
# :TITLE label and scanning stops.
#
# @param doc an object responding to #text_blocks; every block must respond
#   to #text and #add_label
# @return the same doc that was passed in
def process(doc)
  return doc if @potential_titles.empty?

  # The hyphen is escaped so it is a literal character. Unescaped, ".-:" is a
  # range that also covers "/" and the digits 0-9, which would strip numbers
  # out of the text before the second lookup.
  remove_characters = /[?!.\-:]+/

  doc.text_blocks.each do |tb|
    # Double quotes are required here: in a single-quoted string \u00a0 is
    # six literal characters, not a non-breaking space.
    text = tb.text.gsub("\u00a0", ' ')
      .gsub("'", '')
      .strip.downcase

    # The punctuation-free variant is only computed when the first lookup misses.
    if @potential_titles.member?(text) ||
       @potential_titles.member?(text.gsub(remove_characters, '').strip)
      tb.add_label :TITLE
      break
    end
  end

  doc
end