Class: Boilerpipe::Filters::TerminatingBlocksFinder

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/filters/terminating_blocks_finder.rb

Class Method Summary collapse

Class Method Details

.finds_match?(text) ⇒ Boolean

Returns:

  • (Boolean)


21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/boilerpipe/filters/terminating_blocks_finder.rb', line 21

# Reports whether +text+ contains one of the known phrases that mark the end
# of an article's main content (comment sections, rating prompts, copyright
# footers and similar).
#
# Every literal below is lowercase and compared case-sensitively, so
# mixed-case input only matches if it has been downcased beforehand.
#
# @param text [String] block text to inspect
# @return [Boolean] true when a terminating phrase is recognised, false otherwise
def self.finds_match?(text)
  text.start_with?('comments') ||
    # starts with number: \A pins the count to the very beginning of the text
    # (^ would also match after any embedded newline), and match? yields a
    # strict true/false rather than a match offset.
    text.match?(/\A\d+ (comments|users responded in)/) ||
    text.start_with?('© reuters') ||
    text.start_with?('please rate this') ||
    text.start_with?('post a comment') ||
    text.include?('what you think...') ||
    text.include?('add your comment') ||
    text.include?('add comment') ||
    # TODO: add these and test
    # text.include?('leave a reply') ||
    # text.include?('leave a comment') ||
    # text.include?('show comments') ||
    # text.include?('Share this:') ||
    text.include?('reader views') ||
    text.include?('have your say') ||
    text.include?('reader comments') ||
    text.include?('rätta artikeln') ||
    text == 'thanks for your comments - this feedback is now closed'
end

.process(doc) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/boilerpipe/filters/terminating_blocks_finder.rb', line 7

# Labels short text blocks that signal the end of the main article text.
#
# Blocks with 15 or more words are skipped. A remaining block receives the
# :INDICATES_END_OF_TEXT label when either
# * its text is at least 8 characters long and its downcased form satisfies
#   finds_match?, or
# * its link density is exactly 1.0 and its text is the single word
#   "comment" in any letter case.
#
# @param doc [#text_blocks] document whose blocks are inspected; matching
#   blocks have the label appended to their +labels+ in place
# @return [Object] the same +doc+ that was passed in
def self.process(doc)
  doc.text_blocks.each do |tb|
    next unless tb.num_words < 15

    # Compare case-insensitively in both branches; previously the fallback
    # compared the raw text, so a link reading "Comment" was never labelled.
    lowered = tb.text.downcase

    terminating =
      (tb.text.length >= 8 && finds_match?(lowered)) ||
      (tb.link_density == 1.0 && lowered == 'comment')

    tb.labels << :INDICATES_END_OF_TEXT if terminating
  end

  doc
end