Class: Boilerpipe::Filters::IgnoreBlocksAfterContentFilter

Inherits:
HeuristicFilterBase
Defined in:
lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb

Class Method Summary

Methods inherited from HeuristicFilterBase

num_full_text_words

Class Method Details

.process(doc, min_num_words = 60) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb', line 8

# Clears the content flag on every text block from the point where the
# end of the main text is detected onwards.
#
# Blocks from doc.text_blocks are visited in order while a running total of
# num_full_text_words is kept over the blocks whose is_content? is truthy.
# The first block labelled :INDICATES_END_OF_TEXT at which that total
# (already including the block's own words when it is content) has reached
# min_num_words triggers the cut-off: that block and all blocks after it
# get content set to false. Earlier blocks are left untouched.
#
# @param doc [#text_blocks] document whose blocks are modified in place
# @param min_num_words [Integer] minimum accumulated word count required
#   before an end-of-text label is honoured
# @return [Object] the doc that was passed in
def self.process(doc, min_num_words = 60)
  word_total = 0
  past_end = false

  doc.text_blocks.each do |block|
    marks_end = block.has_label?(:INDICATES_END_OF_TEXT)

    # Count the current block first so its own words contribute to the
    # threshold check below.
    word_total += num_full_text_words(block) if block.is_content?

    # Latches: once set it stays set for all remaining blocks.
    past_end ||= marks_end && word_total >= min_num_words

    block.content = false if past_end
  end

  doc
end