Class: Boilerpipe::Filters::CanolaClassifier

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/filters/canola_classifier.rb

Class Method Summary collapse

Class Method Details

.classify(prev, current, nxt) ⇒ Object



21
22
23
24
25
# File 'lib/boilerpipe/filters/canola_classifier.rb', line 21

# Decides whether +current+ should be flagged as content, based on its own
# word count and link density and those of its two neighbours.
#
# All thresholds are hard-coded. The checks run in a fixed order and stop at
# the first one that settles the answer.
#
# @param prev [#num_words, #link_density] block preceding +current+
# @param current [#num_words, #link_density] block being judged
# @param nxt [#num_words, #link_density] block following +current+
# @return [Boolean] true when any of the three rules below accepts +current+
def self.classify(prev, current, nxt)
  # Rule 1: the block contains links and is followed by a longer block.
  return true if current.link_density > 0 && nxt.num_words > 11

  # Rule 2: the block is long enough on its own.
  return true if current.num_words > 19

  # Rule 3: the follower is not tiny, both neighbours are link-free, and at
  # least one of the three blocks passes its word-count threshold.
  return false unless nxt.num_words > 6
  return false unless nxt.link_density == 0 && prev.link_density == 0

  current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19
end

.process(doc) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/boilerpipe/filters/canola_classifier.rb', line 7

# Runs +classify+ over every text block of +doc+ and stores each result in
# that block's +content+ attribute.
#
# The block list is padded at both ends with the same
# +TextBlock.empty_start+ object, so the first and last real blocks also have
# a previous/next neighbour to be compared against.
#
# @param doc [#text_blocks] document whose blocks are updated in place
# @return [Object] the same +doc+ that was passed in
def self.process(doc)
  blocks = doc.text_blocks
  return doc if blocks.empty?

  sentinel = Boilerpipe::Document::TextBlock.empty_start
  padded = [sentinel, *blocks, sentinel]

  # Slide a three-block window so each real block is seen once as the middle.
  padded.each_cons(3) do |prev, current, nxt|
    current.content = classify(prev, current, nxt)
  end

  doc
end