Class: Boilerpipe::Filters::MinClauseWordsFilter

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/filters/min_clause_words_filter.rb

Class Method Summary collapse

Class Method Details

.is_clause?(text, min_words = 5) ⇒ Boolean

Returns:

  • (Boolean)


27
28
29
30
31
32
# File 'lib/boilerpipe/filters/min_clause_words_filter.rb', line 27

# Decides whether a text fragment is long enough to count as a clause.
#
# The measure is the number of separator runs (one or more consecutive
# spaces, newlines or carriage returns) found in the text, not the number
# of words themselves.
#
# @param text [String, nil] fragment to inspect; nil is never a clause
# @param min_words [Integer] minimum number of separator runs required
# @return [Boolean] true when the separator-run count reaches min_words
def self.is_clause?(text, min_words = 5)
  if text.nil?
    false
  else
    separator_runs = text.scan(/[ \n\r]+/)
    separator_runs.length >= min_words
  end
end

.process(doc, min_words = 5) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/boilerpipe/filters/min_clause_words_filter.rb', line 11

# Demotes to non-content every content block that contains no clause of
# sufficient length.
#
# A candidate clause is a run of letters, digits, spaces or non-breaking
# spaces, followed by one or more of , . : ; ! ? and then either whitespace
# or the end of a line. Each candidate is checked with is_clause?; if none
# passes, the block's content flag is set to false. Blocks already reporting
# is_not_content? are skipped untouched.
#
# @param doc [#text_blocks] document whose text blocks are updated in place
# @param min_words [Integer] threshold forwarded to is_clause? for every
#   candidate clause
# @return [Object] the same doc that was passed in
def self.process(doc, min_words = 5)
  # Loop-invariant, so built once rather than once per text block.
  clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/

  doc.text_blocks.each do |tb|
    next if tb.is_not_content?

    # Forward min_words explicitly: omitting it made is_clause? fall back to
    # its own default, silently ignoring the caller's threshold.
    has_clause = tb.text.scan(clause_delimiter).any? do |possible_clause|
      is_clause?(possible_clause, min_words)
    end

    tb.content = false unless has_clause
  end

  doc
end