Class: Boilerpipe::Filters::MinClauseWordsFilter
- Inherits: Object
- Inheritance hierarchy: Object → Boilerpipe::Filters::MinClauseWordsFilter
- Defined in:
- lib/boilerpipe/filters/min_clause_words_filter.rb
Class Method Summary collapse
Class Method Details
.is_clause?(text, min_words = 5) ⇒ Boolean
27 28 29 30 31 32 |
# File 'lib/boilerpipe/filters/min_clause_words_filter.rb', line 27
#
# Decides whether a piece of text is long enough to count as a clause.
#
# The length is measured by counting runs of spaces, newlines and carriage
# returns in +text+ (not by counting words directly) and comparing that
# count against +min_words+.
#
# @param text [String, nil] candidate clause text
# @param min_words [Integer] minimum number of whitespace runs required
# @return [Boolean] false when +text+ is nil; otherwise whether the number
#   of whitespace runs is at least +min_words+
def self.is_clause?(text, min_words = 5)
  if text.nil?
    false
  else
    separator_runs = text.scan(/[ \n\r]+/)
    separator_runs.size >= min_words
  end
end
.process(doc, min_words = 5) ⇒ Object
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/boilerpipe/filters/min_clause_words_filter.rb', line 11
#
# Marks as non-content every content text block that contains no clause of
# sufficient length.
#
# A candidate clause is a run of letters, digits, spaces or non-breaking
# spaces followed by one or more of , . : ; ! ? and then whitespace or an
# end of line. Each candidate is checked with +is_clause?+ using the
# caller's +min_words+ threshold.
#
# @param doc [#text_blocks] document whose text blocks are filtered in
#   place; each block must respond to +is_not_content?+, +text+ and
#   +content=+
# @param min_words [Integer] threshold forwarded to +is_clause?+
# @return [Object] the same +doc+ that was passed in
def self.process(doc, min_words = 5)
  # Loop-invariant, so built once rather than once per text block.
  clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/

  doc.text_blocks.each do |tb|
    next if tb.is_not_content?

    # min_words must be passed explicitly; otherwise is_clause? silently
    # falls back to its own default and the caller's threshold is ignored.
    has_clause = tb.text.scan(clause_delimiter).any? do |possible_clause|
      is_clause?(possible_clause, min_words)
    end

    tb.content = false unless has_clause
  end

  doc
end