Class: Boilerpipe::Filters::KeepLargestBlockFilter
- Inherits:
-
Object
- Object
- Boilerpipe::Filters::KeepLargestBlockFilter
- Defined in:
- lib/boilerpipe/filters/keep_largest_block_filter.rb
Constant Summary collapse
- INSTANCE =
KeepLargestBlockFilter.new(false, 0)
- INSTANCE_EXPAND_TO_SAME_TAGLEVEL =
KeepLargestBlockFilter.new(true, 0)
- INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS =
KeepLargestBlockFilter.new(true, 150)
Instance Method Summary collapse
-
#expand_tag_level(tbs, level, min_words) ⇒ Object
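Marks blocks on the given tag level as content when they have at least min_words words, stopping at the first block with a lower tag level.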
-
#initialize(expand_to_same_level_text, min_words) ⇒ KeepLargestBlockFilter
constructor
A new instance of KeepLargestBlockFilter.
- #process(doc) ⇒ Object
Constructor Details
#initialize(expand_to_same_level_text, min_words) ⇒ KeepLargestBlockFilter
Returns a new instance of KeepLargestBlockFilter.
10 11 12 13 |
# File 'lib/boilerpipe/filters/keep_largest_block_filter.rb', line 10

# Stores the two settings that #process reads later.
#
# @param expand_to_same_level_text [Boolean] when truthy, #process also
#   expands to the blocks left and right of the largest block
# @param min_words [Integer] word-count threshold handed to the expansion step
def initialize(expand_to_same_level_text, min_words)
  @expand_to_same_level_text = expand_to_same_level_text
  @min_words = min_words
end
Instance Method Details
#expand_tag_level(tbs, level, min_words) ⇒ Object
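Marks blocks on the given tag level as content when they have at least min_words words, stopping at the first block with a lower tag level.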
49 50 51 52 53 54 55 56 57 |
# File 'lib/boilerpipe/filters/keep_largest_block_filter.rb', line 49

# Marks text blocks on the given tag level as content.
#
# Walks +tbs+ in order and stops at the first block whose tag level is lower
# than +level+. A block on exactly +level+ gets +content = true+ when it has
# at least +min_words+ words; blocks on a deeper level are skipped.
#
# @param tbs [Array] text blocks responding to #tag_level, #num_words and #content=
# @param level [Integer] tag level to expand on
# @param min_words [Integer] minimum word count for a block to be marked
# @return [Array, nil] +tbs+ when every block was visited, nil when the walk
#   stopped early at a lower tag level
def expand_tag_level(tbs, level, min_words)
  tbs.each do |tb|
    # a shallower block ends the run of related blocks
    break if tb.tag_level < level
    next unless tb.tag_level == level

    tb.content = true if tb.num_words >= min_words
  end
end
#process(doc) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/boilerpipe/filters/keep_largest_block_filter.rb', line 19

# Keeps the content block with the most words and demotes every other block.
#
# The largest block is flagged as content and labelled :VERY_LIKELY_CONTENT;
# all other blocks are flagged as non-content and labelled :MIGHT_BE_CONTENT.
# When the filter was built with expand_to_same_level_text, the blocks to the
# left and right of the largest block are then passed to #expand_tag_level.
#
# @param doc [Object] document responding to #text_blocks
# @return [false, Object] false when the document has fewer than two text
#   blocks; otherwise the value of the last statement, which carries no meaning
def process(doc)
  tbs = doc.text_blocks
  return false if tbs.size < 2

  # find tb with the most words
  largest_block = tbs.select(&:is_content?).max_by(&:num_words)

  # max_by yields nil when no block is flagged as content; fall back to -1
  # instead of calling tag_level on nil
  level = @expand_to_same_level_text && largest_block ? largest_block.tag_level : -1

  # set labels for text blocks
  tbs.each do |tb|
    if tb == largest_block
      tb.content = true
      tb.add_label :VERY_LIKELY_CONTENT
    else
      tb.content = false
      tb.add_label :MIGHT_BE_CONTENT
    end
  end

  n = tbs.index(largest_block)
  if @expand_to_same_level_text && n
    # expand blocks to the left
    expand_tag_level(tbs[0...n].reverse, level, @min_words)

    # expand blocks to the right
    expand_tag_level(tbs[n + 1..-1], level, @min_words)
  end
end