Class: Boilerpipe::Filters::KeepLargestBlockFilter

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/filters/keep_largest_block_filter.rb

Constant Summary collapse

# Shared instance built with expand_to_same_level_text = false and min_words = 0.
INSTANCE =
KeepLargestBlockFilter.new(false, 0)
# Shared instance built with expand_to_same_level_text = true and min_words = 0.
INSTANCE_EXPAND_TO_SAME_TAGLEVEL =
KeepLargestBlockFilter.new(true, 0)
# Shared instance built with expand_to_same_level_text = true and min_words = 150.
INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS =
KeepLargestBlockFilter.new(true, 150)

Instance Method Summary collapse

Constructor Details

#initialize(expand_to_same_level_text, min_words) ⇒ KeepLargestBlockFilter

Returns a new instance of KeepLargestBlockFilter.



10
11
12
13
# File 'lib/boilerpipe/filters/keep_largest_block_filter.rb', line 10

# Stores the two settings that #process reads later.
#
# @param expand_to_same_level_text [Object] used by #process as a
#   truthy/falsy switch for expanding the selection to neighbouring blocks
# @param min_words [Object] word-count threshold handed to
#   #expand_tag_level by #process
def initialize(expand_to_same_level_text, min_words)
  @expand_to_same_level_text, @min_words = expand_to_same_level_text, min_words
end

Instance Method Details

#expand_tag_level(tbs, level, min_words) ⇒ Object

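Walks the given blocks in order and marks each block at the given tag level as content when it has at least min_words words; stops at the first block whose tag level is lower than the given level.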



49
50
51
52
53
54
55
56
57
# File 'lib/boilerpipe/filters/keep_largest_block_filter.rb', line 49

# Walks +tbs+ in order and flags blocks sitting at exactly +level+ as
# content when they carry at least +min_words+ words.
#
# The walk stops at the first block whose tag level is lower than
# +level+; blocks with a higher tag level are skipped without stopping.
#
# @param tbs [Array] blocks responding to #tag_level, #num_words and #content=
# @param level [Integer] tag level a block must match to be flagged
# @param min_words [Integer] minimum word count for a block to be flagged
# @return [Array, nil] +tbs+ when every block was visited, nil when the
#   walk stopped early
def expand_tag_level(tbs, level, min_words)
  tbs.each do |block|
    # a lower tag level ends the walk
    break if block.tag_level < level
    # higher tag levels are ignored but do not end the walk
    next unless block.tag_level == level

    block.content = true if block.num_words >= min_words
  end
end

#process(doc) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/boilerpipe/filters/keep_largest_block_filter.rb', line 19

# Keeps the content block with the most words and demotes every other
# block, optionally re-promoting neighbours on the same tag level.
#
# Every block other than the largest one gets +content = false+ and the
# +:MIGHT_BE_CONTENT+ label; the largest one gets +content = true+ and
# +:VERY_LIKELY_CONTENT+. When the filter was built with a truthy
# +expand_to_same_level_text+, blocks to the left and right of the
# largest block are then passed to #expand_tag_level.
#
# If no block is flagged as content there is no largest block: all blocks
# are demoted and no expansion takes place.
#
# @param doc [#text_blocks] document whose text blocks are mutated in place
# @return [Object] +false+ when the document has fewer than two blocks;
#   otherwise the value of the last evaluated expression, which callers
#   should not rely on
def process(doc)
  tbs = doc.text_blocks
  return false if tbs.size < 2

  # find the content block with the most words (nil when none is content)
  largest_block = tbs.select(&:is_content?).max_by(&:num_words)
  # only read the tag level when a largest block exists; calling
  # tag_level on nil would raise NoMethodError
  level = (@expand_to_same_level_text && largest_block) ? largest_block.tag_level : -1

  # set labels for text blocks
  tbs.each do |tb|
    if tb == largest_block
      tb.content = true
      tb.add_label :VERY_LIKELY_CONTENT
    else
      tb.content = false
      tb.add_label :MIGHT_BE_CONTENT
    end
  end

  # n is nil when there is no largest block, which skips the expansion
  n = tbs.index(largest_block)
  if @expand_to_same_level_text && n
    # expand blocks to the left, nearest neighbour first
    expand_tag_level(tbs[0...n].reverse, level, @min_words)

    # expand blocks to the right
    expand_tag_level(tbs[n + 1..-1], level, @min_words)
  end
end