Class: Boilerpipe::Filters::ExpandTitleToContentFilter

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/filters/expand_title_to_content_filter.rb

Class Method Summary collapse

Class Method Details

.no_title_with_subsequent_content?(content_start, title_idx) ⇒ Boolean

Returns:

  • (Boolean) — true when either index is nil or the title does not start before the first content block


25
26
27
28
# File 'lib/boilerpipe/filters/expand_title_to_content_filter.rb', line 25

# Tells whether title expansion must be skipped.
#
# @param content_start [Integer, nil] index of the first content block, if any
# @param title_idx [Integer, nil] index of the title block, if any
# @return [Boolean] true when either index is missing, or when the title does
#   not come strictly before the first content block
def self.no_title_with_subsequent_content?(content_start, title_idx)
  # Nothing to expand without both a title and a content block.
  return true if title_idx.nil?
  return true if content_start.nil?

  # The title has to start before the content.
  title_idx >= content_start
end

.process(doc) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/boilerpipe/filters/expand_title_to_content_filter.rb', line 8

# Marks as content every :MIGHT_BE_CONTENT block found from the last :TITLE
# block up to (but excluding) the first content block.
#
# The document is left untouched when there is no title, no content block, or
# when the last title does not precede the first content block.
#
# @param doc [#text_blocks] document whose text blocks are flagged in place
# @return [Object] the very same +doc+ that was passed in
def self.process(doc)
  tbs = doc.text_blocks

  # Locate the last :TITLE block with one backward scan; this avoids building
  # an intermediate array and does not rely on the blocks' #== to find it again.
  title_idx = tbs.rindex { |tb| tb.has_label?(:TITLE) }
  content_start = tbs.find_index(&:is_content?)

  return doc if no_title_with_subsequent_content?(content_start, title_idx)

  # Exclusive range: the first content block itself is already content.
  tbs[title_idx...content_start].each do |tb|
    tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
  end

  doc
end