Class: Boilerpipe::Filters::ExpandTitleToContentFilter

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/filters/expand_title_to_content_filter.rb

Class Method Summary collapse

Class Method Details

.no_title_with_subsequent_content?(content_start, title) ⇒ Boolean

Returns:

  • (Boolean)


36
37
38
# File 'lib/boilerpipe/filters/expand_title_to_content_filter.rb', line 36

# Tells the caller whether there is nothing to expand, i.e. the scan did not
# yield a title position that comes strictly before a content position.
#
# @param content_start [Integer, nil] position of the first content block,
#   or nil when none was found
# @param title [Integer, nil] position of the title block, or nil when none
#   was found
# @return [Boolean] true when either position is nil or when the content
#   position is not after the title position
def self.no_title_with_subsequent_content?(content_start, title)
  return true if title.nil?
  return true if content_start.nil?

  content_start <= title
end

.process(doc) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/boilerpipe/filters/expand_title_to_content_filter.rb', line 8

# Marks blocks labelled :MIGHT_BE_CONTENT as content when they sit between
# the title block and the first content block of the document.
#
# The block list is walked once. While no content block has been seen, each
# block labelled :TITLE overwrites the remembered title position, so the
# last such block wins; the first block answering true to is_content? ends
# the search for both positions.
#
# @param doc [#text_blocks] document whose text blocks are inspected and
#   possibly flagged as content (blocks are modified in place)
# @return [Object] the same doc that was passed in
def self.process(doc)
  blocks = doc.text_blocks
  title = nil
  content_start = nil

  blocks.each_with_index do |block, index|
    # Once the first content block is known, nothing later can change
    # either position, so the remaining blocks are not queried.
    next unless content_start.nil?

    title = index if block.has_label?(:TITLE)
    content_start = index if block.is_content?
  end

  return doc if no_title_with_subsequent_content?(content_start, title)

  # Exclusive range: the title block itself is included, the first content
  # block is not.
  candidates = blocks.slice(title...content_start)
  candidates.each do |block|
    block.content = true if block.has_label?(:MIGHT_BE_CONTENT)
  end

  doc
end