Class: Boilerpipe::Filters::ExpandTitleToContentFilter
- Inherits:
  - Object
- Inheritance hierarchy:
  - Object
  - Boilerpipe::Filters::ExpandTitleToContentFilter
- Defined in:
- lib/boilerpipe/filters/expand_title_to_content_filter.rb
Class Method Summary collapse
Class Method Details
.no_title_with_subsequent_content?(content_start, title_idx) ⇒ Boolean
25 26 27 28 |
# File 'lib/boilerpipe/filters/expand_title_to_content_filter.rb', line 25

# Reports whether there is no title index that lies before the content
# start index.
#
# @param content_start [Integer, nil] index at which content starts
# @param title_idx [Integer, nil] index of the title
# @return [Boolean] true when either index is nil, or when title_idx is
#   not strictly less than content_start
def self.no_title_with_subsequent_content?(content_start, title_idx)
  return true if title_idx.nil? || content_start.nil?

  # The title has to start before the content.
  title_idx >= content_start
end
.process(doc) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/boilerpipe/filters/expand_title_to_content_filter.rb', line 8

# Flags as content every :MIGHT_BE_CONTENT text block found in the span
# that starts at the title block and stops just before the first content
# block. The doc is returned untouched when there is no title block ahead
# of the first content block.
#
# @param doc [Object] document responding to +text_blocks+
# @return [Object] the doc that was passed in
def self.process(doc)
  blocks = doc.text_blocks

  # The last block carrying the :TITLE label is treated as the title.
  last_title = blocks.select { |block| block.has_label?(:TITLE) }.last
  title_idx = blocks.index(last_title)
  content_start = blocks.find_index(&:is_content?)

  unless no_title_with_subsequent_content?(content_start, title_idx)
    # Exclusive range: the first content block itself is left out.
    candidates = blocks.slice(title_idx...content_start)
                       .select { |block| block.has_label?(:MIGHT_BE_CONTENT) }
    candidates.each { |block| block.content = true }
  end

  doc
end