Class: Boilerpipe::Extractors::ArticleExtractor

Inherits:
Object
Defined in:
lib/boilerpipe/extractors/article_extractor.rb

Class Method Summary collapse

Class Method Details

.process(doc) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/boilerpipe/extractors/article_extractor.rb', line 9

# Runs the article-extraction filter chain over +doc+ and hands +doc+ back.
#
# The filters are applied one after another in the fixed order below; each is
# given the same +doc+ and is expected to update it in place (their return
# values are not used here). The title is read once, up front, before any
# filter has touched the document.
#
# @param doc [Object] document responding to #title and accepted by the filters
# @return [Object] the +doc+ argument itself
def self.process(doc)
  headline = doc.title
  stage = ::Boilerpipe::Filters

  # Flag the blocks that signal the end of the text (:INDICATES_END_OF_TEXT).
  stage::TerminatingBlocksFinder.process(doc)

  # Flag the blocks that match the document title.
  stage::DocumentTitleMatchClassifier.new(headline).process(doc)

  # Classify blocks as content / non-content with the boilerpipe algorithm.
  stage::NumWordsRulesClassifier.process(doc)

  # Everything after an INDICATES_END_OF_TEXT block becomes non-content.
  stage::IgnoreBlocksAfterContentFilter.process(doc)

  # HEADING blocks that trail the existing content become non-content.
  stage::TrailingHeadlineToBoilerplateFilter.process(doc)

  # Fuse text blocks that sit next to each other.
  stage::BlockProximityFusion::MAX_DISTANCE_1.process(doc)

  # Drop the non-content text blocks.
  stage::BoilerplateBlockFilter::INSTANCE_KEEP_TITLE.process(doc)

  # Fuse neighbouring text blocks again, this time only on the same tag level.
  stage::BlockProximityFusion::MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc)

  # Retain only the largest text block as content.
  stage::KeepLargestBlockFilter::INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc)

  # Mark as content every block between the headline and the part that is
  # already content.
  stage::ExpandTitleToContentFilter.process(doc)

  # Blocks with a lot of text on the same tag level as the largest current
  # content are marked as additional content.
  stage::LargeBlockSameTagLevelToContentFilter.process(doc)

  # Nested list-item blocks after the end of the main content become content.
  stage::ListAtEndFilter.process(doc)

  doc
end

.text(contents) ⇒ Object



3
4
5
6
7
# File 'lib/boilerpipe/extractors/article_extractor.rb', line 3

# Convenience entry point: parses +contents+ with BoilerpipeHTMLParser, runs
# ArticleExtractor.process on the parsed document, and returns that
# document's #content.
#
# The value returned by ArticleExtractor.process is not used; #content is read
# from the parsed document itself.
#
# @param contents [Object] input handed unchanged to BoilerpipeHTMLParser.parse
# @return [Object] whatever the parsed document's #content returns after processing
def self.text(contents)
  ::Boilerpipe::SAX::BoilerpipeHTMLParser
    .parse(contents)
    .tap { |parsed| ::Boilerpipe::Extractors::ArticleExtractor.process(parsed) }
    .content
end