Class: Boilerpipe::SAX::BoilerpipeHTMLParser

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/sax/boilerpipe_html_parser.rb

Class Method Summary collapse

Class Method Details

.parse(text) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/boilerpipe/sax/boilerpipe_html_parser.rb', line 3

# Cleans up an HTML string, normalizes it with Nokogiri, and runs it through
# Nokogiri's SAX parser using an HTMLContentHandler.
#
# The caller's string is never modified: all clean-up happens on copies, so
# frozen strings are accepted.
#
# @param text [String] raw HTML markup
# @return [Object] whatever HTMLContentHandler#text_document returns once the
#   SAX parse has finished
def self.parse(text)
  # script bug - delete script elements before parsing.
  # The pattern tolerates attributes on the opening tag, bodies that span
  # several lines (/m), and empty bodies (.*?), so an empty <script></script>
  # can no longer make the match run on to a later </script> and swallow the
  # real content in between.
  cleaned = text.gsub(%r{<script(?:\s[^>]*)?>.*?</script\s*>}im, '')

  # nokogiri uses libxml for mri and nekohtml for jruby
  # mri doesn't remove &nbsp; when missing the semicolon
  cleaned = cleaned.gsub(/(&nbsp) /, '\1; ')

  # use nokogiri to fix any bad tags, errors - keep experimenting with this
  cleaned = Nokogiri::HTML(cleaned).to_html

  handler = HTMLContentHandler.new
  Nokogiri::HTML::SAX::Parser.new(handler).parse(cleaned)
  handler.text_document
end