Class: HTML::HTMLParser

Inherits:
SGMLParser show all
Defined in:
lib/html/htmlparser.rb

Overview

(X)HTML parser.

Parses a String and returns an REXML::Document with the (X)HTML content.

For example:

html = "<p>paragraph</p>"
document = HTMLParser.parse(html)
puts document

Requires a patched version of SGMLParser.

Constant Summary

Constants inherited from SGMLParser

SGMLParser::Attrfind, SGMLParser::Charref, SGMLParser::Commentclose, SGMLParser::Commentopen, SGMLParser::Endbracket, SGMLParser::Endtagopen, SGMLParser::Entitydefs, SGMLParser::Entityref, SGMLParser::Incomplete, SGMLParser::Interesting, SGMLParser::Special, SGMLParser::Starttagopen, SGMLParser::Tagfind

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from SGMLParser

#close, #feed, #finish_endtag, #finish_starttag, #goahead, #handle_charref, #handle_endtag, #handle_entityref, #handle_starttag, #has_context, #parse_comment, #parse_endtag, #parse_special, #parse_starttag, #report_unbalanced, #reset, #setliteral, #setnomoretags

Constructor Details

#initialize ⇒ HTMLParser

Returns a new instance of HTMLParser.



368
369
370
371
372
# File 'lib/html/htmlparser.rb', line 368

# Sets up a parser with an empty document.
#
# Runs the superclass constructor first, then creates an HTML::Document from
# an empty string and makes that document's root the current insertion point
# for subsequently parsed nodes.
def initialize
  super
  @document = HTML::Document.new("")
  @current = @document.root
end

Instance Attribute Details

#document ⇒ Object (readonly)

Returns the value of attribute document.



360
361
362
# File 'lib/html/htmlparser.rb', line 360

# Reader for the @document instance variable.
#
# @return [Object] whatever is currently stored in @document
def document
  @document
end

Class Method Details

.parse(html) ⇒ Object



362
363
364
365
366
# File 'lib/html/htmlparser.rb', line 362

# Parses a complete piece of markup in one call and returns the document
# that the parser built from it.
#
# @param html [String] markup passed to the parser's #feed
# @return [Object] the parser's #document after the input has been consumed
def self.parse(html)
  parser = HTMLParser.new
  parser.feed(html)
  # This is a one-shot parse, so no further input will arrive: tell the
  # parser the input has ended.
  # NOTE(review): assumes the inherited SGMLParser#close processes anything
  # #feed left buffered (e.g. an unterminated trailing construct), as in
  # sgmllib-style parsers -- confirm against the patched SGMLParser.
  parser.close
  parser.document
end

Instance Method Details

#handle_comment(data) ⇒ Object



378
379
# File 'lib/html/htmlparser.rb', line 378

# No-op handler for comments: the argument is discarded and nothing is added
# to the document.
#
# @param data [Object] comment payload (unused)
# @return [nil]
def handle_comment(data)
  nil
end

#handle_data(data) ⇒ Object



374
375
376
# File 'lib/html/htmlparser.rb', line 374

# Appends a text node holding +data+ to the children of the current node.
#
# @param data [Object] character data to wrap in an HTML::Text node
# @return [Object] the result of appending to the current node's children
def handle_data(data)
  # The two zeros are presumably line/position placeholders -- confirm
  # against HTML::Text's constructor.
  text_node = HTML::Text.new(@current, 0, 0, data)
  @current.children << text_node
end

#handle_special(data) ⇒ Object



381
382
# File 'lib/html/htmlparser.rb', line 381

# No-op handler for special markup: the argument is discarded and nothing is
# added to the document.
#
# @param data [Object] payload of the special construct (unused)
# @return [nil]
def handle_special(data)
  nil
end

#unknown_charref(ref) ⇒ Object



398
399
# File 'lib/html/htmlparser.rb', line 398

# No-op handler for character references: the argument is discarded and
# nothing is added to the document.
#
# @param ref [Object] the character reference (unused)
# @return [nil]
def unknown_charref(ref)
  nil
end

#unknown_endtag(tag) ⇒ Object



394
395
396
# File 'lib/html/htmlparser.rb', line 394

# Handles an end tag by moving the current node up one level.
#
# The tag name is not inspected: any end tag moves to the parent of the
# current node. When the current node has no parent, it is left unchanged.
#
# @param tag [Object] name of the closing tag (unused)
# @return [Object, nil] the new current node, or nil when nothing changed
def unknown_endtag(tag)
  return unless @current.parent

  @current = @current.parent
end

#unknown_entityref(ref) ⇒ Object



401
402
403
# File 'lib/html/htmlparser.rb', line 401

# Keeps an entity reference in the document as literal text.
#
# The reference is re-emitted with its ampersand escaped and its terminating
# semicolon restored, so +ref+ "foo" is stored as "&amp;foo;". (The previous
# version ended the text with "&lt;" instead of ";", which turned the
# terminator into a "<" character.)
#
# @param ref [Object] entity name, without the leading "&" and trailing ";"
# @return [Object] the result of appending to the current node's children
def unknown_entityref(ref)
  # The two zeros are presumably line/position placeholders -- confirm
  # against HTML::Text's constructor.
  @current.children << HTML::Text.new(@current, 0, 0, "&amp;#{ref};")
end

#unknown_starttag(tag, attrs) ⇒ Object



384
385
386
387
388
389
390
391
392
# File 'lib/html/htmlparser.rb', line 384

# Opens a new element for a start tag and makes it the current node.
#
# The tag name and attribute names are lower-cased; attribute values are
# stored unchanged. The element is appended to the children of its parent,
# which is the current node, or the document when there is no current node.
#
# @param tag [String] tag name
# @param attrs [Enumerable] items whose [0] is the attribute name and [1] its value
# @return [Object] the newly created HTML::Tag, which is now the current node
def unknown_starttag(tag, attrs)
  attributes = attrs.each_with_object({}) do |attr, hash|
    hash[attr[0].downcase] = attr[1]
  end

  # Resolve the parent once and use it both for construction and for the
  # append. Previously the fallback to @document applied only to the
  # constructor argument, and the append then called #children on a nil
  # @current.
  parent = @current || @document

  # The zeros and the trailing true are passed through unchanged; presumably
  # line, position and a closing/flag argument -- confirm against HTML::Tag.
  element = HTML::Tag.new(parent, 0, 0, tag.downcase, attributes, true)
  parent.children << element
  @current = element
end