Class: Newstile::Parser::Html

Inherits:
Base
  • Object
show all
Includes:
Parser
Defined in:
lib/newstile/parser/html.rb

Overview

Used for parsing an HTML document.

Defined Under Namespace

Modules: Constants, Parser. Classes: ElementConverter

Constant Summary

Constants included from Parser

Parser::HTML_RAW_START

Constants included from Constants

Constants::HTML_ATTRIBUTE_RE, Constants::HTML_BLOCK_ELEMENTS, Constants::HTML_COMMENT_RE, Constants::HTML_DOCTYPE_RE, Constants::HTML_ELEMENTS_WITHOUT_BODY, Constants::HTML_ENTITY_RE, Constants::HTML_INSTRUCTION_RE, Constants::HTML_PARSE_AS, Constants::HTML_PARSE_AS_BLOCK, Constants::HTML_PARSE_AS_RAW, Constants::HTML_PARSE_AS_SPAN, Constants::HTML_SPAN_ELEMENTS, Constants::HTML_TAG_CLOSE_RE, Constants::HTML_TAG_RE

Instance Method Summary collapse

Methods included from Parser

#handle_html_script_tag, #handle_html_start_tag, #parse_raw_html

Methods inherited from Base

#adapt_source, #add_text, #extract_string, #initialize, parse, #warning

Constructor Details

This class inherits a constructor from Newstile::Parser::Base

Instance Method Details

#parse(source) ⇒ Object

Parse source as an HTML document and return the created tree.



466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
# File 'lib/newstile/parser/html.rb', line 466

# Parse +source+ as an HTML document and return the created tree.
#
# The method resets the parser state (<tt>@stack</tt>, <tt>@tree</tt>,
# <tt>@src</tt>), collects any leading processing instructions, doctype
# declarations and comments as block-level children of the root element,
# hands the remaining input to +parse_raw_html+, and finally runs an
# ElementConverter over the children of the root element.
#
# Returns the root element stored in <tt>@tree</tt>.
def parse(source)
  @stack = []
  @tree = Element.new(:root)
  @src = StringScanner.new(adapt_source(source))

  # Element type paired with the pattern that recognises it. The patterns are
  # tried in this order at the current scan position; each one tolerates
  # leading whitespace, which is stripped before the text is stored.
  prolog_patterns = [
    [:xml_pi, /\s*#{HTML_INSTRUCTION_RE}/],
    [:html_doctype, /\s*#{HTML_DOCTYPE_RE}/],
    [:xml_comment, /\s*#{HTML_COMMENT_RE}/]
  ]

  # Keep consuming prolog items until none of the patterns matches any more.
  matched_text = nil
  while (hit = prolog_patterns.find { |_, pattern| matched_text = @src.scan(pattern) })
    @tree.children << Element.new(hit.first, matched_text.strip, nil, :category => :block)
  end

  # The handler calls itself through parse_raw_html: whenever it is invoked
  # with a false/nil second argument, the element's content is parsed with
  # the same handler.
  descend = lambda do |element, closed|
    parse_raw_html(element, &descend) unless closed
  end
  parse_raw_html(@tree, &descend)

  converter = ElementConverter.new(@doc)
  @tree.children.each { |child| converter.process(child) }
  converter.remove_whitespace_children(@tree)
  @tree
end