Class: XRay::HTML::Parser

Inherits:
BaseParser show all
Defined in:
lib/html/parser.rb

Direct Known Subclasses

VisitableParser

Constant Summary collapse

# Text content: a run of one or more characters other than `<`.
TEXT =
/[^<]+/m
# Attribute name: a word character followed by any mix of word characters,
# `-` and `:`.
PROP_NAME =
%r/\w[-:\w]*/m
# Attribute value in one of three forms, each with its own capture group:
# single-quoted (group 1), double-quoted (group 2), or an unquoted run of
# characters that are neither whitespace nor `>` (group 3).
PROP_VALUE =
%r/'([^']*)'|"([^"]*)"|([^\s>]+)/m
# Attribute: a name, optionally followed by `=` and a value; whitespace is
# allowed on either side of the `=`.
PROP =
%r/#{PROP_NAME}\s*(?:=\s*#{PROP_VALUE})?/m
# Tag name: a word character followed by any characters other than `>`, `(`,
# `)`, `/` and whitespace.
TAG_NAME =
/\w[^>\(\)\/\s]*/
# Start of a tag: `<` immediately followed by a tag name (captured in group 1).
TAG_START =
%r/<(#{TAG_NAME})/m
# Closing tag: `</`, a tag name, optional whitespace, `>`.
TAG_END =
%r/<\/#{TAG_NAME}\s*>/m
# Complete opening tag: TAG_START, zero or more whitespace-separated
# attributes, optional whitespace, then `>`.
TAG =
%r/#{TAG_START}(\s+#{PROP})*\s*>/m
# Self-closing tag: same shape as TAG but terminated by `/>`.
SELF_CLOSE_TAG =
%r/#{TAG_START}(\s+#{PROP})*\s*\/>/m
# Doctype declaration (case-insensitive), optionally preceded by whitespace.
# Group 1 is the `doctype` keyword as written; group 2 is everything between
# the keyword and the first following `>`.
DTD =
/\s*<!(doctype)\s+(.*?)>/im
# Comment: `<!--` up to the nearest `-->`; the body (which may span lines) is
# captured in group 1.
COMMENT =
/<!--(.*?)-->/m

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from BaseParser

#batch, #check, #eos?, #initialize, #raw_scan, #reset, #scan, #skip, #skip_empty, #to_s

Constructor Details

This class inherits a constructor from XRay::BaseParser

Class Method Details

.parse(src) {|doc| ... } ⇒ Object

Yields:

  • (doc)


10
11
12
13
14
15
# File 'lib/html/parser.rb', line 10

# Convenience entry point: builds a parser for +src+, runs it, and hands the
# result to the optional block.
#
# @param src the source passed to the parser's constructor
# @yield [doc] the value returned by the instance-level #parse
# @return the value returned by the instance-level #parse
def self.parse(src, &block)
  document = new(src).parse
  block.call(document) if block
  document
end

Instance Method Details

#parse ⇒ Object



29
30
31
# File 'lib/html/parser.rb', line 29

# Parses the source held by this parser.
#
# Pure delegation: the result is whatever #parse_doc returns.
def parse
  parse_doc
end

#parse_comment ⇒ Object



63
64
65
66
# File 'lib/html/parser.rb', line 63

# Consumes a comment matching COMMENT and wraps its body.
#
# @return [CommentElement] built from capture group 1 of the match, i.e. the
#   text between `<!--` and `-->`
def parse_comment
  scan(COMMENT)
  body = @scanner[1]
  CommentElement.new(body)
end

#parse_doc ⇒ Object



33
34
35
36
37
38
39
40
41
# File 'lib/html/parser.rb', line 33

# Parses every element via batch(:parse_element) and picks the return shape
# from the number of nodes collected.
#
# @return [nil] when no nodes were collected
# @return [Object] the node itself when exactly one was collected
# @return [::XRay::HTML::Document] wrapping all nodes otherwise
def parse_doc
  elements = batch(:parse_element)
  count = elements.size
  return nil if count == 0
  return elements[0] if count == 1

  ::XRay::HTML::Document.new(elements)
end

#parse_dtd ⇒ Object



58
59
60
61
# File 'lib/html/parser.rb', line 58

# Consumes a doctype declaration matching DTD.
#
# The DTDElement is built from, in this order: capture group 2 (the text after
# the keyword), capture group 1 (the `doctype` keyword as written), and the
# +position+ reported by the object that #scan returned.
#
# @return [DTDElement]
def parse_dtd
  matched = scan(DTD)
  keyword = @scanner[1]
  declaration = @scanner[2]
  DTDElement.new(declaration, keyword, matched.position)
end

#parse_element ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/html/parser.rb', line 43

# Parses the next element by peeking at the scanner without consuming input.
#
# Candidates are tried in this order: doctype (accepted only the first time,
# tracked through @dtd_checked), comment, tag, plain text. If none applies
# because text_end? is already true, the error is reported via parse_error.
#
# @return the result of whichever parse_* method was selected
def parse_element
  if @scanner.check(DTD) && !@dtd_checked
    @dtd_checked = true
    return parse_dtd
  end

  return parse_comment if @scanner.check(COMMENT)
  return parse_tag if @scanner.check(TAG_START)
  return parse_error('Invalid HTML struct') if text_end?

  parse_text
end

#parse_prop_name ⇒ Object



113
114
115
# File 'lib/html/parser.rb', line 113

# Consumes an attribute name matching PROP_NAME.
#
# @return whatever #scan returns for the match
def parse_prop_name
  scan(PROP_NAME)
end

#parse_prop_value ⇒ Object



117
118
119
120
# File 'lib/html/parser.rb', line 117

# Consumes an attribute value matching PROP_VALUE and returns it without the
# surrounding quotes.
#
# PROP_VALUE has one capture group per form (single-quoted, double-quoted,
# unquoted). The three groups are concatenated, with nil groups contributing
# nothing.
#
# @return [String] the bare value; empty when no group captured anything
def parse_prop_value
  scan(PROP_VALUE)
  (1..3).map { |group| @scanner[group].to_s }.join
end

#parse_properties ⇒ Object



92
93
94
95
96
97
98
99
100
101
# File 'lib/html/parser.rb', line 92

# Collects attributes until prop_search_done? reports true.
#
# skip_empty is called once before the loop and again after every attempt.
# Falsy results from #parse_property are not added to the list.
#
# @return [Array] the parsed properties, in source order
def parse_properties
  collected = []
  skip_empty
  until prop_search_done?
    property = parse_property
    collected.push(property) if property
    skip_empty
  end
  collected
end

#parse_property ⇒ Object



103
104
105
106
107
108
109
110
111
# File 'lib/html/parser.rb', line 103

# Parses one attribute: a name, optionally followed by `=` and a value.
#
# When a value is present, the quote character at the start of the value (if
# there is one) is recorded as the separator. An attribute without `=` gets
# nil for both value and separator.
#
# @return [Property] built from name, value and separator
def parse_property
  prop_name = parse_prop_name
  return Property.new(prop_name, nil, nil) unless @scanner.check(/\s*=/)

  skip(/[=]/)
  quote = @scanner.check(/['"]/)
  prop_value = parse_prop_value
  Property.new(prop_name, prop_value, quote)
end

#parse_tag ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
# File 'lib/html/parser.rb', line 80

# Dispatches on the kind of tag at the scanner position, without consuming it.
#
# Patterns are tried in this order: DTD, SELF_CLOSE_TAG, TAG. If none matches,
# the problem is reported via parse_error.
#
# @return the result of the selected parse_* method, or of parse_error
def parse_tag
  return parse_dtd_tag if @scanner.check(DTD)
  return parse_self_ending_tag if @scanner.check(SELF_CLOSE_TAG)
  return parse_normal_tag if @scanner.check(TAG)

  parse_error('Invalid HTML struct')
end

#parse_text ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
# File 'lib/html/parser.rb', line 68

# Consumes plain text up to the point where text_end? becomes true.
#
# Each pass takes an optional literal `<` (one that did not end the text run)
# followed by a run of characters matching TEXT. A pass that contains a raw
# `<` or `>` is reported through parse_warn.
#
# The check looks only at the chunk consumed in the current pass. Checking the
# whole accumulated text would name the first offending character again on
# every later pass, so "a > b < c" would be reported as two unescaped `>`
# instead of one `>` and one `<`.
#
# @return [TextElement] wrapping all text consumed
def parse_text
  text = ''
  until text_end?
    chunk = ''
    chunk << '<' if @scanner.skip(/</)
    chunk << @scanner.scan(TEXT).to_s
    text << chunk

    # TODO: make this detection a rule
    unescaped = chunk[/<|>/]
    parse_warn "'#{unescaped}' not escaped" if unescaped
  end
  TextElement.new text
end