Class: XRay::HTML::Parser
Constant Summary
collapse
- TEXT =
/[^<]+/m
- PROP_NAME =
%r/\w[-:\w]*/m
- PROP_VALUE =
%r/'([^']*)'|"([^"]*)"|([^\s>]+)/m
- PROP =
%r/#{PROP_NAME}\s*(?:=\s*#{PROP_VALUE})?/m
- TAG_NAME =
/\w[^>\(\)\/\s]*/
- TAG_START =
%r/<(#{TAG_NAME})/m
- TAG_END =
%r/<\/#{TAG_NAME}\s*>/m
- TAG =
%r/#{TAG_START}(\s+#{PROP})*\s*>/m
- SELF_CLOSE_TAG =
%r/#{TAG_START}(\s+#{PROP})*\s*\/>/m
- DTD =
/\s*<!(doctype)\s+(.*?)>/im
- COMMENT =
/<!--(.*?)-->/m
Class Method Summary
collapse
Instance Method Summary
collapse
Methods inherited from BaseParser
#batch, #check, #eos?, #initialize, #raw_scan, #reset, #scan, #skip, #skip_empty, #to_s
Class Method Details
.parse(src) {|doc| ... } ⇒ Object
10
11
12
13
14
15
|
# File 'lib/html/parser.rb', line 10
# Convenience entry point: builds a parser for +src+ and runs it.
#
# @param src the source handed unchanged to the parser constructor
# @yield [doc] the parse result, only when a block is supplied
# @return the value returned by the instance-level #parse
def self.parse(src, &block)
  document = new(src).parse
  # The block is optional; the parse result is returned either way,
  # regardless of what the block itself evaluates to.
  block.call(document) if block
  document
end
|
Instance Method Details
#parse ⇒ Object
29
30
31
|
# File 'lib/html/parser.rb', line 29
# Runs the parser; this is a thin alias for #parse_doc.
#
# @return whatever #parse_doc returns
def parse
  parse_doc()
end
|
#parse_doc ⇒ Object
33
34
35
36
37
38
39
40
41
|
# File 'lib/html/parser.rb', line 33
# Collects every element produced by repeatedly running #parse_element
# (via the inherited #batch) and picks the result shape from the count.
#
# @return [nil] when no element was collected
# @return the single element when exactly one was collected
# @return [::XRay::HTML::Document] wrapping all elements otherwise
def parse_doc
  elements = batch(:parse_element)
  count = elements.size

  return if count.zero?
  return elements[0] if count == 1

  ::XRay::HTML::Document.new(elements)
end
|
#parse_dtd ⇒ Object
58
59
60
61
|
# File 'lib/html/parser.rb', line 58
# Consumes a DTD declaration at the current position and wraps it in a
# DTDElement.
#
# The DTD pattern has two capture groups: group 1 is the declaration
# keyword and group 2 is the text that follows it.
#
# @return [DTDElement] built from (group 2, group 1, position of the
#   scanned node)
def parse_dtd
  scanned = scan(DTD)
  declaration = @scanner[2]
  keyword = @scanner[1]
  DTDElement.new(declaration, keyword, scanned.position)
end
|
#parse_element ⇒ Object
43
44
45
46
47
48
49
50
51
52
53
54
55
56
|
# File 'lib/html/parser.rb', line 43
# Parses the next top-level element at the current scanner position.
#
# Dispatch order:
# 1. a DTD declaration, honoured only the first time one is seen
#    (tracked through @dtd_checked);
# 2. an HTML comment;
# 3. anything that looks like an opening tag;
# 4. plain text, as long as #text_end? is false;
# 5. otherwise a parse error is reported.
#
# @return the element produced by the selected sub-parser, or whatever
#   #parse_error returns when nothing matches
def parse_element
  if @scanner.check(DTD) && !@dtd_checked
    @dtd_checked = true
    parse_dtd
  elsif @scanner.check(COMMENT)
    # This branch used to be empty: the comment was detected but never
    # consumed, so the method returned nil with the scanner still sitting
    # on "<!--".
    # NOTE(review): parse_comment is not visible in this listing -- confirm
    # it is defined on the class (the gutter numbering suggests it is).
    parse_comment
  elsif @scanner.check(TAG_START)
    parse_tag
  elsif !text_end?
    parse_text
  else
    parse_error('Invalid HTML struct')
  end
end
|
#parse_prop_name ⇒ Object
113
114
115
|
# File 'lib/html/parser.rb', line 113
# Scans an attribute name (PROP_NAME) at the current position.
#
# @return whatever the inherited #scan returns for the match
def parse_prop_name
  scan(PROP_NAME)
end
|
#parse_prop_value ⇒ Object
117
118
119
120
|
# File 'lib/html/parser.rb', line 117
# Scans an attribute value (PROP_VALUE) and returns its text without the
# surrounding quotes.
#
# PROP_VALUE has three alternative capture groups (single-quoted,
# double-quoted, unquoted); only one can match, so concatenating all three
# yields the one that did. Unmatched groups are nil and interpolate as "".
#
# @return [String] the value text, or "" when nothing was captured
def parse_prop_value
  scan(PROP_VALUE)
  single_quoted = @scanner[1]
  double_quoted = @scanner[2]
  unquoted = @scanner[3]
  "#{single_quoted}#{double_quoted}#{unquoted}"
end
|
#parse_properties ⇒ Object
92
93
94
95
96
97
98
99
100
101
|
# File 'lib/html/parser.rb', line 92
# Reads attributes one after another until #prop_search_done? reports
# that the attribute list is finished.
#
# Whitespace is skipped (via #skip_empty) before the first attribute and
# after every attempt. Falsy results from #parse_property are dropped.
#
# @return [Array] the collected attributes, in source order
def parse_properties
  collected = []
  skip_empty

  while !prop_search_done?
    attribute = parse_property
    collected.push(attribute) if attribute
    skip_empty
  end

  collected
end
|
#parse_property ⇒ Object
103
104
105
106
107
108
109
110
111
|
# File 'lib/html/parser.rb', line 103
# Parses one attribute: a name, optionally followed by "=" and a value.
#
# @return [Property] built from (name, value, quote). Value and quote are
#   nil for a bare attribute; quote is also nil for an unquoted value.
def parse_property
  attr_name = parse_prop_name

  # Bare attribute: no "=" follows the name.
  return Property.new(attr_name, nil, nil) unless @scanner.check(/\s*=/)

  skip(/[=]/)
  # Peek at the quote character (if any) before the value parser eats it.
  quote = @scanner.check(/['"]/)
  attr_value = parse_prop_value
  Property.new(attr_name, attr_value, quote)
end
|
#parse_tag ⇒ Object
80
81
82
83
84
85
86
87
88
89
90
|
# File 'lib/html/parser.rb', line 80
# Chooses the tag sub-parser that fits the markup at the current position.
#
# Patterns are tried in this order: DTD, SELF_CLOSE_TAG, TAG. The first
# one that matches decides which sub-parser runs. If none matches, a
# parse error is reported.
#
# @return the result of the chosen sub-parser, or of #parse_error
def parse_tag
  return parse_dtd_tag if @scanner.check(DTD)
  return parse_self_ending_tag if @scanner.check(SELF_CLOSE_TAG)
  return parse_normal_tag if @scanner.check(TAG)

  parse_error('Invalid HTML struct')
end
|
#parse_text ⇒ Object
68
69
70
71
72
73
74
75
76
77
78
|
# File 'lib/html/parser.rb', line 68
# Accumulates a run of character data until #text_end? becomes true.
#
# On each pass a literal "<" at the current position is kept as text,
# followed by everything up to the next "<" (the TEXT pattern). Whenever
# the accumulated text contains "<" or ">", a warning is raised through
# #parse_warn naming the first such character.
#
# @return [TextElement] wrapping the accumulated text
def parse_text
  buffer = ''

  while !text_end?
    # A "<" that did not end the text run is treated as literal content.
    buffer << '<' if @scanner.skip(/</)
    # scan returns nil when nothing matches; to_s turns that into "".
    buffer << @scanner.scan(TEXT).to_s

    if buffer =~ /<|>/
      parse_warn("'#{Regexp.last_match}' not escaped")
    end
  end

  TextElement.new(buffer)
end
|