Class: TextTagIterator
Overview
Tagiter
Simple but very useful HTML/XHTML cascading parser for those quick and dirty web page parse jobs.
Usage
# Sample HTML: two <sub> spans, two <DL> lists with unclosed <DT> items,
# and a <NEST> block that contains a second <NEST> block.
stext = <<-EOF
<body> This is a test...
<sub> S1 </sub> <sub> S2 </sub>
<DL>
<DT> A1
<DT> A2
<DT> A3
</DL>
<DL>
<DT> B1
<DT> B2
<DT> B3
</DL>
<NEST>
<P ALIGN="R">TOP</P>
<NEST>
<P>SECOND</P>
<OL>
<LI>C1
<LI>C2
<LI>C3
<LI>C4
</OL>
</NEST>
<OL>
<LI>D1
<LI>D2
<LI>D3
<LI>D4
</OL>
</NEST>
</body>
EOF

a = TextTagIterator.new(stext)

a.first("body") do |y|
  # Second <DL> only: print each <DT> entry.
  y.nth("dl", 2) do |dl|
    dl.enumtag("dt") do |t|
      puts t.text.strip
    end
  end

  # Outer <NEST>: each `end.next(...)` continues on the object returned by
  # the preceding call, i.e. on the text that follows the block just handled.
  y.first("nest") do |n|
    n.first("p") do |c|
      print c.text, ' '
      puts c.attributes.collect { |k, v| "#{k}=#{v}" }
    end.next("nest") do |m|
      # Inner <NEST>: its paragraph, then its ordered list.
      m.first("p") do |c|
        puts c.text
      end.next("ol") do |o|
        o.enumtag("li") { |i| puts i.text.strip }
      end
    end.next("ol") do |o|
      # Ordered list that follows the inner <NEST>.
      o.enumtag("li") { |i| puts i.text.strip }
    end
  end
end

# Every <sub> block in the whole document.
a.each_block("sub") do |y|
  puts y.text.strip
end
produces
B1
B2
B3
TOP align=R
SECOND
C1
C2
C3
C4
D1
D2
D3
D4
S1
S2
Author
*ɂႷ <[email protected]>
Legal
Copyright (c) 2000 Ⴗ <[email protected]>
History
- 2004/11/30 added to collection
- 2000/09/18 made attribute names case-insensitive.
- 2000/06/06 added a new method: nth_tailer(tag,n).
  fixed regexp of tagnext method.
- 2000/06/05 added new methods for information: tagexist?(tag) and tagnext.
  fixed bugs in the return values of each_block.
- 2000/06/01 fixed nth method, which sometimes crashed at the end-of-file.
  (reported by Matsui-san)
- 2000/05/08 fixed enumtag, which yielded only half of the texts under some conditions.
- 2000/04/27 debugged regexp in parse_attribute.
- 2000/03/12 started
Instance Attribute Summary
-
#attributes ⇒ Object
readonly
Returns the value of attribute attributes.
-
#option ⇒ Object
readonly
Returns the value of attribute option.
-
#tag ⇒ Object
readonly
Returns the value of attribute tag.
-
#text ⇒ Object
readonly
Returns the value of attribute text.
Class Method Summary
Instance Method Summary
- #collect(*arg) ⇒ Object
- #each_block(tag, closetag = nil) ⇒ Object
- #enumcollect(tag) ⇒ Object
- #enumtag(tag) ⇒ Object
- #first(tag, *arg) ⇒ Object (also: #next)
- #for_this {|_self| ... } ⇒ Object
- #get_first(*arg) ⇒ Object
- #get_nth(*arg) ⇒ Object
- #nth(tag, n, closetag = nil) {|self.class.new(text[s..e],tag,parse_attribute(d))| ... } ⇒ Object
- #nth_tailer(tag, n) ⇒ Object
- #tagexist?(tag, st = 0) ⇒ Boolean
- #tagnext ⇒ Object
Instance Attribute Details
#attributes ⇒ Object (readonly)
Returns the value of attribute attributes.
117 118 119 |
# File 'lib/carat/tagiter.rb', line 117
# Reader for the @attributes instance variable.
# NOTE(review): presumably the attribute table produced by parse_attribute
# for this tag — confirm against the constructor.
#
# @return [Object] the current value of @attributes
def attributes
  @attributes
end
#option ⇒ Object (readonly)
Returns the value of attribute option.
115 116 117 |
# File 'lib/carat/tagiter.rb', line 115
# Reader for the @option instance variable.
# NOTE(review): nothing in the visible methods assigns or reads @option;
# its meaning has to be confirmed against the constructor.
#
# @return [Object] the current value of @option
def option
  @option
end
#tag ⇒ Object (readonly)
Returns the value of attribute tag.
116 117 118 |
# File 'lib/carat/tagiter.rb', line 116
# Reader for the @tag instance variable.
# NOTE(review): presumably the tag name this iterator was created for
# (the second constructor argument used by #nth / #each_block) — confirm.
#
# @return [Object] the current value of @tag
def tag
  @tag
end
#text ⇒ Object (readonly)
Returns the value of attribute text.
114 115 116 |
# File 'lib/carat/tagiter.rb', line 114
# Reader for the @text instance variable, the string that the search
# methods of this object (#nth, #each_block, #enumtag, #tagnext) index into.
#
# @return [Object] the current value of @text
def text
  @text
end
Class Method Details
.[](aname) ⇒ Object
126 127 128 |
# File 'lib/carat/tagiter.rb', line 126
# Singleton override of #[] on the object held in @attributes: the requested
# name is lower-cased and the lookup is then delegated to the object's
# original #[] via super.
# NOTE(review): this `def` only makes sense once @attributes has been
# assigned; presumably it sits inside the constructor — confirm.
#
# @param aname [String] attribute name in any letter case
# @return [Object] whatever the underlying #[] returns for the lower-cased name
def @attributes.[](aname)
  super(aname.downcase)
end
Instance Method Details
#collect(*arg) ⇒ Object
227 228 229 230 231 |
# File 'lib/carat/tagiter.rb', line 227
# Gathers every object yielded by #each_block into an array.
#
# @param arg arguments forwarded unchanged to #each_block
# @return [Array] the yielded objects, in the order #each_block produced them
# @raise [RuntimeError] propagated from #each_block when the tag is not found
def collect(*arg)
  blocks = []
  each_block(*arg) { |blk| blocks << blk }
  blocks
end
#each_block(tag, closetag = nil) ⇒ Object
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
# File 'lib/carat/tagiter.rb', line 203
# Yields a new iterator (self.class.new) for each occurrence of +tag+ found
# in @text, scanning forward, and finally returns a new iterator over the
# text that follows the last processed block.
#
# @param tag [String] tag to search for
# @param closetag [String, nil] when given, find_closetag is called as
#   find_closetag(closetag, start, tag) instead of find_closetag(tag, start)
# @yield [block] a new object built from the block text, +tag+ and the
#   result of parse_attribute
# @return [Object] new iterator over the remaining text (empty text when
#   nothing remains)
# @raise [RuntimeError] when find_opentag reports no occurrence of +tag+
def each_block(tag, closetag = nil)
  t = 0
  s, d = find_opentag(tag)
  raise RuntimeError, "tag(#{tag}) not found" unless s

  while s
    e = if closetag
          find_closetag(closetag, s, tag)
        else
          find_closetag(tag, s)
        end
    e ||= -1 # no closing position reported: slice runs to the end of @text
    yield self.class.new(@text[s..e], tag, parse_attribute(d))
    break if e < 0 # nothing can follow a block that runs to the end

    # Resume the search after the '>' that follows the block's end.
    t = @text.index('>', e + 1) || @text.length
    s, d = find_opentag(tag, t)
  end

  # String#[] returns nil when the start lies beyond the end of the string
  # (t == @text.length); hand the constructor an empty string instead of nil.
  self.class.new(text[t + 1..-1] || '')
end
#enumcollect(tag) ⇒ Object
243 244 245 246 247 |
# File 'lib/carat/tagiter.rb', line 243
# Gathers every object yielded by #enumtag into an array.
#
# @param tag [String] tag name forwarded to #enumtag
# @return [Array] the yielded objects, in the order #enumtag produced them
def enumcollect(tag)
  items = []
  enumtag(tag) { |item| items << item }
  items
end
#enumtag(tag) ⇒ Object
233 234 235 236 237 238 239 240 241 |
# File 'lib/carat/tagiter.rb', line 233
# Yields a new iterator (self.class.new) for each occurrence of +tag+ located
# by find_openenumtag, e.g. the unclosed <DT>/<LI> items of the usage example.
#
# @param tag [String] tag to enumerate
# @yield [item] a new object built from the item text, +tag+ and the result
#   of parse_attribute
# @return [nil]
def enumtag(tag)
  s, d = find_openenumtag(tag)
  while s
    # No end position reported: the item runs to the end of @text.
    e = find_closeenumtag(tag, s + 1) || -1
    yield self.class.new(@text[s..e], tag, parse_attribute(d))
    s, d = find_openenumtag(tag, s)
  end
end
#first(tag, *arg) ⇒ Object Also known as: next
200 |
# File 'lib/carat/tagiter.rb', line 200
# Shorthand for #nth with n = 1: yields the first block for +tag+.
#
# @param tag [String] tag to search for
# @param arg extra arguments passed through to #nth after the count
# @yield [block] the object yielded by #nth
# @return [Object] whatever #nth returns
def first(tag, *arg)
  nth(tag, 1, *arg) { |blk| yield blk }
end
#for_this {|_self| ... } ⇒ Object
249 250 251 |
# File 'lib/carat/tagiter.rb', line 249
# Yields the receiver itself to the given block.
#
# @yield [_self] the receiver
# @return [Object] the value of the block
def for_this
  yield self
end
#get_first(*arg) ⇒ Object
254 |
# File 'lib/carat/tagiter.rb', line 254
# Non-block form of #first: returns the object #first would have yielded.
#
# @param arg arguments forwarded unchanged to #first
# @return [Object, nil] the yielded object, or nil if #first yielded nothing
def get_first(*arg)
  found = nil
  first(*arg) { |blk| found = blk }
  found
end
#get_nth(*arg) ⇒ Object
253 |
# File 'lib/carat/tagiter.rb', line 253
# Non-block form of #nth: returns the object #nth would have yielded.
#
# @param arg arguments forwarded unchanged to #nth
# @return [Object, nil] the yielded object, or nil if #nth yielded nothing
def get_nth(*arg)
  found = nil
  nth(*arg) { |blk| found = blk }
  found
end
#nth(tag, n, closetag = nil) {|self.class.new(text[s..e],tag,parse_attribute(d))| ... } ⇒ Object
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
# File 'lib/carat/tagiter.rb', line 177
# Locates the n-th occurrence of +tag+ in @text, yields a new iterator
# (self.class.new) for it, and returns a new iterator over the text that
# follows it.
#
# @param tag [String] tag to search for
# @param n [Integer] 1-based occurrence to select
# @param closetag [String, nil] when given, find_closetag is called as
#   find_closetag(closetag, start, tag) instead of find_closetag(tag, start)
# @yield [block] a new object built from the block text, +tag+ and the
#   result of parse_attribute
# @return [Object] new iterator over the remaining text (empty text when
#   nothing remains)
# @raise [RuntimeError] when +n+ is nil/false, or when fewer than +n+
#   occurrences of +tag+ are found
def nth(tag, n, closetag = nil)
  raise RuntimeError, "nth: number not specified" unless n

  t = 0
  s = e = 0 # declared here so they outlive the block below
  d = nil
  1.upto(n) do |i|
    s, d = find_opentag(tag, t)
    raise RuntimeError, "tag(#{tag}) not found at(#{i})" unless s

    e = if closetag
          find_closetag(closetag, s, tag)
        else
          find_closetag(tag, s)
        end
    e ||= -1 # no closing position reported: slice runs to the end of @text
    # Next search resumes after the '>' that follows the block's end.
    # NOTE(review): with e == -1 this searches from offset 0 — confirm intended.
    t = @text.index('>', e + 1) || @text.length
  end

  yield self.class.new(text[s..e], tag, parse_attribute(d))

  # String#[] returns nil when the start lies beyond the end of the string
  # (t == @text.length); hand the constructor an empty string instead of nil.
  self.class.new(text[t + 1..-1] || '')
end
#nth_tailer(tag, n) ⇒ Object
269 270 271 |
# File 'lib/carat/tagiter.rb', line 269
# Calls #nth with an empty block, so only #nth's return value is used.
#
# @param tag [String] tag to search for
# @param n [Integer] occurrence passed to #nth
# @return [Object] whatever #nth returns
def nth_tailer(tag, n)
  nth(tag, n) {}
end
#tagexist?(tag, st = 0) ⇒ Boolean
256 257 258 259 |
# File 'lib/carat/tagiter.rb', line 256
# Reports whether find_element locates +tag+ when called with +st+.
#
# @param tag [String] tag to look for
# @param st [Integer] second argument handed to find_element
#   (presumably the offset where the search starts — confirm)
# @return [Boolean] true when find_element returns a truthy value
def tagexist?(tag, st = 0)
  find_element(tag, st) ? true : false
end
#tagnext ⇒ Object
261 262 263 264 265 266 267 |
# File 'lib/carat/tagiter.rb', line 261
# Returns the name of the first tag in @text: the first run of characters
# other than '<', '>' and whitespace between the first '<' and the next '>'.
#
# @return [String, nil] the tag name (a closing tag yields e.g. "/P"), or nil
#   when @text has no '<', no '>' after it, or nothing between them
def tagnext
  s = @text.index('<')
  return nil unless s

  e = @text.index('>', s)
  # The original re-tested s here, so an unterminated '<' was never rejected.
  return nil unless e

  @text[s..e].scan(/[^<>\s]+/)[0]
end