Class: TextTagIterator

Inherits:
Object show all
Defined in:
lib/carat/tagiter.rb

Overview

Tagiter

Simple but very useful HTML/XHTML cascading parser for those quick and dirty web page parse jobs.

Usage

# sample html
stext = <<-EOF
<body> This is a test...
  <sub> S1 </sub> <sub> S2 </sub>
  <DL>
    <DT> A1
    <DT> A2
    <DT> A3
  </DL>
  <DL>
    <DT> B1
    <DT> B2
    <DT> B3
  </DL>
  <NEST>
    <P ALIGN="R">TOP</P>
    <NEST>
      <P>SECOND</P>
      <OL>
        <LI>C1
        <LI>C2
        <LI>C3
        <LI>C4
      </OL>
    </NEST>
    <OL>
      <LI>D1
      <LI>D2
      <LI>D3
      <LI>D4
    </OL>
  </NEST>
</body>
EOF

# Wrap the sample markup in an iterator.
a = TextTagIterator.new(stext)
# Work inside the first <body> block.
a.first("body") do |y|
  # Second <DL> block: prints B1, B2, B3.
  y.nth("dl",2) do |dl|
    dl.enumtag("dt") do |t|
      puts t.text.strip
    end
  end
  # Outer <NEST> block.
  y.first("nest") do |n|
    # Prints the paragraph text followed by its attributes ("TOP align=R").
    n.first("p") do |c| 
      print c.text, ' '
      puts c.attributes.collect{ |k,v| "#{k}=#{v}" }
    # first/next return an iterator over the text that follows the matched
    # block, so each chained call continues searching from there.
    end.next("nest") do |m|
      # Inner <NEST>: prints SECOND, then C1..C4.
      m.first("p") do |c| 
        puts c.text
      end.next("ol") do |o|
        o.enumtag("li") do |i| puts i.text.strip end
      end
    # Back in the outer <NEST>, after the inner one: prints D1..D4.
    end.next("ol") do |o|
      o.enumtag("li") do |i| puts i.text.strip end
    end
  end
end
# Visit every <sub> block of the whole document: prints S1, S2.
a.each_block("sub") do |y|
  puts y.text.strip
end

produces

B1
B2
B3
TOP align=R
SECOND
C1
C2
C3
C4
D1
D2
D3
D4
S1
S2

Author

*ɂႷ <[email protected]>

Copyright (c) 2000 Ⴗ <[email protected]>

History

  • 2004/11/30 added to collection

  • 2000/09/18 made attribute name case-ignored.

  • 2000/06/06 added a new method : nth_tailer(tag,n).

    fixed regexp of tagnext method.
    
  • 2000/06/05 added new methods for information. tagexist?(tag) and tagnext.

    fixed bugs on return values of each_block
    
  • 2000/06/01 fixed nth method which sometimes crashes at the end-of-file.

    (reported by Matsui-san)
    
  • 2000/05/08 fixed enumtag which yields only half of texts in some conditions

  • 2000/04/27 regexp debugged in parse_attribute

  • 2000/03/12 started

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#attributes ⇒ Object (readonly)

Returns the value of attribute attributes.



117
118
119
# File 'lib/carat/tagiter.rb', line 117

# Read-only accessor: returns the current value of @attributes.
# NOTE(review): presumably the parsed attributes of the tag this iterator
# was built for (each_block/nth pass parse_attribute's result as the third
# constructor argument) — confirm against the constructor.
def attributes
  @attributes
end

#option ⇒ Object (readonly)

Returns the value of attribute option.



115
116
117
# File 'lib/carat/tagiter.rb', line 115

# Read-only accessor: returns the current value of @option.
# NOTE(review): none of the methods shown on this page assign or read
# @option; check the constructor for its meaning.
def option
  @option
end

#tag ⇒ Object (readonly)

Returns the value of attribute tag.



116
117
118
# File 'lib/carat/tagiter.rb', line 116

# Read-only accessor: returns the current value of @tag.
# NOTE(review): presumably the tag name handed to the constructor as its
# second argument by each_block/nth/enumtag — confirm against the constructor.
def tag
  @tag
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



114
115
116
# File 'lib/carat/tagiter.rb', line 114

# Read-only accessor: returns @text, the string the search methods of this
# class index into and slice sub-blocks from.
def text
  @text
end

Class Method Details

.[](aname) ⇒ Object



126
127
128
# File 'lib/carat/tagiter.rb', line 126

# Singleton override of #[] on the object held in @attributes: the requested
# name is lowercased before the lookup is delegated to the object's original
# #[] via super, so callers may pass attribute names in any case.
# NOTE(review): this only yields case-insensitive lookup if the keys are
# stored lowercased (presumably by parse_attribute) — confirm. The doc
# generator lists this under "Class Method Details", but it is defined on the
# @attributes object, not on the class.
def @attributes.[](aname)
  super aname.downcase
end

Instance Method Details

#collect(*arg) ⇒ Object



227
228
229
230
231
# File 'lib/carat/tagiter.rb', line 227

# Gathers every object yielded by each_block(*arg) into an Array.
#
# All arguments are forwarded unchanged to each_block, so any error raised
# there (e.g. when the tag is not found) propagates to the caller.
#
# Returns the Array of yielded objects, in the order they were yielded.
def collect(*arg)
  blocks = []
  each_block(*arg) { |blk| blocks << blk }
  blocks
end

#each_block(tag, closetag = nil) ⇒ Object

Raises:

  • (RuntimeError)


203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/carat/tagiter.rb', line 203

# Yields a new iterator (built with self.class.new) for every +tag+ block
# found in @text, scanning from the start of the text.
#
# tag      - name of the opening tag to look for.
# closetag - optional name of the closing tag; when given, find_closetag is
#            called as find_closetag(closetag, start, tag), otherwise as
#            find_closetag(tag, start).
#
# Each yielded object is constructed from the slice of @text running from the
# opening-tag position to the position reported by find_closetag (or to the
# end of the text when no closing tag is found), the tag name, and
# parse_attribute applied to the second value returned by find_opentag.
#
# Returns a new iterator built from the text after the ">" that follows the
# last closing position found.
# Raises RuntimeError when no opening +tag+ is found at all.
def each_block(tag, closetag = nil)
  resume_at = 0
  open_pos, raw_attrs = find_opentag(tag)
  raise RuntimeError, "tag(#{tag}) not found" unless open_pos

  while open_pos
    close_pos = closetag ? find_closetag(closetag, open_pos, tag) : find_closetag(tag, open_pos)
    close_pos ||= -1 # no closing tag: the block runs to the end of the text
    yield self.class.new(@text[open_pos..close_pos], tag, parse_attribute(raw_attrs))

    # An unterminated block swallowed the rest of the text; nothing left to scan.
    break if close_pos < 0

    # Continue after the ">" that ends the closing tag (or at end of text).
    resume_at = @text.index('>', close_pos + 1) || @text.length
    open_pos, raw_attrs = find_opentag(tag, resume_at)
  end

  self.class.new(text[resume_at + 1..-1])
end

#enumcollect(tag) ⇒ Object



243
244
245
246
247
# File 'lib/carat/tagiter.rb', line 243

# Gathers every object yielded by enumtag(tag) into an Array.
#
# Returns the Array of yielded objects, in the order they were yielded
# (empty when enumtag yields nothing).
def enumcollect(tag)
  items = []
  enumtag(tag) { |item| items << item }
  items
end

#enumtag(tag) ⇒ Object



233
234
235
236
237
238
239
240
241
# File 'lib/carat/tagiter.rb', line 233

# Yields a new iterator (built with self.class.new) for each +tag+ entry
# located by find_openenumtag, as used in the usage example for <DT>/<LI>
# items that have no closing tag.
#
# Each yielded object is constructed from the slice of @text running from the
# entry's start position to the position reported by find_closeenumtag (or to
# the end of the text when it reports none), the tag name, and
# parse_attribute applied to the second value returned by find_openenumtag.
#
# Unlike each_block, finding no entry at all is not an error: nothing is
# yielded. Returns nil.
def enumtag(tag)
  open_pos, raw_attrs = find_openenumtag(tag)
  while open_pos
    # The end of this entry is searched for just past its own start.
    close_pos = find_closeenumtag(tag, open_pos + 1) || -1
    yield self.class.new(@text[open_pos..close_pos], tag, parse_attribute(raw_attrs))
    # Look for the next entry, passing the current start as the search origin.
    open_pos, raw_attrs = find_openenumtag(tag, open_pos)
  end
end

#first(tag, *arg) ⇒ Object Also known as: next



200
# File 'lib/carat/tagiter.rb', line 200

# Shorthand for nth(tag, 1, *arg): yields the first +tag+ block to the given
# block and returns whatever nth returns, which is what makes chained calls
# such as first("p") { ... }.next("ol") { ... } possible.
def first(tag, *arg)
  nth(tag, 1, *arg) { |blk| yield blk }
end

#for_this {|_self| ... } ⇒ Object

Yields:

  • (_self)

Yield Parameters:



249
250
251
# File 'lib/carat/tagiter.rb', line 249

# Yields the receiver itself to the given block and returns the block's
# result (the yield is the method's last expression).
def for_this
  yield self
end

#get_first(*arg) ⇒ Object



254
# File 'lib/carat/tagiter.rb', line 254

# Non-block form of first: forwards all arguments to first and returns the
# object it yields instead of first's own return value.
# Returns nil if first never yields.
def get_first(*arg)
  found = nil
  first(*arg) { |blk| found = blk }
  found
end

#get_nth(*arg) ⇒ Object



253
# File 'lib/carat/tagiter.rb', line 253

# Non-block form of nth: forwards all arguments to nth and returns the
# object it yields instead of nth's own return value.
# Returns nil if nth never yields.
def get_nth(*arg)
  found = nil
  nth(*arg) { |blk| found = blk }
  found
end

#nth(tag, n, closetag = nil) {|self.class.new(text[s..e],tag,parse_attribute(d))| ... } ⇒ Object

Yields:

  • (self.class.new(text[s..e],tag,parse_attribute(d)))

Raises:

  • (RuntimeError)


177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/carat/tagiter.rb', line 177

# Locates the n-th +tag+ block in the text and yields a new iterator (built
# with self.class.new) for it.
#
# tag      - name of the opening tag to look for.
# n        - 1-based ordinal of the block wanted; must not be nil/false.
# closetag - optional name of the closing tag; when given, find_closetag is
#            called as find_closetag(closetag, start, tag), otherwise as
#            find_closetag(tag, start).
#
# Blocks are walked one after another: each search for the next opening tag
# starts at the ">" following the previous block's closing position.
#
# Returns a new iterator built from the text after the ">" that follows the
# n-th block's closing position.
# Raises RuntimeError when n is not given or when fewer than n blocks exist.
def nth(tag, n, closetag = nil)
  raise RuntimeError, "nth: number not specified" unless n

  # Declared before the loop so the block below assigns to these locals
  # instead of creating block-scoped ones.
  resume_at = 0
  open_pos = close_pos = 0
  raw_attrs = nil

  1.upto(n) do |count|
    open_pos, raw_attrs = find_opentag(tag, resume_at)
    raise RuntimeError, "tag(#{tag}) not found at(#{count})" unless open_pos

    close_pos = closetag ? find_closetag(closetag, open_pos, tag) : find_closetag(tag, open_pos)
    close_pos ||= -1 # no closing tag: the block runs to the end of the text
    # Fall back to the text length so the final slice stays in range at EOF.
    resume_at = @text.index('>', close_pos + 1) || @text.length
  end

  yield self.class.new(text[open_pos..close_pos], tag, parse_attribute(raw_attrs))
  self.class.new(text[resume_at + 1..-1])
end

#nth_tailer(tag, n) ⇒ Object



269
270
271
# File 'lib/carat/tagiter.rb', line 269

# Returns what nth(tag, n) returns, ignoring the n-th block itself by passing
# an empty block. Per nth, that return value is a new iterator built from the
# text following the n-th +tag+ block.
def nth_tailer(tag,n)
  nth(tag,n) do end
end

#tagexist?(tag, st = 0) ⇒ Boolean

Returns:

  • (Boolean)


256
257
258
259
# File 'lib/carat/tagiter.rb', line 256

# Reports whether find_element(tag, st) finds anything.
#
# tag - tag name handed to find_element.
# st  - second argument handed to find_element (default 0).
#       NOTE(review): presumably the offset to start searching from — confirm
#       against find_element.
#
# Returns strictly true or false, never the raw find_element result.
def tagexist?(tag, st = 0)
  find_element(tag, st) ? true : false
end

#tagnext ⇒ Object



261
262
263
264
265
266
267
# File 'lib/carat/tagiter.rb', line 261

# Returns the name of the first tag in @text, or nil when the text contains
# no complete "<...>" tag.
#
# The name is the first run of characters inside the tag that are neither
# "<", ">" nor whitespace, so a closing tag such as "</p>" yields "/p" and an
# empty "<>" yields nil.
def tagnext
  open_pos = @text.index("<")
  return nil unless open_pos

  close_pos = @text.index(">", open_pos)
  # An opening "<" without a matching ">" is not a tag. The original guard
  # re-tested the opening position here, letting a nil end reach the slice
  # below (ArgumentError on old Rubies, a bogus name on endless-range Rubies).
  return nil unless close_pos

  @text[open_pos..close_pos].scan(/[^<>\s]+/)[0]
end