Class: TextTagIterator

Inherits:

Object

Object
TextTagIterator

show all

Defined in: lib/carat/tagiter.rb

Overview

Tagiter

Simple but very useful HTML/XHTML cascading parser for those quick and dirty web page parse jobs.

Usage

# Sample HTML used by the walkthrough below. DT and LI items are deliberately
# left unclosed; they are read with enumtag rather than first/nth.
stext = <<-EOF
<body> This is a test...
  <sub> S1 </sub> <sub> S2 </sub>
  <DL>
    <DT> A1
    <DT> A2
    <DT> A3
  </DL>
  <DL>
    <DT> B1
    <DT> B2
    <DT> B3
  </DL>
  <NEST>
    <P ALIGN="R">TOP</P>
    <NEST>
      <P>SECOND</P>
      <OL>
        <LI>C1
        <LI>C2
        <LI>C3
        <LI>C4
      </OL>
    </NEST>
    <OL>
      <LI>D1
      <LI>D2
      <LI>D3
      <LI>D4
    </OL>
  </NEST>
</body>
EOF

# Wrap the whole document, then descend into the first <body> block.
a = TextTagIterator.new(stext)
a.first("body") do |y|
  # Second <DL> block only: prints B1, B2, B3.
  y.nth("dl",2) do |dl|
    dl.enumtag("dt") do |t|
      puts t.text.strip
    end
  end
  # Outer <NEST> block.
  y.first("nest") do |n|
    # First <P> inside it: prints its text and attributes ("TOP align=R").
    n.first("p") do |c| 
      print c.text, ' '
      puts c.attributes.collect{ |k,v| "#{k}=#{v}" }
    end.next("nest") do |m|
      # first/next return an iterator over the text after the matched block,
      # so this chained call continues after </P> and finds the inner <NEST>.
      m.first("p") do |c| 
        puts c.text
      end.next("ol") do |o|
        # <OL> following the inner <P>: prints C1..C4.
        o.enumtag("li") do |i| puts i.text.strip end
      end
    end.next("ol") do |o|
      # <OL> following the inner </NEST>: prints D1..D4.
      o.enumtag("li") do |i| puts i.text.strip end
    end
  end
end
# Visit every <sub> block in the document: prints S1, S2.
a.each_block("sub") do |y|
  puts y.text.strip
end

produces

B1
B2
B3
TOP align=R
SECOND
C1
C2
C3
C4
D1
D2
D3
D4
S1
S2

Author

*ɂႷ <[email protected]>

Legal

Copyright (c) 2000 Ⴗ <[email protected]>

History

2004/11/30 added to collection
2000/09/18 made attribute names case-insensitive.
2000/06/06 added a new method: nth_tailer(tag,n).
           fixed regexp of tagnext method.
2000/06/05 added new methods for information: tagexist?(tag) and tagnext.
           fixed bugs in the return values of each_block.
2000/06/01 fixed nth method, which sometimes crashed at the end-of-file
           (reported by Matsui-san).
2000/05/08 fixed enumtag, which yielded only half of the texts in some conditions.
2000/04/27 debugged regexp in parse_attribute.
2000/03/12 started

Instance Attribute Summary collapse

#attributes ⇒ Object readonly

Returns the value of attribute attributes.
#option ⇒ Object readonly

Returns the value of attribute option.
#tag ⇒ Object readonly

Returns the value of attribute tag.
#text ⇒ Object readonly

Returns the value of attribute text.

Class Method Summary collapse

.[](aname) ⇒ Object

Instance Method Summary collapse

Instance Attribute Details

#attributes ⇒ `Object` (readonly)

Returns the value of attribute attributes.



117
118
119

# File 'lib/carat/tagiter.rb', line 117

# Reader for @attributes.
# NOTE(review): presumably the parse_attribute result handed to the constructor
# by nth/each_block/enumtag — confirm against initialize.
def attributes
  @attributes
end

#option ⇒ `Object` (readonly)

Returns the value of attribute option.



115
116
117

# File 'lib/carat/tagiter.rb', line 115

# Reader for @option.
# NOTE(review): nothing on this page shows where @option is assigned or what it
# holds — check initialize.
def option
  @option
end

#tag ⇒ `Object` (readonly)

Returns the value of attribute tag.



116
117
118

# File 'lib/carat/tagiter.rb', line 116

# Reader for @tag.
# NOTE(review): presumably the tag name passed as the constructor's second
# argument by nth/each_block/enumtag — confirm against initialize.
def tag
  @tag
end

#text ⇒ `Object` (readonly)

Returns the value of attribute text.



114
115
116

# File 'lib/carat/tagiter.rb', line 114

# Reader for @text, the markup string this iterator searches and slices.
def text
  @text
end

Class Method Details

.[](aname) ⇒ `Object`



126
127
128

# File 'lib/carat/tagiter.rb', line 126

# Singleton override of #[] on the object held in @attributes: the requested
# name is lower-cased and then passed to that object's regular #[] via super.
# Assumes the keys were stored lower-cased — TODO confirm in parse_attribute.
# NOTE(review): YARD files this under class methods, but the receiver is the
# @attributes instance variable; presumably this sits inside initialize.
def @attributes.[](aname)
  super aname.downcase
end

Instance Method Details

#collect(*arg) ⇒ `Object`

# File 'lib/carat/tagiter.rb', line 227

# Gathers every object yielded by each_block into an array.
#
# @param arg arguments forwarded unchanged to each_block (tag, optional closetag)
# @return [Array] the yielded blocks, in the order each_block produced them
def collect(*arg)
  blocks = []
  each_block(*arg) { |block| blocks << block }
  blocks
end

#each_block(tag, closetag = nil) ⇒ `Object`

Raises:

(RuntimeError)

# File 'lib/carat/tagiter.rb', line 203

# Yields a new iterator for every +tag+ block found in @text, scanning forward.
#
# find_opentag supplies a start position plus the raw attribute source, and
# find_closetag supplies the end position; when no end is found the block runs
# to the end of the text (index -1) and scanning stops after yielding it.
#
# @param tag the tag to look for
# @param closetag optional alternative closing tag; when given it is passed to
#   find_closetag together with +tag+
# @yield [block] an instance of this class built from the block's text, the tag
#   and the parsed attributes
# @return an instance of this class wrapping the text after the last '>' that
#   was scanned past
# @raise [RuntimeError] when no opening +tag+ is found at all
def each_block(tag, closetag = nil)
  resume_at = 0
  open_pos, attr_src = find_opentag(tag)
  raise RuntimeError, "tag(#{tag}) not found" unless open_pos

  while open_pos
    close_pos = closetag ? find_closetag(closetag, open_pos, tag) : find_closetag(tag, open_pos)
    close_pos ||= -1
    yield self.class.new(@text[open_pos..close_pos], tag, parse_attribute(attr_src))

    # An unterminated block swallowed the rest of the text: nothing left to scan.
    break if close_pos < 0

    # Continue from the '>' that ends the closing tag (or from the end of text).
    resume_at = @text.index('>', close_pos + 1) || @text.length
    open_pos, attr_src = find_opentag(tag, resume_at)
  end

  self.class.new(text[resume_at + 1..-1])
end

#enumcollect(tag) ⇒ `Object`

# File 'lib/carat/tagiter.rb', line 243

# Gathers every object yielded by enumtag(tag) into an array.
#
# @param tag the tag passed to enumtag
# @return [Array] the yielded items, in the order enumtag produced them
def enumcollect(tag)
  items = []
  enumtag(tag) { |item| items << item }
  items
end

#enumtag(tag) ⇒ `Object`

# File 'lib/carat/tagiter.rb', line 233

# Yields a new iterator for each +tag+ item found via find_openenumtag.
#
# Each item runs from the position reported by find_openenumtag to the position
# reported by find_closeenumtag (searched from one past the start); when no end
# is reported the item extends to the end of the text. The next search starts
# from the current item's start position.
#
# @param tag the tag to enumerate
# @yield [item] an instance of this class built from the item's text, the tag
#   and the parsed attributes
# @return [nil]
def enumtag(tag)
  start_pos, attr_src = find_openenumtag(tag)
  while start_pos
    end_pos = find_closeenumtag(tag, start_pos + 1) || -1
    yield self.class.new(@text[start_pos..end_pos], tag, parse_attribute(attr_src))
    start_pos, attr_src = find_openenumtag(tag, start_pos)
  end
end

#first(tag, *arg) ⇒ `Object` Also known as: next

# File 'lib/carat/tagiter.rb', line 200

# Shorthand for nth(tag, 1, ...): yields the first +tag+ block.
#
# @param tag the tag to look for
# @param arg extra arguments forwarded unchanged to nth (e.g. a closetag)
# @yield [f] the block object yielded by nth
# @return whatever nth returns
def first(tag, *arg)
  nth(tag, 1, *arg) { |f| yield f }
end

#for_this {|_self| ... } ⇒ `Object`

Yields:

(_self)

Yield Parameters:

_self (TextTagIterator) —

the object that the method was called on



249
250
251

# File 'lib/carat/tagiter.rb', line 249

# Yields the receiver itself to the given block and returns the block's value.
def for_this
  yield self
end

#get_first(*arg) ⇒ `Object`

# File 'lib/carat/tagiter.rb', line 254

# Non-block form of first: returns the object that first would have yielded.
#
# @param arg arguments forwarded unchanged to first (tag, optional closetag)
# @return the yielded block object (nil if first never yields)
def get_first(*arg)
  found = nil
  first(*arg) { |bl| found = bl }
  found
end

#get_nth(*arg) ⇒ `Object`

# File 'lib/carat/tagiter.rb', line 253

# Non-block form of nth: returns the object that nth would have yielded.
#
# @param arg arguments forwarded unchanged to nth (tag, n, optional closetag)
# @return the yielded block object (nil if nth never yields)
def get_nth(*arg)
  found = nil
  nth(*arg) { |bl| found = bl }
  found
end

#nth(tag, n, closetag = nil) {|self.class.new(text[s..e],tag,parse_attribute(d))| ... } ⇒ `Object`

Yields:

(self.class.new(text[s..e],tag,parse_attribute(d)))

Raises:

(RuntimeError)

# File 'lib/carat/tagiter.rb', line 177

# Yields the n-th +tag+ block, counting from 1, and returns the remainder.
#
# The text is scanned n times: each pass finds the next opening tag from the
# current resume position, finds its end, and moves the resume position to the
# next '>' after that end (or to the end of the text).
#
# @param tag the tag to look for
# @param n which occurrence to select
# @param closetag optional alternative closing tag; when given it is passed to
#   find_closetag together with +tag+
# @yield [block] an instance of this class built from the selected block's
#   text, the tag and the parsed attributes
# @return an instance of this class wrapping the text after the resume position
# @raise [RuntimeError] when n is nil/false, or when fewer than n opening tags
#   are found
def nth(tag, n, closetag = nil)
  raise RuntimeError, "nth: number not specified" unless n

  resume_at = 0
  # Declared outside the block so the values assigned inside it remain visible.
  open_pos = close_pos = 0
  attr_src = nil

  1.upto(n) do |ordinal|
    open_pos, attr_src = find_opentag(tag, resume_at)
    raise RuntimeError, "tag(#{tag}) not found at(#{ordinal})" unless open_pos

    close_pos = closetag ? find_closetag(closetag, open_pos, tag) : find_closetag(tag, open_pos)
    close_pos ||= -1
    resume_at = @text.index('>', close_pos + 1) || @text.length
  end

  yield self.class.new(text[open_pos..close_pos], tag, parse_attribute(attr_src))
  self.class.new(text[resume_at + 1..-1])
end

#nth_tailer(tag, n) ⇒ `Object`



269
270
271

# File 'lib/carat/tagiter.rb', line 269

# Returns what nth(tag, n) returns while discarding the block it yields.
#
# @param tag the tag to look for
# @param n which occurrence to skip past
# @return the value returned by nth
def nth_tailer(tag, n)
  nth(tag, n) {}
end

#tagexist?(tag, st = 0) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/carat/tagiter.rb', line 256

# Reports whether find_element finds +tag+ when searching from +st+.
#
# @param tag the tag to look for
# @param st start argument handed to find_element (defaults to 0)
# @return [Boolean] true when find_element returns a truthy value, else false
def tagexist?(tag, st = 0)
  find_element(tag, st) ? true : false
end

#tagnext ⇒ `Object`

# File 'lib/carat/tagiter.rb', line 261

# Returns the name of the first tag in @text.
#
# The name is the first run of characters between '<' and the following '>'
# that contains no '<', '>' or whitespace, so a closing tag is reported with
# its leading slash (e.g. "/body").
#
# @return [String, nil] the tag name, or nil when @text has no '<', when the
#   '<' is never closed by a '>', or when nothing lies between them
def tagnext
  open_at = @text.index("<")
  return nil unless open_at

  close_at = @text.index(">", open_at)
  # The original re-tested the '<' position here, so an unterminated tag was
  # never rejected; the guard belongs on the '>' position.
  return nil unless close_at

  @text[open_at..close_at].scan(/[^<>\s]+/)[0]
end

Class: TextTagIterator

Overview

Tagiter

Usage

Author

Legal

History

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#attributes ⇒ Object (readonly)

#option ⇒ Object (readonly)

#tag ⇒ Object (readonly)

#text ⇒ Object (readonly)

Class Method Details

.[](aname) ⇒ Object

Instance Method Details

#collect(*arg) ⇒ Object

#each_block(tag, closetag = nil) ⇒ Object

#enumcollect(tag) ⇒ Object

#enumtag(tag) ⇒ Object

#first(tag, *arg) ⇒ Object Also known as: next

#for_this {|_self| ... } ⇒ Object

#get_first(*arg) ⇒ Object

#get_nth(*arg) ⇒ Object

#nth(tag, n, closetag = nil) {|self.class.new(text[s..e],tag,parse_attribute(d))| ... } ⇒ Object

#nth_tailer(tag, n) ⇒ Object

#tagexist?(tag, st = 0) ⇒ Boolean

#tagnext ⇒ Object