Class: HTMLTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/openid/yadis/htmltokenizer.rb

Overview

A class to tokenize HTML.

Example:

# Sample document used by the assertions below.
page = "<HTML>
<HEAD>
<TITLE>This is the title</TITLE>
</HEAD>
 <!-- Here comes the <a href=\"missing.link\">blah</a>
 comment body
  -->
 <BODY>
   <H1>This is the header</H1>
   <P>
     This is the paragraph, it contains
     <a href=\"link.html\">links</a>,
     <img src=\"blah.gif\" optional alt='images
     are
     really cool'>.  Ok, here is some more text and
     <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
   </P>
 </body>
 </HTML>
 "
 # Wrap the sample page in a tokenizer; reading starts at position 0.
 toke = HTMLTokenizer.new(page)

 # getTag lowercases the names it is given and skips ahead to the next
 # tag whose name is one of them.
 assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
 assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
 # getTrimmedText returns the text up to the next tag with whitespace
 # stripped and collapsed.
 assert("links" == toke.getTrimmedText)
 # NOTE(review): attr_hash is defined on HTMLTag, outside this page; these
 # lines presumably read parsed attributes of the returned tag -- confirm.
 assert(toke.getTag("IMG", "A").attr_hash['optional'])
 assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])

Constant Summary collapse

@@version =
1.0

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(content) ⇒ HTMLTokenizer

Create a new tokenizer over the given content, which is converted to a string with to_s.



57
58
59
60
# File 'lib/openid/yadis/htmltokenizer.rb', line 57

# Build a tokenizer over +content+.
#
# content - any object; the result of calling to_s on it becomes the page
#           that is tokenized.
def initialize(content)
  # Keep the page as a string regardless of what was passed in.
  @page = content.to_s
  # Offset into @page at which the next token starts.
  @cur_pos = 0
end

Instance Attribute Details

#pageObject (readonly)

Returns the value of attribute page.



54
55
56
# File 'lib/openid/yadis/htmltokenizer.rb', line 54

# Read-only accessor: returns the @page instance variable, i.e. the string
# this tokenizer reads from.
def page
  @page
end

Class Method Details

.versionObject

Get version of HTMLTokenizer lib



50
51
52
# File 'lib/openid/yadis/htmltokenizer.rb', line 50

# Returns the value of the @@version class variable.
def self.version
  @@version
end

Instance Method Details

#getNextTokenObject

Get the next token and advance the current position past it. Returns an instance of one of:

  • HTMLText

  • HTMLToken

  • HTMLTag



103
104
105
106
107
108
109
110
111
112
113
# File 'lib/openid/yadis/htmltokenizer.rb', line 103

# Consume and return the next token.
#
# Asks peekNextToken for the token at the current position and, when one
# is found, moves the position forward by the length of that token's raw
# text.  When peekNextToken finds nothing, the position is left untouched
# and that (nil) result is returned as-is.
def getNextToken
  upcoming = peekNextToken
  @cur_pos += upcoming.raw.length if upcoming
  upcoming
end

#getTag(*sought_tags) ⇒ Object

Get the next tag whose name is in the specified set of desired tags. For example, foo = toke.getTag("h1", "h2", "h3") will return the next header tag encountered.



119
120
121
122
123
124
125
126
127
128
129
# File 'lib/openid/yadis/htmltokenizer.rb', line 119

# Consume tokens until a matching tag is found, and return it.
#
# sought_tags - zero or more tag names; they are lowercased before being
#               compared with each tag's tag_name.  With no names given,
#               the first HTMLTag encountered is returned.
#
# Returns the matching HTMLTag, or whatever getNextToken yields (nil) once
# the tokens run out without a match.  Every token examined is consumed.
def getTag(*sought_tags)
  sought_tags.map!(&:downcase)

  loop do
    candidate = getNextToken
    # Out of tokens: hand back the falsy value, as the caller expects.
    return candidate unless candidate
    next unless candidate.kind_of?(HTMLTag)
    return candidate if sought_tags.empty? || sought_tags.include?(candidate.tag_name)
  end
end

#getText(until_tag = nil) ⇒ Object

Get the text between the current position and the next tag (when until_tag is not given), or all the text up to a specific later tag (when until_tag is given).



133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/openid/yadis/htmltokenizer.rb', line 133

# Collect text starting at the current position.
#
# until_tag - optional tag name.  It is lowercased before comparison, the
#             same way getTag treats the names it is given.
#
# Without until_tag: returns the text of the next token when that token is
# text (consuming it), or "" when the next token starts with "<" (nothing
# is consumed) or when there are no tokens left.
#
# With until_tag: consumes tokens up to, but not including, the next
# HTMLTag whose tag_name equals until_tag, and returns the non-empty token
# texts, each followed by a single space.  If no such tag exists, all
# remaining tokens are consumed.
def getText(until_tag = nil)
  if until_tag.nil?
    # Next token is a tag, not text: report no text and leave it in place.
    return "" if ?< == @page[@cur_pos]

    # getNextToken yields nil at the end of the page; treat that as "no
    # text" instead of calling .text on nil.
    token = getNextToken
    return token ? token.text : ""
  end

  wanted = until_tag.to_s.downcase
  ret_str = ""

  while (token = peekNextToken)
    break if token.kind_of?(HTMLTag) && token.tag_name == wanted

    text = token.text
    ret_str << text << " " unless "" == text
    # Only consume the token after deciding it is not the stop tag.
    getNextToken
  end

  ret_str
end

#getTrimmedText(until_tag = nil) ⇒ Object

Like getText, but squeeze all whitespace, getting rid of leading and trailing whitespace, and squeezing multiple spaces into a single space.



163
164
165
# File 'lib/openid/yadis/htmltokenizer.rb', line 163

# Same as getText, with whitespace normalised: leading and trailing
# whitespace is stripped and every internal run of whitespace becomes a
# single space.
#
# until_tag - passed straight through to getText.
def getTrimmedText(until_tag = nil)
  collected = getText(until_tag)
  stripped = collected.strip
  stripped.gsub(/\s+/m, " ")
end

#peekNextTokenObject

Look at the next token, but don’t actually grab it



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/openid/yadis/htmltokenizer.rb', line 68

# Return the token that starts at the current position without consuming it.
#
# Returns nil once the current position equals the page length.  Otherwise
# the result is:
#   * HTMLComment - when the page continues with "<!--"; spans through the
#                   next "-->".
#   * HTMLTag     - when the page continues with any other "<"; spans
#                   through the next ">".
#   * HTMLText    - otherwise; spans up to the next "<", or to the end of
#                   the page when there is none.
#
# Raises HTMLTokenizerError when a comment or tag is opened but its closing
# delimiter is not found.
def peekNextToken
  return nil if @cur_pos == @page.length

  unless ?< == @page[@cur_pos]
    # Plain text: runs until just before the next '<', or to the end.
    next_open = @page.index('<', @cur_pos)
    chunk = next_open.nil? ? @page[@cur_pos .. -1] : @page[@cur_pos .. (next_open - 1)]
    return HTMLText.new(chunk)
  end

  if '!--' == @page[(@cur_pos + 1), 3]
    closer = @page.index('-->', (@cur_pos + 1))
    if closer.nil?
      raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
    end
    # closer is the index of the first '-' in "-->"; +2 includes the '>'.
    return HTMLComment.new(@page[@cur_pos .. (closer + 2)])
  end

  closer = @page.index('>', (@cur_pos + 1))
  if closer.nil?
    raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
  end
  HTMLTag.new(@page[@cur_pos .. closer])
end

#resetObject

Reset the parser, setting the current position back to the start



63
64
65
# File 'lib/openid/yadis/htmltokenizer.rb', line 63

# Rewind the tokenizer: sets the current position back to 0, so the next
# token read comes from the beginning of the page.
def reset
  @cur_pos = 0
end