Class: XmlStreamParser

Inherits:
Object show all
Includes:
Sentinels
Defined in:
lib/xml_stream_parser.rb

Defined Under Namespace

Modules: Sentinels Classes: EndContext, Nothing, Sentinel

Constant Summary collapse

VERSION =
"0.2.0"

Constants included from Sentinels

Sentinels::END_CONTEXT, Sentinels::NOTHING

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#dslObject (readonly)

Returns the value of attribute dsl.



43
44
45
# File 'lib/xml_stream_parser.rb', line 43

# Reader for the dsl flag.
#
# Returns the current value of @dsl (nil when it has never been assigned).
def dsl
  instance_variable_get(:@dsl)
end

#pull_parserObject (readonly)

the REXML::Parsers::PullParser used internally



42
43
44
# File 'lib/xml_stream_parser.rb', line 42

# Reader for the pull parser used internally.
#
# Returns the current value of @pull_parser (nil when it has never been
# assigned).
def pull_parser
  instance_variable_get(:@pull_parser)
end

Instance Method Details

#discardObject

parse and throw away content until we escape the current context, either through end_element, or end_document



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/xml_stream_parser.rb', line 106

# Parse and throw away events until the current context is escaped.
#
# Start tags seen along the way are tracked so that their matching end tags
# are swallowed too. The method stops, WITHOUT consuming the event, on the
# first end_element or end_document met while no such tag is open.
#
# Returns nil.
# Raises RuntimeError when an end tag does not match the innermost open tag,
# or when the document ends while a tag is still open.
def discard
  open_tags = []

  while true
    event = @pull_parser.peek
    tag = event[0]

    if event.start_element?
      open_tags << tag
    elsif event.end_element?
      # nothing open: this end tag closes the enclosing context, leave it alone
      return nil if open_tags.empty?

      unless tag == open_tags.last
        raise "mismatched end_element. expected </#{open_tags.last}>, got: #{event.inspect}"
      end
      open_tags.pop
    elsif event.end_document?
      return nil if open_tags.empty?

      raise "mismatched end_element. expected </#{open_tags.last}>, got: #{event.inspect}"
    end

    # the peeked event has been dealt with; move past it
    @pull_parser.pull
  end
end

#element(element_names, optional = false, &block) ⇒ Object

consume an element

  • if optional is false the element must be present

  • if optional is true and the element is not present then NOTHING/END_CONTEXT will be returned

  • consumes start_element, calls block on content, consumes end_element



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/xml_stream_parser.rb', line 131

# Consume a single element: its start tag, its content (handed to the block)
# and its end tag.
#
# * if optional is false the element must be present
# * if optional is true and the element is not present, the Sentinel produced
#   by find_element is returned
#
# The block receives the element name and the start event's second field
# (attrs). In dsl mode it is instance_exec'd on the parser, otherwise it is
# called directly. It must consume all of the element's content, leaving the
# parser on the end_element or on whitespace-only text just before it.
#
# Returns the block's value, or nil when that value is a Sentinel.
# Raises RuntimeError when a required element is missing, or when the end tag
# for the element is not where the block should have left the parser.
def element( element_names, optional=false, &block )
  element_names = [ *element_names ]

  found = find_element(element_names)
  e = @pull_parser.peek

  if found.is_a? Sentinel
    return found if optional
    raise "expected start element: <#{element_names.join('|')}>, got: #{e.inspect}"
  end

  e = @pull_parser.pull # consume the start tag
  name = e[0]
  attrs = e[1]

  # block should consume all element content, and leave parser on end_element, or
  # whitespace before it
  err = false
  begin
    v = if self.dsl
          self.instance_exec(name, attrs, &block)
        else
          block.call(name, attrs)
        end
    return v if ! v.is_a? Sentinel # do not propagate Sentinels. they confuse callers
    nil
  rescue
    err = true # note that we are erroring, so as not to mask the exception from ensure block
    raise
  ensure
    if !err # if return was called in the block, ensure we consume the end_element
      e = @pull_parser.pull
      # Only text made up ENTIRELY of whitespace may be skipped here; text that
      # merely contains a space is real content the block failed to consume.
      e = @pull_parser.pull if e.text? && e[0] =~ /\A[[:space:]]*\z/
      raise "expected end tag: #{name}, got: #{e.inspect}" if ! e.end_element? || e[0] != name
    end
  end
end

#elements(element_names, &block) ⇒ Object

find and consume elements, calling the block on each one found. Stops as soon as a find yields the NOTHING or END_CONTEXT sentinel, and returns nil



173
174
175
176
177
178
179
# File 'lib/xml_stream_parser.rb', line 173

# Repeatedly consume elements named in element_names, handing each one to the
# block. Every pass is an optional element() call; iteration stops at the
# first pass whose result is a Sentinel.
#
# Always returns nil.
def elements(element_names, &block)
  until element(element_names, true, &block).is_a?(Sentinel)
    # nothing to do: the element() call in the condition does all the work
  end

  nil
end

#find_element(element_names) ⇒ Object

find an element with name in element_names : inter-element whitespace is ignored

  • encountering end_element terminates and returns END_CONTEXT, leaving parser on end_element

  • encountering end_document terminates and returns END_CONTEXT

  • encountering start_element for an element not in element_names returns NOTHING, leaving parser on start_element

  • encountering start_element for an element in element_names returns element name, parser on start_element



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/xml_stream_parser.rb', line 81

# Find an element whose name is in element_names, ignoring inter-element
# whitespace.
#
# * start_element for a name in element_names: returns the name, parser left
#   on the start_element
# * start_element for any other name: returns NOTHING, parser left on the
#   start_element
# * end_element or end_document: returns END_CONTEXT, event not consumed
# * text consisting only of whitespace is consumed and skipped
# * any other kind of event (neither text nor one of the above) is consumed
#   and skipped, so the loop always makes progress
#
# Raises RuntimeError on text that is not entirely whitespace.
def find_element( element_names )
  element_names = [ *element_names ]

  while true
    e = @pull_parser.peek

    if e.start_element?
      return element_names.include?( e[0] ) ? e[0] : NOTHING
    end
    return END_CONTEXT if e.end_element? || e.end_document?

    # Only text that is whitespace from start to finish may be ignored; text
    # that merely contains a space is real content.
    if e.text? && e[0] !~ /\A[[:space:]]*\z/
      raise "unexpected text content: #{e.inspect}"
    end

    # whitespace, or an event type not handled above: step over it
    @pull_parser.pull
  end
end

#parse(data, &block) ⇒ Object

parse retaining block context… permitting the parse to easily be split over multiple methods



47
48
49
# File 'lib/xml_stream_parser.rb', line 47

# Parse data with dsl mode switched off, so the block is called in its own
# context rather than instance_exec'd on the parser.
#
# Delegates to parse_dsl and returns whatever it returns.
def parse(data, &block)
  dsl_mode = false
  parse_dsl(data, dsl_mode, &block)
end

#parse_dsl(data, dsl = true, &block) ⇒ Object

parse with optional dsl mode. If dsl is true [ default ] then the block will be instance_exec’d in the context of the parser; if dsl is false the block will be called retaining its current context



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/xml_stream_parser.rb', line 55

# Parse data, running the block against a freshly created pull parser.
#
# data  - a String (wrapped in a StringIO) or a stream-like object, which is
#         handed to REXML::Parsers::PullParser unchanged. REXML itself then
#         decides whether the object is an acceptable input.
# dsl   - when true [default] the block is instance_exec'd on this parser;
#         when false the block is called with the parser as its argument, so
#         it keeps its own context.
#
# Returns the block's value. @pull_parser is reset to nil afterwards, whether
# or not the block raised.
def parse_dsl(data, dsl=true, &block)
  # Only Strings need wrapping. Everything else is passed through instead of
  # being silently turned into nil.
  io = data.is_a?(String) ? StringIO.new(data) : data

  @pull_parser = REXML::Parsers::PullParser.new( io )
  @dsl = dsl
  if self.dsl
    self.instance_exec(&block)
  else
    block.call(self)
  end
ensure
  @pull_parser = nil
end

#text(&block) ⇒ Object

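consume a text element. Returns the text, or nil if none; when a block is given, the text (or nil) is passed to it and the block's result is returned instead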



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/xml_stream_parser.rb', line 183

# Consume a text event, if one is next.
#
# When the next event is text it is pulled and its first field becomes the
# result. When the next event is an end_element nothing is consumed and the
# result is nil. When a block is given, the result (text or nil) is passed
# through it first: instance_exec'd on the parser in dsl mode, called
# directly otherwise.
#
# Raises RuntimeError when the next event is neither text nor an end_element.
def text(&block)
  event = @pull_parser.peek
  unless event.text? || event.end_element?
    raise "expected text node, got #{event.inspect}"
  end

  content = nil
  if event.text?
    @pull_parser.pull
    content = event[0]
  end

  return content unless block

  if self.dsl
    self.instance_exec(content, &block)
  else
    block.call(content)
  end
end