Class: REXML::Parsers::BaseParserWithDoctypeFix

Inherits:

Object

Object
REXML::Parsers::BaseParserWithDoctypeFix

show all

Defined in: lib/rexml/parsers/baseparser_with_doctype_fix.rb

Overview

Using the Pull Parser

This API is experimental, and subject to change.

# Example: walk every event and print the 'att' attribute of each <b> start tag.
parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
while parser.has_next?
  res = parser.next
  puts res[1]['att'] if res.start_tag? and res[0] == 'b'
end

See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.

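Notice that a parse error is delivered as an event, so the caller has to check each result with error? and handle it: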

# Example: a malformed document. When an event answers true to error?, its
# res[1] payload is raised by the caller.
parser = PullParser.new( "<a>BAD DOCUMENT" )
while parser.has_next?
  res = parser.next
  raise res[1] if res.error?
end

Nat Price gave me some good ideas for the API.

Constant Summary collapse

# Building blocks for names and references. Apart from REFERENCE_RE these are
# plain strings that get interpolated into the patterns defined below.

# One name segment: a word character or colon, followed by any number of
# hyphens, word characters, digits or dots.
NCNAME_STR =

'[\w:][\-\w\d.]*'

# A name with an optional "prefix:" part; both parts are shaped like NCNAME_STR.
NAME_STR =

"(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

# Character class for a single name character (hyphen, word character, digit,
# dot or colon).
NAMECHAR =

'[\-\w\d\.:]'

# A whole name wrapped in a capturing group: a word character or colon, then
# any number of NAMECHARs.
NAME =

"([\\w:]#{NAMECHAR}*)"

# One or more name characters, with no restriction on the first character.
NMTOKEN =

"(?:#{NAMECHAR})+"

# One or more whitespace-separated NMTOKENs.
NMTOKENS =

"#{NMTOKEN}(\\s+#{NMTOKEN})*"

# A reference: named (&name;), decimal (&#123;) or hexadecimal (&#x1F;).
# Only the named form fills NAME's capture group.
REFERENCE =

"(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"

# REFERENCE compiled to a Regexp.
REFERENCE_RE =

/#{REFERENCE}/

# Patterns for document-level markup. Each *_START pattern is a cheap prefix
# test; the matching *_PATTERN captures the construct's content.

# Start of a DOCTYPE declaration, allowing leading whitespace.
DOCTYPE_START =

/\A\s*<!DOCTYPE\s/um

# Group 1: the text after "<!DOCTYPE" (matched lazily).
# Group 2: the terminator, "[" when an internal subset follows, otherwise ">".
DOCTYPE_PATTERN =

/\s*<!DOCTYPE\s+(.*?)(\[|>)/um

# One attribute. Group 1: name, group 2: the quote character, group 3: the
# value up to the matching quote.
ATTRIBUTE_PATTERN =

/\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um

COMMENT_START =

/\A<!--/u

# Group 1: the comment text between "<!--" and the first "-->".
COMMENT_PATTERN =

/<!--(.*?)-->/um

CDATA_START =

/\A<!\[CDATA\[/u

# Despite the name this matches "]", optional whitespace, then ">".
CDATA_END =

/^\s*\]\s*>/um

# Group 1: the text between "<![CDATA[" and the first "]]>".
CDATA_PATTERN =

/<!\[CDATA\[(.*?)\]\]>/um

XMLDECL_START =

/\A<\?xml\s/u

# Group 1: everything between "<?xml" and "?>".
XMLDECL_PATTERN =

/<\?xml\s+(.*?)\?>/um

INSTRUCTION_START =

/\A<\?/u

# Group 1: text after "<?" (matched lazily); group 2: optional remainder
# starting with whitespace, up to "?>".
INSTRUCTION_PATTERN =

/<\?(.*?)(\s+.*?)?\?>/um

# A start tag. Group 1: tag name, group 2: the raw run of attributes,
# group 3: quote character of the last attribute matched, group 4: "/" when
# the tag is self-closing.
TAG_MATCH =

/^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um

# An end tag. Group 1: tag name.
CLOSE_MATCH =

/^\s*<\/(#{NAME_STR})\s*>/um

# Pseudo-attributes of the XML declaration; group 1 is the value.
# NOTE(review): the opening and closing quotes are not required to be the
# same character.
VERSION =

/\bversion\s*=\s*["'](.*?)['"]/um

# Unlike VERSION, no whitespace is accepted around "=" here.
ENCODING =

/\bencoding=["'](.*?)['"]/um

# Unlike VERSION, no whitespace is accepted around "=" here.
STANDALONE =

/\bstandalone=["'](.*?)['"]/um

# Patterns for declarations found inside a DOCTYPE internal subset, plus the
# character-data pattern used in element content.

# Start of an <!ENTITY declaration, allowing leading whitespace.
ENTITY_START =

/^\s*<!ENTITY/

# Splits the text captured from a DOCTYPE declaration. Group 1: first token,
# group 2: optional second token (with leading whitespace), group 4: first
# quoted literal, group 6: second quoted literal.
IDENTITY =

/^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u

ELEMENTDECL_START =

/^\s*<!ELEMENT/um

# Group 1: the declaration from "<!ELEMENT" up to, but not including, the
# first ">".
ELEMENTDECL_PATTERN =

/^\s*(<!ELEMENT.*?)>/um

# A "%...;" reference standing alone on a line. Group 1: the reference.
SYSTEMENTITY =

/^\s*(%.*?;)\s*$/um

# The following strings assemble the pattern for one attribute definition
# inside an <!ATTLIST declaration.

# Parenthesised, "|"-separated list of NMTOKENs.
ENUMERATION =

"\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"

# "NOTATION" followed by a parenthesised, "|"-separated list of NAMEs.
NOTATIONTYPE =

"NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"

ENUMERATEDTYPE =

"(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"

# Attribute type: one of the listed keywords or an enumerated type.
ATTTYPE =

"(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"

# A quoted attribute value. The content may not contain "<", a bare "&" or
# the enclosing quote, but may contain references.
ATTVALUE =

"(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"

# Default declaration: #REQUIRED, #IMPLIED, or an optionally #FIXED value.
DEFAULTDECL =

"(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"

# One attribute definition: name, type and default declaration, each
# preceded by whitespace.
ATTDEF =

"\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"

ATTDEF_RE =

/#{ATTDEF}/

ATTLISTDECL_START =

/^\s*<!ATTLIST/um

# A complete <!ATTLIST declaration. Group 1 (from NAME) is the element name.
ATTLISTDECL_PATTERN =

/^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um

NOTATIONDECL_START =

/^\s*<!NOTATION/um

# <!NOTATION name PUBLIC "literal" ["literal"]>. Group 1: name, group 2: the
# keyword, group 4: first literal, group 6: optional second literal.
PUBLIC =

/^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um

# <!NOTATION name SYSTEM "literal">. Group 1: name, group 2: the keyword,
# group 4: the literal.
SYSTEM =

/^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um

# Group 1: everything up to the next "<" (possibly empty).
TEXT_PATTERN =

/\A([^<]*)/um

# Strings and patterns used to recognise <!ENTITY declarations.
#
# NOTE: on the next line the words after "=" ("Entity constants") are the
# rendered description of this group of constants, not part of the value.
# The value of PUBIDCHAR is the string literal that follows. It is the body
# of a character class (see its use inside [...] in PUBIDLITERAL).
PUBIDCHAR = Entity constants

"\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"

# A quoted system literal, captured including its quotes.
SYSTEMLITERAL =

%Q{((?:"[^"]*")|(?:'[^']*'))}

# A quoted public-identifier literal, captured including its quotes. The
# double-quoted form additionally allows "'" inside.
PUBIDLITERAL =

%Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}

# External identifier: SYSTEM plus one literal, or PUBLIC plus two.
EXTERNALID =

"(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"

NDATADECL =

"\\s+NDATA\\s+#{NAME}"

# Parameter-entity reference: "%name;".
PEREFERENCE =

"%#{NAME};"

# A quoted entity value, captured including its quotes. The content may not
# contain a bare "%" or "&" or the enclosing quote, but may contain
# references.
ENTITYVALUE =

%Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}

# Definition part of a parameter entity: a value or an external identifier.
PEDEF =

"(?:#{ENTITYVALUE}|#{EXTERNALID})"

# Definition part of a general entity: a value, or an external identifier
# with an optional NDATA declaration.
ENTITYDEF =

"(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"

# Parameter-entity declaration; the "%" marker is captured.
PEDECL =

"<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"

# General-entity declaration.
GEDECL =

"<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"

# Either kind of entity declaration.
# NOTE(review): because "|" binds loosest, the leading \s* belongs to the
# GEDECL alternative only.
ENTITYDECL =

/\s*(?:#{GEDECL})|(?:#{PEDECL})/um

# An "&" that is not the start of a named reference ("&name;").
EREFERENCE =

/&(?!#{NAME};)/

# The built-in entities, keyed by entity name. Each value has four slots:
#   [0] Regexp matching the escaped form
#   [1] the escaped form as a String
#   [2] the literal character
#   [3] Regexp matching the literal character
DEFAULT_ENTITIES =

{
  'gt' => [/&gt;/, '&gt;', '>', />/],
  'lt' => [/&lt;/, '&lt;', '<', /</],
  'quot' => [/&quot;/, '&quot;', '"', /"/],
  "apos" => [/&apos;/, "&apos;", "'", /'/]
}

# NOTE: on the next line the sentence after "=" is the rendered description
# of this constant, not part of its value; the value is the Regexp below.
# It matches a start tag whose first attribute has "=" followed by something
# other than a quote character.
MISSING_ATTRIBUTE_QUOTES = These are patterns to identify common markup errors, to make the error messages more informative.

/^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um

Instance Attribute Summary collapse

#source ⇒ Object readonly

Returns the value of attribute source.

Instance Method Summary collapse

#add_listener(listener) ⇒ Object
#empty? ⇒ Boolean

Returns true if there are no more events.
#entity(reference, entities) ⇒ Object
#has_next? ⇒ Boolean

Returns true if there are more events.
#initialize(source) ⇒ BaseParserWithDoctypeFix constructor

A new instance of BaseParserWithDoctypeFix.
#normalize(input, entities = nil, entity_filter = nil) ⇒ Object

Escapes all possible entities.
#peek(depth = 0) ⇒ Object

Peek at the depth event in the stack.
#position ⇒ Object
#pull ⇒ Object

Returns the next event.
#stream=(source) ⇒ Object
#unnormalize(string, entities = nil, filter = nil) ⇒ Object

Unescapes all possible entities.
#unshift(token) ⇒ Object

Push an event back on the head of the stream.

Constructor Details

#initialize(source) ⇒ `BaseParserWithDoctypeFix`

Returns a new instance of BaseParserWithDoctypeFix.



112
113
114

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 112

# Creates a parser for +source+.
#
# All set-up is delegated to #stream=, which receives +source+ unchanged.
def initialize(source)
  self.stream = source
end

Instance Attribute Details

#source ⇒ `Object` (readonly)

Returns the value of attribute source.



133
134
135

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 133

# Reader for the @source instance variable.
def source
  return @source
end

Instance Method Details

#add_listener(listener) ⇒ `Object`

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 116

# Registers +listener+ so that it is handed every event returned by #pull.
#
# On the first registration #pull is wrapped: the current implementation is
# aliased to _old_pull and a replacement is defined that fetches the event,
# passes it to each listener's +receive+ method in registration order, and
# then returns the event unchanged. Later registrations only append to the
# list.
#
# listener:: an object responding to +receive(event)+.
#
# Returns the array of registered listeners.
def add_listener( listener )
  if !defined?(@listeners) or !@listeners
    @listeners = []
    # Evaluated against this object, so the wrapper is installed on this
    # parser rather than by editing the class body.
    instance_eval <<-EOL
      alias :_old_pull :pull
      def pull
        event = _old_pull
        @listeners.each do |listener|
          listener.receive event
        end
        event
      end
    EOL
  end
  @listeners << listener
end

#empty? ⇒ `Boolean`

Returns true if there are no more events

Returns:

(Boolean)

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 154

# Returns true if there are no more events, i.e. both @source and the
# @stack of pending events report empty.
def empty?
  @source.empty? && @stack.empty?
end

#entity(reference, entities) ⇒ `Object`

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 393

# Looks up the replacement text for the entity named +reference+.
#
# +entities+ (if given) is consulted first; when it yields nothing, the
# literal character from DEFAULT_ENTITIES is used. Whatever is found is run
# through #unnormalize with the same +entities+ before being returned.
#
# Returns nil when the name is unknown in both places.
def entity( reference, entities )
  resolved = entities ? entities[ reference ] : nil
  unless resolved
    builtin = DEFAULT_ENTITIES[ reference ]
    resolved = builtin[2] if builtin
  end
  resolved ? unnormalize( resolved, entities ) : nil
end

#has_next? ⇒ `Boolean`

Returns true if there are more events. Synonymous with !empty?

Returns:

(Boolean)



161
162
163

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 161

# Returns true if there are more events, i.e. @source or the @stack of
# pending events still holds something.
def has_next?
  !@source.empty? || !@stack.empty?
end

#normalize(input, entities = nil, entity_filter = nil) ⇒ `Object`

Escapes all possible entities

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 404

# Escapes all possible entities in +input+ and returns the result as a new
# string; +input+ itself is not modified.
#
# input::         the text to escape.
# entities::      optional mapping of entity name => replacement text. Every
#                 occurrence of a replacement text is turned into "&name;".
# entity_filter:: optional collection of entity names (keys of +entities+)
#                 that must be left alone.
#
# Bare ampersands (those not starting a named reference) become "&amp;",
# and finally the literal characters listed in DEFAULT_ENTITIES are replaced
# by their escaped forms.
def normalize( input, entities=nil, entity_filter=nil )
  # dup rather than clone: clone keeps the frozen flag, which would make the
  # in-place substitutions below raise for a frozen input.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  entities.each do |key, value|
    # The filter holds entity names, so it is checked against the key.
    next if entity_filter and entity_filter.include?(key)
    copy.gsub!( value, "&#{key};" )
  end if entities
  # Second pass, after the custom entities have been substituted.
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each_value do |forms|
    # forms[3] matches the literal character, forms[1] is its escaped form.
    copy.gsub!( forms[3], forms[1] )
  end
  copy
end

#peek(depth = 0) ⇒ `Object`

Peek at the event at position depth in the stack. The first element on the stack is at depth 0. If depth is -1, the parser reads to the end of the input stream and returns the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the requested event, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 177

# Returns the event at position +depth+ of the pending-event stack without
# removing it, pulling further events as needed to reach that position.
#
# depth:: 0 for the next event, 1 for the one after it, and so on. -1 pulls
#         until #empty? is true and returns the last event obtained.
#
# Every event pulled here is put on @stack, so later #pull calls still
# return it.
#
# Raises RuntimeError if +depth+ is less than -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  pulled = []
  if depth == -1
    pulled << pull() until empty?
  else
    wanted = depth + 1
    pulled << pull() while @stack.size + pulled.size < wanted
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end

#position ⇒ `Object`

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 144

# Returns the current position as reported by @source, or 0 when @source
# does not respond to +position+.
def position
  # FIXME
  return 0 unless @source.respond_to? :position
  @source.position
end

#pull ⇒ `Object`

Returns the next event. This is a PullEvent object.

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 192

# Returns the next event as an Array whose first element is the event type
# (:start_element, :end_element, :text, :comment, :cdata, :xmldecl,
# :processing_instruction, :start_doctype, :end_doctype, :elementdecl,
# :entitydecl, :attlistdecl, :notationdecl, :externalentity, :end_document)
# followed by the event's arguments.
#
# The method works in three stages selected by @document_status:
#   nil            - prolog: XML declaration, comments, instructions, DOCTYPE
#   :in_doctype    - declarations inside a DOCTYPE internal subset
#   :after_doctype - ordinary content: tags, text, comments, CDATA
#
# Raises REXML::ParseException for malformed input; any other exception
# raised while parsing content is wrapped in a REXML::ParseException.
#
# NOTE(review): @source.match(pattern, true) is presumably the consuming form
# of @source.match(pattern); the Source class is not visible here - confirm.
def pull
  # A self-closing tag was returned last time; emit its matching end event.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  # Events pushed back (see unshift/peek) are served before new input.
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
  # ---- Stage 1: prolog ----
  if @document_status == nil
    #@source.consume( /^\s*/um )
    # Look at what comes next: a run of whitespace or one "<...>" chunk.
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    #STDERR.puts "WORD = #{word.inspect}"
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      #STDERR.puts "XMLDECL"
      results = @source.match( XMLDECL_PATTERN, true )[1]
      # Each pseudo-attribute is optional; a missing one is reported as nil.
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      @source.encoding = encoding
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      identity = md[1]
      close = md[2]
      # Split the declaration text; the results are read from $1, $2, $4, $6.
      identity =~ IDENTITY
      name = $1
      raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
      pub_sys = $2.nil? ? nil : $2.strip
      long_name = $4.nil? ? nil : $4.strip
      uri = $6.nil? ? nil : $6.strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # No internal subset: queue :end_doctype so it is the next event.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        md = @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        # The declaration ended with "[": an internal subset follows.
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace: nothing to do here; execution continues with the
      # content stage below.
    else
      # Anything else means the prolog is over.
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      md = @source.match(/\s*/um, true)
    end
  end
  # ---- Stage 2: DOCTYPE internal subset ----
  if @document_status == :in_doctype
    # Look at the next chunk up to and including ">" to classify it.
    # NOTE(review): md is used without a nil check; if nothing matches, the
    # resulting error is raised outside the begin/rescue below.
    md = @source.match(/\s*(.*?>)/um)
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      # Drop the groups that did not participate, then reuse slot 0 for the
      # event type.
      match = @source.match( ENTITYDECL, true ).to_a.compact
      match[0] = :entitydecl
      ref = false
      # A "%" in slot 1 marks a parameter entity; remember it and remove it
      # so the remaining slots line up with the general-entity layout.
      if match[1] == '%'
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        # match is [ :entity, name, PUBLIC, pubid, href ]
      else
        # Internal entity: strip the surrounding quotes from the value.
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      # Parameter entities are flagged with a trailing '%'.
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      # Collect attribute name => default value for every attribute definition
      # whose default declaration is not #IMPLIED.
      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          # For a #FIXED default the value is in the following slot.
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      md = nil
      # Try the PUBLIC form first, then the SYSTEM form.
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when CDATA_END
      # "]" followed by ">" closes the internal subset.
      @document_status = :after_doctype
      @source.match( CDATA_END, true )
      return [ :end_doctype ]
    end
  end
  # ---- Stage 3: document content ----
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        # End tag: it must match the most recently opened element.
        last_tag = @tags.pop
        #md = @source.match_to_consume( '>', CLOSE_MATCH)
        md = @source.match( CLOSE_MATCH, true )
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")",
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        # "<!" starts either a comment or a CDATA section here.
        md = @source.match(/\A(\s*[^>]*>)/um)
        #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
        raise REXML::ParseException.new("Malformed node", @source) unless md
        # The third character tells the two apart: "-" means a comment.
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        # Reached when neither pattern matched.
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          # Check for missing attribute quotes
          raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        attrs = []
        if md[2].size > 0
          # md[2] is the raw attribute text; split it into [name, quote, value]
          # triples. Anything left over after the last match ($') is an error.
          attrs = md[2].scan( ATTRIBUTE_PATTERN )
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
        end

        if md[4]
          # Self-closing tag: remember it so the next call emits :end_element.
          @closed = md[1]
        else
          @tags.push( md[1] )
        end
        attributes = {}
        attrs.each { |a,b,c| attributes[a] = c }
        return [ :start_element, md[1], attributes ]
      end
    else
      # Character data up to the next "<".
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        # NOTE(review): these two lines write diagnostics to standard output.
        puts "EMPTY = #{empty?}"
        puts "BUFFER = \"#{@source.buffer}\""
        @source.match( /(\s+)/, true )
      end
      #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
      #return [ :text, "" ] if md[0].length == 0
      # unnormalized = Text::unnormalize( md[1], self )
      # return PullEvent.new( :text, md[1], unnormalized )
      return [ :text, md[1] ]
    end
  rescue REXML::ParseException
    # Parse errors raised above pass through untouched.
    raise
  rescue Exception, NameError => error
    # Anything else is wrapped so callers only see ParseException.
    # NOTE(review): NameError is already covered by Exception.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, (error ? error : $!) )
  end
  # NOTE(review): appears unreachable - every branch of the begin block above
  # either returns or raises.
  return [ :dummy ]
end

#stream=(source) ⇒ `Object`

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 135

# Points the parser at a new input and resets all parsing state.
#
# source:: the input, converted with SourceFactory.create_from and stored in
#          @source.
#
# Besides @source, this clears the pending self-closing tag (@closed) and
# the document status, and replaces @tags, @stack and @entities with fresh
# empty arrays.
def stream=( source )
  @source = SourceFactory.create_from( source )
  @closed = @document_status = nil
  @tags, @stack = [], []
  @entities = []
end

#unnormalize(string, entities = nil, filter = nil) ⇒ `Object`

Unescapes all possible entities

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 420

# Unescapes all possible entities in +string+ and returns the result as a
# new string; +string+ itself is not modified.
#
# string::   the text to unescape.
# entities:: optional mapping of entity name => replacement text, consulted
#            (through #entity) before the built-in DEFAULT_ENTITIES.
# filter::   optional collection of entity names whose references must be
#            left untouched.
#
# Processing order: line endings ("\r\n" and "\r") become "\n"; numeric
# character references are replaced by the character they denote; named
# references are expanded; finally "&amp;" becomes "&".
def unnormalize( string, entities=nil, filter=nil )
  # dup rather than clone: clone keeps the frozen flag, which would make the
  # in-place substitutions below raise for a frozen input.
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    code = $1
    # Integer() needs the "0x" prefix to read a hexadecimal number.
    code = "0#{code}" if code[0] == ?x
    [Integer(code)].pack('U*')
  }
  # Keep only the names of named references; numeric ones captured nil.
  matches.collect!{|x|x[0]}.compact!
  if matches.size > 0
    matches.each do |entity_reference|
      unless filter and filter.include?(entity_reference)
        entity_value = entity( entity_reference, entities )
        if entity_value
          # A String pattern matches the reference literally (names may
          # contain "." which a Regexp would treat as a wildcard), and the
          # block form inserts the value verbatim, so backslash sequences in
          # it are not interpreted as back-references.
          rv.gsub!( "&#{entity_reference};" ) { entity_value }
        end
      end
    end
    matches.each do |entity_reference|
      unless filter and filter.include?(entity_reference)
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!( er[0], er[2] ) if er
      end
    end
    # Done last so that an expanded "&" cannot start a new reference.
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end

#unshift(token) ⇒ `Object`

Push an event back on the head of the stream. This method has (theoretically) infinite depth.



167
168
169

# File 'lib/rexml/parsers/baseparser_with_doctype_fix.rb', line 167

# Pushes +token+ back onto the head of the event stream, so it is the next
# event served from @stack.
#
# Returns @stack.
def unshift(token)
  @stack.insert(0, token)
end

Class: REXML::Parsers::BaseParserWithDoctypeFix

Overview

Using the Pull Parser

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ BaseParserWithDoctypeFix

Instance Attribute Details

#source ⇒ Object (readonly)

Instance Method Details

#add_listener(listener) ⇒ Object

#empty? ⇒ Boolean

#entity(reference, entities) ⇒ Object

#has_next? ⇒ Boolean

#normalize(input, entities = nil, entity_filter = nil) ⇒ Object

#peek(depth = 0) ⇒ Object

#position ⇒ Object

#pull ⇒ Object

#stream=(source) ⇒ Object

#unnormalize(string, entities = nil, filter = nil) ⇒ Object

#unshift(token) ⇒ Object