Class: Syntax::XML

Inherits:
Tokenizer show all
Defined in:
lib/syntax/lang/xml.rb

Overview

A simple implementation of an XML lexer. It handles most cases. It is not a validating lexer, meaning it will happily process invalid XML without complaining.

Instance Attribute Summary

Attributes inherited from Tokenizer

#chunk, #group

Instance Method Summary collapse

Methods inherited from Tokenizer

#finish, #option, #set, #start, #teardown, #tokenize

Instance Method Details

#setup ⇒ Object

Initialize the lexer.



11
12
13
# File 'lib/syntax/lang/xml.rb', line 11

# Initialize the lexer state. Lexing always begins outside of any tag, so
# the @in_tag flag (switched on by #step once an opening "<" has been
# consumed, and off again at the closing ">") starts out false.
def setup
  @in_tag = false
end

#step ⇒ Object

Step through a single iteration of the tokenization process. This will yield (potentially) many tokens, and possibly zero tokens.



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/syntax/lang/xml.rb', line 17

# Step through a single iteration of the tokenization process. One call may
# emit several tokens, or none at all.
#
# Leading whitespace is always emitted as :normal. What happens next depends
# on the @in_tag flag:
#
# * Inside a tag, exactly one item is consumed: a namespaced attribute
#   (emitted as :namespace, :punct ":" and :attribute), a number, a plain
#   attribute name, the closing "/>", "?>" or ">" (which clears @in_tag),
#   an "=" sign, or a quoted string (delegated to scan_string). Any other
#   character is appended to the current group.
# * Outside a tag, the text up to the next "<" or "&" is emitted as :normal,
#   followed by whichever construct starts there: a comment, the opening of
#   a tag (which sets @in_tag), or an entity reference.
# * When no "<" or "&" remains, the rest of the input is emitted as :normal.
def step
  start_group :normal, matched if scan( /\s+/ )
  if @in_tag
    case
      when scan( /([-\w]+):([-\w]+)/ )
        start_group :namespace, subgroup(1)
        start_group :punct, ":"
        start_group :attribute, subgroup(2)
      when scan( /\d+/ )
        # Must precede the attribute pattern below, which also accepts digits.
        start_group :number, matched
      when scan( /[-\w]+/ )
        start_group :attribute, matched
      when scan( %r{[/?]?>} )
        @in_tag = false
        start_group :punct, matched
      when scan( /=/ )
        start_group :punct, matched
      when scan( /["']/ )
        scan_string matched
      else
        # Unrecognized character: keep it in the current group. The
        # whitespace scan at the top may already have consumed the rest of
        # the input (e.g. "<a "), leaving nothing to read. getch presumably
        # returns nil in that case (StringScanner semantics -- confirm
        # against Tokenizer), and nil must not be handed to append.
        ch = getch
        append ch if ch
    end
  elsif ( text = scan_until( /(?=[<&])/ ) )
    # The lookahead stops just before the "<" or "&" without consuming it.
    start_group :normal, text unless text.empty?
    if scan(/<!--.*?(-->|\Z)/m)
      # An unterminated comment runs to the end of the input.
      start_group :comment, matched
    else
      case peek(1)
        when "<"
          start_group :punct, getch
          # "<?", "</" and "<!" are kept together as one punctuation token.
          case peek(1)
            when "?", "/", "!"
              append getch
          end
          start_group :normal, matched if scan( /\s+/ )
          if scan( /([-\w]+):([-\w]+)/ )
            start_group :namespace, subgroup(1)
            start_group :punct, ":"
            start_group :tag, subgroup(2)
          elsif scan( /[-\w]+/ )
            start_group :tag, matched
          end
          @in_tag = true
        when "&"
          if scan( /&\S{1,10};/ )
            start_group :entity, matched
          else
            # A bare ampersand that does not start an entity is plain text.
            start_group :normal, scan( /&/ )
          end
      end
    end
  else
    # No markup is left, so the remainder is plain character data. Emit it
    # as :normal, like every other run of text in this method, rather than
    # appending it to whatever group happens to be current (which would be
    # :punct right after a closing ">").
    rest = scan_until( /\Z/ )
    start_group :normal, rest if rest && !rest.empty?
  end
end