Class: HexaPDF::Content::Tokenizer

Inherits:
Tokenizer
  • Object
show all
Defined in:
lib/hexapdf/content/parser.rb

Overview

More efficient tokenizer for content streams. This tokenizer class works directly on a string and not on an IO.

Note: Indirect object references are not supported by this tokenizer!

See: PDF1.7 s7.2

Constant Summary

Constants inherited from Tokenizer

Tokenizer::DELIMITER, Tokenizer::NO_MORE_TOKENS, Tokenizer::TOKEN_ARRAY_END, Tokenizer::TOKEN_ARRAY_START, Tokenizer::TOKEN_DICT_END, Tokenizer::TOKEN_DICT_START, Tokenizer::WHITESPACE, Tokenizer::WHITESPACE_MULTI_RE, Tokenizer::WHITESPACE_OR_DELIMITER_RE

Instance Attribute Summary collapse

Attributes inherited from Tokenizer

#io

Instance Method Summary collapse

Methods inherited from Tokenizer

#next_byte, #next_object, #next_xref_entry, #peek_token, #skip_whitespace

Constructor Details

#initialize(string) ⇒ Tokenizer

Creates a new tokenizer.



53
54
55
56
# File 'lib/hexapdf/content/parser.rb', line 53

# Creates a new tokenizer for the given content stream +string+.
#
# The string itself is kept for direct byte access, and a StringScanner over
# the same string is used for tracking the scan position.
def initialize(string)
  @string = string
  @ss = StringScanner.new(string)
end

Instance Attribute Details

#string ⇒ Object (readonly)

The string that is tokenized.



50
51
52
# File 'lib/hexapdf/content/parser.rb', line 50

# Returns the string that is tokenized (the value stored in @string).
def string
  @string
end

Instance Method Details

#next_token ⇒ Object

See: HexaPDF::Tokenizer#next_token



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/hexapdf/content/parser.rb', line 74

# Returns the next token of the content stream and advances the scan position
# past it.
#
# Leading whitespace and any number of consecutive comments are skipped first.
# Depending on the first byte of the token, the result is whatever
# parse_number, parse_keyword, parse_name, parse_literal_string or
# parse_hex_string return, one of the TOKEN_* constants for array and
# dictionary delimiters, a Token for '{' or '}', or NO_MORE_TOKENS once the end
# of the string is reached.
#
# Raises HexaPDF::MalformedPDFError if a token starts with a '>' that is not
# immediately followed by a second '>'.
#
# See: HexaPDF::Tokenizer#next_token
def next_token
  @ss.skip(WHITESPACE_MULTI_RE)
  byte = @string.getbyte(@ss.pos) || -1 # -1 marks the end of the string

  # Comments are skipped in a loop instead of by calling next_token again, so
  # that a long run of comment lines cannot exhaust the call stack.
  while byte == 37 # %
    # A comment extends up to (but not including) the next CR or LF. If there
    # is none, the comment runs to the end of the string.
    return NO_MORE_TOKENS unless @ss.skip_until(/(?=[\r\n])/)
    @ss.skip(WHITESPACE_MULTI_RE)
    byte = @string.getbyte(@ss.pos) || -1
  end

  # The if/elsif chain on raw byte values is kept deliberately: the most
  # common token types are tested first.
  if (48 <= byte && byte <= 57) || byte == 45 || byte == 43 || byte == 46 # 0..9 - + .
    parse_number
  elsif (65 <= byte && byte <= 90) || (97 <= byte && byte <= 122) # A..Z a..z
    parse_keyword
  elsif byte == 47 # /
    parse_name
  elsif byte == 40 # (
    parse_literal_string
  elsif byte == 60 # <
    if @string.getbyte(@ss.pos + 1) != 60
      parse_hex_string
    else
      @ss.pos += 2
      TOKEN_DICT_START
    end
  elsif byte == 62 # >
    unless @string.getbyte(@ss.pos + 1) == 62
      raise HexaPDF::MalformedPDFError.new("Delimiter '>' found at invalid position", pos: pos)
    end
    @ss.pos += 2
    TOKEN_DICT_END
  elsif byte == 91 # [
    @ss.pos += 1
    TOKEN_ARRAY_START
  elsif byte == 93 # ]
    @ss.pos += 1
    TOKEN_ARRAY_END
  elsif byte == 123 || byte == 125 # { }
    Token.new(@ss.get_byte)
  elsif byte == -1
    NO_MORE_TOKENS
  else
    # Every other byte starts a keyword.
    parse_keyword
  end
end

#pos ⇒ Object

See: HexaPDF::Tokenizer#pos



59
60
61
# File 'lib/hexapdf/content/parser.rb', line 59

# Returns the current scan position as reported by the underlying
# StringScanner (@ss).
def pos
  @ss.pos
end

#pos=(pos) ⇒ Object

See: HexaPDF::Tokenizer#pos=



64
65
66
# File 'lib/hexapdf/content/parser.rb', line 64

# Sets the scan position of the underlying StringScanner (@ss) to +pos+.
def pos=(pos)
  @ss.pos = pos
end

#scan_until(re) ⇒ Object

See: HexaPDF::Tokenizer#scan_until



69
70
71
# File 'lib/hexapdf/content/parser.rb', line 69

# Delegates to StringScanner#scan_until on the underlying scanner (@ss) with
# the regular expression +re+ and returns the result of that call.
def scan_until(re)
  @ss.scan_until(re)
end