Class: PDF::Reader::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader/parser.rb

Overview

An internal PDF::Reader class that reads objects from the PDF file and converts them into usable Ruby objects (hashes, arrays, true, false, etc.).

Instance Method Summary collapse

Constructor Details

#initialize(buffer, xref) ⇒ Parser

Create a new parser around a PDF::Reader::Buffer object

buffer - a PDF::Reader::Buffer object that contains PDF data
xref - a PDF::Reader::XRef object that represents the document’s object offsets



36
37
38
39
# File 'lib/pdf/reader/parser.rb', line 36

# Create a new parser around a buffer of PDF data.
#
# buffer - kept in @buffer; every parsing method pulls its tokens and raw
#          data from this object
# xref   - kept in @xref; #stream asks it to resolve the stream's :Length
def initialize(buffer, xref)
  @buffer, @xref = buffer, xref
end

Instance Method Details

#array ⇒ Object

reads a PDF array from the buffer and converts it to a Ruby Array.



87
88
89
90
91
92
93
94
95
96
97
# File 'lib/pdf/reader/parser.rb', line 87

# Reads a PDF array from the buffer and converts it to a Ruby Array.
#
# Keeps calling parse_token and collecting the results until it sees the
# closing "]" delimiter. Only a Token equal to "]" ends the array; a plain
# String "]" (e.g. the contents of a PDF string) is stored as an element.
#
# Returns the collected elements in the order they were read.
def array
  elements = []

  until (element = parse_token).kind_of?(Token) && element == "]"
    elements << element
  end

  elements
end

#dictionary ⇒ Object

reads a PDF dict from the buffer and converts it to a Ruby Hash.



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/pdf/reader/parser.rb', line 70

# Reads a PDF dictionary from the buffer and converts it to a Ruby Hash.
#
# Alternately reads a key and a value with parse_token until the key
# position holds the closing ">>" Token.
#
# Returns a Hash mapping Symbol keys to the parsed values.
# Raises MalformedPDFError if a key is not a Symbol (i.e. not a PDF name).
def dictionary
  entries = {}

  until (name = parse_token).kind_of?(Token) && name == ">>"
    unless name.kind_of?(Symbol)
      raise MalformedPDFError, "Dictionary key (#{name.inspect}) is not a name"
    end

    entry = parse_token
    # A ">>" in value position means a key has no value; the check is
    # delegated to Error.str_assert_not (presumably raises on a match).
    Error.str_assert_not(entry, ">>") if entry.kind_of?(Token)
    entries[name] = entry
  end

  entries
end

#hex_string ⇒ Object

Reads a PDF hex string from the buffer and converts it to a Ruby String



100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/pdf/reader/parser.rb', line 100

# Reads a PDF hex string from the buffer and converts it to a Ruby String.
#
# Tokens are pulled from @buffer and concatenated until the closing ">"
# is seen; the collected hex digits are then decoded two at a time into
# single characters.
#
# Returns the decoded String.
# Raises MalformedPDFError if the buffer runs out of tokens before the
# closing ">" is found.
def hex_string
  str = ""

  loop do
    token = @buffer.token
    break if token == ">"
    # @buffer.token yields nil once the input is exhausted; appending nil
    # would blow up with a TypeError, so report the real problem instead.
    raise MalformedPDFError, "unterminated hex string" if token.nil?
    str << token
  end

  # add a missing digit if required, as required by the spec
  str << "0" unless str.size % 2 == 0
  str.scan(/../).map { |pair| pair.hex.chr }.join
end

#object(id, gen) ⇒ Object

Reads an entire PDF object from the buffer and returns it as the corresponding Ruby object. If the object is a stream, returns both the stream data and the dictionary that describes it.

id - the object ID to return
gen - the object revision number to return



186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/pdf/reader/parser.rb', line 186

# Reads an entire PDF object from the buffer.
#
# The next three tokens must be the object id, the generation number and
# the "obj" keyword; each is checked through the Error assertion helpers.
# The object body is then parsed, and the token that follows it decides
# the result.
#
# id  - the object ID expected at the current buffer position
# gen - the object revision (generation) number expected after the ID
#
# Returns the parsed body when it is followed by "endobj", or the result
# of #stream (called with the body) when it is followed by "stream".
# Raises MalformedPDFError if any other token follows the body.
def object(id, gen)
  Error.assert_equal(parse_token, id)
  Error.assert_equal(parse_token, gen)
  Error.str_assert(parse_token, "obj")

  body = parse_token
  terminator = parse_token

  return body if "endobj" == terminator
  return stream(body) if "stream" == terminator

  raise MalformedPDFError, "PDF malformed, unexpected token #{terminator}"
end

#parse_token(operators = {}) ⇒ Object

Reads the next token from the underlying buffer and converts it to an appropriate object

operators - a hash of supported operators to read from the underlying buffer.



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/pdf/reader/parser.rb', line 45

def parse_token (operators={})
  ref = Reference.from_buffer(@buffer) and return ref
  token = @buffer.token

  case token
  when nil                        then return nil
  when "/"                        then return @buffer.token.to_sym
  when "<<"                       then return dictionary()
  when "["                        then return array()
  when "("                        then return string()
  when "<"                        then return hex_string()
  when "true"                     then return true
  when "false"                    then return false
  when "null"                     then return nil
  when "obj", "endobj"            then return Token.new(token)
  when "stream", "endstream"      then return Token.new(token)
  when ">>", "]", ">"             then return Token.new(token)
  else
    if operators.has_key?(token)  then return Token.new(token)
    else                          return token.to_f
    end
  end
end

#stream(dict) ⇒ Object

Decodes the contents of a PDF Stream and returns it as a PDF::Reader::Stream built from the stream's dictionary and its decoded data.

Raises:



201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/pdf/reader/parser.rb', line 201

# Reads the body of a PDF stream from the buffer and runs it through the
# filters named in its dictionary.
#
# dict - the stream dictionary (a Hash). It must contain :Length; :Filter
#        and :DecodeParms are optional and may each be a single value or
#        an Array of values, matched to each other by position.
#
# Returns a PDF::Reader::Stream built from +dict+ and the decoded data.
# Raises MalformedPDFError if +dict+ has no :Length entry. The two
# Error.str_assert calls check that "endstream" and "endobj" follow the
# data.
def stream(dict)
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
  data = @buffer.read(@xref.object(dict[:Length]))

  Error.str_assert(parse_token, "endstream")
  Error.str_assert(parse_token, "endobj")

  if dict.has_key?(:Filter)
    parms = dict[:DecodeParms]
    # Kernel#Array turns a Hash into a list of [key, value] pairs, so a
    # lone parameter dictionary has to be wrapped by hand to stay intact.
    options = parms.kind_of?(::Hash) ? [parms] : Array(parms)

    Array(dict[:Filter]).each_with_index do |filter, index|
      data = Filter.new(filter, options[index]).filter(data)
    end
  end

  PDF::Reader::Stream.new(dict, data)
end

#string ⇒ Object

Reads a PDF String from the buffer and converts it to a Ruby String



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/pdf/reader/parser.rb', line 115

# Reads a PDF literal string from the buffer and converts it to a Ruby
# String.
#
# parse_token calls this after it has consumed the opening "(", so the
# method starts with one parenthesis open and reads until that parenthesis
# is balanced. Unescaped nested parentheses are kept in the result;
# backslash escapes (\n \r \t \b \f \( \) \\ and 1-3 digit numeric codes)
# are translated. Any other backslash pair is dropped from the output.
#
# Returns the decoded String (without the outer parentheses).
# Raises MalformedPDFError if the buffer reports eof? before the string is
# closed.
def string
  str = ""
  count = 1 # number of "(" currently open, including the one already consumed

  while count != 0
    # NOTE(review): presumably makes sure @buffer.raw holds data to scan -
    # confirm against PDF::Reader::Buffer#ready_token.
    @buffer.ready_token(false, false)

    # find the first occurrence of ( ) or \
    #
    # Only these three bytes are special inside a literal string. Square
    # brackets used to be in this list as well, but nothing below handles
    # them, so they were silently removed from the result.
    #
    # I originally just used the regexp form of index(), but it seems to be
    # buggy on some OSX systems (returns nil when there is a match). The
    # block form of index() is more reliable, but only works on 1.8.7 or
    # greater.
    #
    if RUBY_VERSION >= "1.8.7"
      i = @buffer.raw.unpack("C*").index { |n| [40, 41, 92].include?(n) }
    else
      i = @buffer.raw.index(/[\\\(\)]/)
    end

    if i.nil?
      # nothing special in the current raw data: keep all of it and go
      # around again for more
      str << @buffer.raw + "\n"
      @buffer.raw.replace("")
      # if a content stream opens a string, but never closes it, we'll
      # hit the end of the stream and still be appending stuff to the
      # string. bad! This check prevents a hard loop.
      raise MalformedPDFError, 'unterminated string in content stream' if @buffer.eof?
      next
    end

    # copy everything before the special character; it is now at raw[0]
    str << @buffer.head(i, false)
    to_remove = 1

    case @buffer.raw[0, 1]
    when "("
      str << "("
      count += 1
    when ")"
      count -= 1
      # the parenthesis that closes the whole string is not part of it
      str << ")" unless count == 0
    when "\\"
      to_remove = 2
      case @buffer.raw[1, 1]
      when ""   then to_remove = 1 # backslash is the last char of raw
      when "n"  then str << "\n"
      when "r"  then str << "\r"
      when "t"  then str << "\t"
      when "b"  then str << "\b"
      when "f"  then str << "\f"
      when "("  then str << "("
      when ")"  then str << ")"
      when "\\" then str << "\\"
      else
        # numeric character code, interpreted as octal
        if m = @buffer.raw.match(/^\\(\d{1,3})/)
          to_remove = m[0].size
          str << m[1].oct.chr
        end
      end
    end

    # discard the special character (and its escape sequence, if any)
    @buffer.head(to_remove, false)
  end
  str
end