Class: CombinePDF::PDFParser

Inherits:
Object
  • Object
show all
Defined in:
lib/combine_pdf/combine_pdf_parser.rb

Overview

This is the Parser class.

It takes PDF data and parses it.

The information is then used to initialize a PDF object.

This is an internal class; you don’t need to use it directly.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ PDFParser

when creating a parser, it is important to set the data (String) we wish to parse.

the data is required and it is not possible to set the data at a later stage

string

the data to be parsed, as a String object.

Raises:

  • (TypeError)


42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 42

# Creates a parser for the given PDF data.
#
# The parser works on a binary-tagged copy of the data, so the String passed in
# by the caller is never re-encoded and frozen Strings are accepted.
#
# @param string [String] the raw PDF data to be parsed.
# @raise [TypeError] if +string+ is not a String.
def initialize(string)
  raise TypeError, "couldn't parse and data, expecting type String" unless string.is_a? String
  # dup first: force_encoding mutates its receiver (and raises on a frozen String).
  @string_to_parse = string.dup.force_encoding(Encoding::ASCII_8BIT)
  @literal_strings = []
  @hex_strings = []
  @streams = []
  @parsed = []
  @root_object = {}
  @info_object = {}
  @version = nil
  @scanner = nil
end

Instance Attribute Details

#info_objectObject (readonly)

the info and root objects, as found (if found) in the PDF file.

they are mainly used to determine whether the file is (or was) encrypted and to get more details.



35
36
37
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 35

# Reader for the @info_object instance variable.
def info_object; @info_object; end

#parsedObject (readonly)

the array containing all the parsed data (PDF Objects)



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for the @parsed instance variable (the collection of parsed PDF objects).
def parsed; @parsed; end

#root_objectObject (readonly)

the info and root objects, as found (if found) in the PDF file.

they are mainly used to determine whether the file is (or was) encrypted and to get more details.



35
36
37
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 35

# Reader for the @root_object instance variable.
def root_object; @root_object; end

#versionObject (readonly)

a Float representing the PDF version of the data parsed (if exists).



31
32
33
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 31

# Reader for the @version instance variable (nil until a version has been set).
def version; @version; end

Instance Method Details

#_parse_Object

the actual recursive parsing is done here.

this is an internal function, but it was left exposed for possible future features.



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 129

# Recursively parses PDF syntax, starting at the current position of @scanner.
#
# The method consumes input until the scanner is exhausted, or until it meets the
# closing token of an array (`]`) or dictionary (`>>`), in which case it returns
# the items collected so far to the caller (normally a recursive invocation).
#
# Produced values:
# * Array   - PDF arrays
# * Hash    - dictionaries; a stream body is stored under :raw_stream_content,
#             indirect objects get :indirect_reference_id / :indirect_generation_number,
#             references get :is_reference_only
# * String  - literal and hexadecimal strings (decoded to raw bytes)
# * Symbol  - names
# * Integer / Float, true / false, nil
#
# Side effect: when a classic `xref ... trailer` section is found, the trailer
# dictionary is stored in @root_object.
#
# @return [Array] the objects parsed at this nesting level.
def _parse_
  out = []
  # warn "Scaning for objects, starting at #{@scanner.pos}: #{@scanner.peek(10)}"
  while @scanner.rest? do
    case
    ##########################################
    ## parse an Array
    ##########################################
    when @scanner.scan(/\[/)
      out << _parse_
    ##########################################
    ## parse a Dictionary
    ##########################################
    when @scanner.scan(/<</)
      data = _parse_
      obj = {}
      # the flat list alternates key, value, key, value...
      obj[data.shift] = data.shift while data[0]
      out << obj
    ##########################################
    ## return content of array or dictionary
    ##########################################
    when @scanner.scan(/\]/), @scanner.scan(/>>/)
      return out
    ##########################################
    ## parse a Stream
    ##########################################
    when @scanner.scan(/stream[\r]?[\n]/)
      # NOTE: scan_until returns nil when no "endstream" follows; that case is not handled here.
      str = @scanner.scan_until(/endstream/)
      # [0...-10] drops "endstream" (9 bytes) plus exactly one preceding EOL byte (\n or \r)
      if out.last.is_a? Hash
        out.last[:raw_stream_content] = str[0...-10]
      else
        warn "Stream not attached to dictionary!"
        out << str[0...-10].force_encoding(Encoding::ASCII_8BIT)
      end
    ##########################################
    ## parse an Object after finished
    ##########################################
    when @scanner.scan(/endobj/)
      # at this point `out` ends with: id, generation, value. Pops run right to left.
      if out.last.is_a? Hash
        out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
      else
        out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
      end
    ##########################################
    ## parse a Hex String
    ##########################################
    # hex digits may be upper or lower case and may be separated by white space
    when str = @scanner.scan(/<[0-9a-fA-F\s]*>/)
      # pack('H*') pads an odd number of digits with a trailing 0
      out << [str[1..-2].gsub(/\s+/, '')].pack('H*')
    ##########################################
    ## parse a Literal String
    ##########################################
    when @scanner.scan(/\(/)
      str = ''
      count = 1 # nesting depth of unescaped parentheses
      while count > 0 && @scanner.rest? do
        chunk = @scanner.scan_until(/[\(\)]/)
        if chunk.nil?
          # no more parentheses in the data: the string is unterminated.
          # Take whatever is left (plus a virtual closing parenthesis) instead of looping forever.
          warn "Unknown error parsing string at #{@scanner.pos}!"
          str += @scanner.rest + ')'
          @scanner.terminate
          break
        end
        str += chunk
        # a parenthesis is escaped only when preceded by an odd number of backslashes
        seperator_count = 0
        seperator_count += 1 while str[-2-seperator_count] == "\\"
        next if seperator_count.odd?
        if str[-1] == '('
          count += 1
        else
          count -= 1
        end
      end
      # The PDF formatted string is: str[0..-2] (the last byte is the closing parenthesis)
      # now starting to convert to a regular string
      str_bytes = str[0..-2].bytes
      decoded = []
      until str_bytes.empty?
        case str_bytes[0]
        when 13 # eol - \r
          # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
          # shall be treated as a byte value of (0Ah),
          # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
          str_bytes.shift
          str_bytes.shift if str_bytes[0] == 10
          decoded << 10
        when 10 # eol - \n
          str_bytes.shift
          str_bytes.shift if str_bytes[0] == 13
          decoded << 10
        when 92 # "\\".ord == 92
          str_bytes.shift
          rep = str_bytes.shift
          case rep
          when 110 #n
            decoded << 10 # new line
          when 114 #r
            decoded << 13 # CR
          when 116 #t
            decoded << 9 # tab
          when 98 #b
            decoded << 8 # backspace
          when 102 #f
            decoded << 12 # form feed
          when 48..55 # \ddd - one to three OCTAL digits
            code = rep - 48
            2.times do
              break unless str_bytes[0] && str_bytes[0].between?(48, 55)
              code = (code << 3) + (str_bytes.shift - 48)
            end
            decoded << (code & 0xFF) # high-order overflow is ignored
          when 10 # escaped new line, ignore
            str_bytes.shift if str_bytes[0] == 13
          when 13 # escaped new line (or double notation for new line), ignore
            str_bytes.shift if str_bytes[0] == 10
          else
            # unknown escape: the backslash is dropped (rep is nil for a trailing backslash)
            decoded << rep unless rep.nil?
          end
        else
          decoded << str_bytes.shift
        end
      end
      out << decoded.pack('C*')
    ##########################################
    ## Parse a comment
    ##########################################
    when @scanner.scan(/\%/)
      #is a comment, skip until new line
      @scanner.skip_until(/[\n\r]+/)
    ##########################################
    ## Parse a Name
    ##########################################
    # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
    when str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+/)
      # "#xx" sequences inside a name are hex-escaped bytes
      out << ( str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) {|a| a[1..2].hex.chr } ).to_sym
    ##########################################
    ## Parse a Number
    ##########################################
    when str = @scanner.scan(/[\+\-\.\d]+/)
      str.match(/\./) ? (out << str.to_f) : (out << str.to_i)
    ##########################################
    ## Parse an Object Reference
    ##########################################
    when @scanner.scan(/R/)
      # the two preceding numbers are the object id and the generation number
      out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
    ##########################################
    ## Parse Bool - true and after false
    ##########################################
    when @scanner.scan(/true/)
      out << true
    when @scanner.scan(/false/)
      out << false
    ##########################################
    ## Parse NULL - null
    ##########################################
    when @scanner.scan(/null/)
      out << nil
    ##########################################
    ## XREF - check for encryption... anything else?
    ##########################################
    when @scanner.scan(/xref/)
      ##########
      ## get root object to check for encryption
      # `matched` ends with 'r' only when "trailer" (not "%EOF") was found;
      # the scan_until guard avoids calling [] on nil when neither marker exists.
      if @scanner.scan_until(/(trailer)|(\%EOF)/) && @scanner.matched[-1] == 'r'
        if @scanner.skip_until(/<</)
          data = _parse_
          @root_object = {}
          @root_object[data.shift] = data.shift while data[0]
        end
        ##########
        ## skip until end of segment, marked by %%EOF
        @scanner.skip_until(/\%\%EOF/)
      end
    when @scanner.scan(/[\s]+/) , @scanner.scan(/obj[\s]*/)
      # white space or the "obj" keyword: nothing to collect
      nil
    else
      # unknown byte: always advance, so the loop is guaranteed to make progress
      @scanner.pos = @scanner.pos + 1
    end
  end
  out
end

#parseObject

parse the data in the new parser (the data already set through the initialize / new method)



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 56

# Parses the data given to the constructor and returns the collection of PDF objects.
#
# The result is memoized: once @parsed is non-empty, later calls return it as is.
#
# Steps performed:
# 1. read the PDF version from the "%PDF-x.y" header, if present (otherwise @version stays nil);
# 2. run the recursive parser (_parse_);
# 3. if no trailer dictionary was found, build @root_object from the cross-reference stream dictionaries;
# 4. resolve references, attempt decryption when the root has an :Encrypt entry;
# 5. extract objects stored inside object streams (:ObjStm) into the top-level collection;
# 6. detach the :Info dictionary into @info_object.
#
# @return [Array] the parsed PDF objects.
# @raise [RuntimeError] if neither a trailer nor a cross-reference stream dictionary was found.
def parse
  return @parsed unless @parsed.empty?
  @scanner = StringScanner.new @string_to_parse
  @scanner.pos = 0
  if @scanner.scan(/\%PDF\-[\d\-\.]+/)
    @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
  end

  warn "Starting to parse PDF data."
  @parsed = _parse_

  if @root_object.empty?
    # no classic trailer: fall back to the dictionaries of cross-reference streams
    xref_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :XRef}
    xref_streams.each do |xref_dictionary|
      @root_object.merge! xref_dictionary
    end
  end
  raise "root is unknown - cannot determine if file is Encrypted" if @root_object.empty?
  warn "Injecting actual values into root object: #{@root_object}."
  PDFOperations.change_references_to_actual_values @parsed, @root_object

  if @root_object[:Encrypt]
    warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
    decryptor = PDFDecrypt.new @parsed, @root_object
    decryptor.decrypt
    #do we really need to apply to @parsed? No, there is no need.
  end
  # Object streams exist from PDF 1.5 on. When the header (and so the version) is
  # missing we still look for them rather than failing on a nil comparison.
  if @version.nil? || @version >= 1.5
    ## search for objects streams
    object_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm}
    unless object_streams.empty?
      warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

      object_streams.each do |o|
        warn "Attempting #{o.select {|k,v| k != :raw_stream_content}}"
        ## un-encode (using the correct filter) the object streams
        PDFFilter.inflate_object o
        ## extract objects from stream to top level array @parsed
        @scanner = StringScanner.new o[:raw_stream_content]
        stream_data = _parse_
        # the stream starts with pairs of integers; the first of each pair is kept
        # as the object id, the second is discarded
        id_array = []
        while stream_data[0].is_a? Integer
          id_array << stream_data.shift
          stream_data.shift
        end
        # the dictionaries that follow are matched to the ids in order
        while stream_data[0].is_a? Hash
          stream_data[0][:indirect_reference_id] = id_array.shift
          stream_data[0][:indirect_generation_number] = 0
          @parsed << stream_data.shift
        end
      end
      ## remove object streams and XREF dictionaries
      @parsed.reject! {|obj| obj.is_a?(Hash) && (obj[:Type] == :ObjStm || obj[:Type] == :XRef)}
    end
  end
  PDFOperations.change_references_to_actual_values @parsed, @root_object
  @info_object = @root_object[:Info]
  if @info_object.is_a?(Hash)
    @parsed.delete @info_object
    PDFOperations.change_references_to_actual_values @parsed, @info_object
    PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
  else
    @info_object = {}
  end
  warn "setting parsed collection and returning collection."
  @parsed
end