Class: CombinePDF::PDFParser

Inherits:
Object
  • Object
show all
Defined in:
lib/combine_pdf/combine_pdf_parser.rb

Overview

This is the Parser class.

It takes PDF data and parses it.

The information is then used to initialize a PDF object.

This is an internal class. You don’t need it.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ PDFParser

when creating a parser, it is important to set the data (String) we wish to parse.

the data is required and it is not possible to set the data at a later stage

string

the data to be parsed, as a String object.

Raises:

  • (TypeError)


42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 42

# Creates a parser for the given PDF data.
#
# The data is kept as a binary (ASCII-8BIT) copy, so the caller's string is
# neither re-encoded in place nor required to be mutable (frozen strings are
# accepted). The data cannot be replaced at a later stage.
#
# string:: the data to be parsed, as a String object.
#
# Raises TypeError if +string+ is not a String.
def initialize(string)
	raise TypeError, "couldn't parse any data, expecting type String" unless string.is_a? String
	# dup before force_encoding: force_encoding mutates its receiver, which would
	# change the caller's object and raise on a frozen string.
	@string_to_parse = string.dup.force_encoding(Encoding::ASCII_8BIT)
	@literal_strings = []
	@hex_strings = []
	@streams = []
	@parsed = []
	@root_object = {}
	@info_object = {}
	@version = nil
	@scanner = nil
end

Instance Attribute Details

#info_objectObject (readonly)

the info and root objects, as found (if found) in the PDF file.

They are mainly used to know whether the file is (or was) encrypted and to get more details.



35
36
37
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 35

# Read-only accessor: hands back whatever is currently stored in @info_object.
def info_object; @info_object; end

#parsedObject (readonly)

the array containing all the parsed data (PDF Objects)



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Read-only accessor: hands back whatever is currently stored in @parsed.
def parsed; @parsed; end

#root_objectObject (readonly)

the info and root objects, as found (if found) in the PDF file.

They are mainly used to know whether the file is (or was) encrypted and to get more details.



35
36
37
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 35

# Read-only accessor: hands back whatever is currently stored in @root_object.
def root_object; @root_object; end

#versionObject (readonly)

a Float representing the PDF version of the data parsed (if exists).



31
32
33
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 31

# Read-only accessor: hands back whatever is currently stored in @version.
def version; @version; end

Instance Method Details

#_parse_Object

The actual recursive parsing is done here.

This is an internal function, but it was left exposed for possible future features.



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 129

# Recursively tokenizes the data held by @scanner, starting at its current
# position, and converts it into Ruby objects.
#
# Arrays become Array, dictionaries become Hash (names become Symbols),
# strings become binary Strings, numbers become Integer / Float, and
# `true` / `false` / `null` become true / false / nil. A stream is attached to
# the preceding dictionary under :raw_stream_content. Indirect objects and
# references carry :indirect_reference_id and :indirect_generation_number.
# When an xref section with a trailer is met, the trailer dictionary is stored
# in @root_object.
#
# The method calls itself for every `[` and `<<`, and returns to the caller
# when the matching `]` or `>>` is consumed.
#
# Returns the Array of objects collected at the current nesting level.
def _parse_
	out = []
	# warn "Scaning for objects, starting at #{@scanner.pos}: #{@scanner.peek(10)}"
	while @scanner.rest? do
		case
		##########################################
		## parse an Array
		##########################################
		when @scanner.scan(/\[/)
			out << _parse_
		##########################################
		## parse a Dictionary
		##########################################
		when @scanner.scan(/<</)
			data = _parse_
			obj = {}
			# the flat token list alternates key, value, key, value...
			obj[data.shift] = data.shift while data[0]
			out << obj
		##########################################
		## return content of array or dictionary
		##########################################
		when @scanner.scan(/\]/), @scanner.scan(/>>/)
			return out
		##########################################
		## parse a Stream
		##########################################
		when @scanner.scan(/stream[\r]?[\n]/)
			str = @scanner.scan_until(/endstream/)
			# need to remove end of stream
			if out.last.is_a? Hash
				out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
				# out.last[:raw_stream_content] = str.gsub(/[\n\r]?[\n\r]?endstream/, "")
			else
				warn "Stream not attached to dictionary!"
				out << str[0...-10].force_encoding(Encoding::ASCII_8BIT)
			end
		##########################################
		## parse an Object after finished
		##########################################
		when @scanner.scan(/endobj/)
			# warn "Proccessing Object"
			# the three last tokens are: id, generation, content (popped in reverse order)
			if out.last.is_a? Hash
				out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
			else
				out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
			end
		##########################################
		## parse a Hex String
		##########################################
		# Hex digits may be upper or lower case and white space between them is
		# ignored; `<>` is an empty string. (`<<` was already handled above.)
		when str = @scanner.scan(/<[0-9a-fA-F\s]*>/)
			# warn "Found a hex string"
			# pack('H*') pads an odd number of digits with a trailing 0 nibble.
			out << [str[1..-2].gsub(/\s+/, '')].pack('H*')
		##########################################
		## parse a Literal String
		##########################################
		when @scanner.scan(/\(/)
			# warn "Found a literal string"
			str = ''
			count = 1 # nesting depth of unescaped parentheses
			while count > 0 && @scanner.rest? do
				chunk = @scanner.scan_until(/[\(\)]/)
				unless chunk
					# No further parenthesis exists: the string is unterminated. A failed
					# scan_until does not advance the scanner, so the loop must be left
					# here or the same position would be scanned forever.
					warn "Unknown error parsing string at #{@scanner.pos}!"
					break
				end
				str += chunk
				seperator_count = 0
				seperator_count += 1 while str[-2-seperator_count] == "\\"
				# an odd number of preceding backslashes escapes the parenthesis itself
				next if seperator_count.odd?
				count += (str[-1] == '(' ? 1 : -1)
			end
			# The PDF formatted string is: str[0..-2]
			# now staring to convert to regular string
			str_bytes = str[0..-2].bytes
			str = []
			until str_bytes.empty?
				case str_bytes[0]
				when 13 # eol - \r
					# An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
					# shall be treated as a byte value of (0Ah),
					# irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
					str_bytes.shift
					str_bytes.shift if str_bytes[0] == 10
					str << 10
				when 10 # eol - \n
					# same rule as above, for an end-of-line marker that starts with LINE FEED
					str_bytes.shift
					str_bytes.shift if str_bytes[0] == 13
					str << 10
				when 92 # "\\".ord == 92
					str_bytes.shift
					rep = str_bytes.shift
					case rep
					when 110 #n
						str << 10 #new line
					when 114 #r
						str << 13 # CR
					when 116 #t
						str << 9 #tab
					when 98 #b
						str << 8 # backspace
					when 102 #f
						str << 12 # form feed
					when 48..55 # octal character code: \d, \dd or \ddd
						code = rep - 48
						2.times do
							break unless str_bytes[0] && str_bytes[0].between?(48, 55)
							code = (code << 3) + (str_bytes.shift - 48)
						end
						str << (code & 0xFF) # high-order overflow is ignored
					when 10 # escaped new line, ignore (line continuation)
						str_bytes.shift if str_bytes[0] == 13
					when 13 # escaped new line (or double notation for new line), ignore
						str_bytes.shift if str_bytes[0] == 10
					when nil
						# a lone backslash at the very end: there is nothing to escape
					else
						# unknown escape: the backslash is dropped and the character kept
						str << rep
					end
				else
					str << str_bytes.shift
				end
			end
			out << str.pack('C*')
		##########################################
		## Parse a comment
		##########################################
		when @scanner.scan(/\%/)
			# is a comment, skip until new line; a comment that runs to the end of the
			# data (no new line) consumes everything that is left.
			@scanner.skip_until(/[\n\r]+/) || @scanner.terminate
		##########################################
		## Parse a Name
		##########################################
		# old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
		# I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
		# all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
		# all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
		when str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+/)
			# drop the leading slash and decode #xx hex escapes before symbolizing
			out << ( str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) {|a| a[1..2].hex.chr } ).to_sym
		##########################################
		## Parse a Number
		##########################################
		when str = @scanner.scan(/[\+\-\.\d]+/)
			str.match(/\./) ? (out << str.to_f) : (out << str.to_i)
		##########################################
		## Parse an Object Reference
		##########################################
		when @scanner.scan(/R/)
			# the two preceding tokens are: id, generation (popped in reverse order)
			out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
		##########################################
		## Parse Bool - true and after false
		##########################################
		when @scanner.scan(/true/)
			out << true
		when @scanner.scan(/false/)
			out << false
		##########################################
		## Parse NULL - null
		##########################################
		when @scanner.scan(/null/)
			out << nil
		##########################################
		## XREF - check for encryption... anything else?
		##########################################
		when @scanner.scan(/xref/)
			##########
			## get root object to check for encryption
			@scanner.scan_until(/(trailer)|(\%EOF)/)

			# a match ending in 'r' means "trailer" was found (rather than "%EOF")
			if @scanner.matched[-1] == 'r'
				if @scanner.skip_until(/<</)
					data = _parse_
					@root_object = {}
					@root_object[data.shift] = data.shift while data[0]
				end
				##########
				## skip untill end of segment, maked by %%EOF
				@scanner.skip_until(/\%\%EOF/)
			end

		when @scanner.scan(/[\s]+/) , @scanner.scan(/obj[\s]*/)
			# do nothing
			# warn "White Space, do nothing"
			nil
		else
			# always advance
			# warn "Advnacing for unknown reason..."
			@scanner.pos = @scanner.pos + 1
		end
	end
	out
end

#parseObject

parse the data in the new parser (the data already set through the initialize / new method)



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 56

# Parses the data given to the initializer and returns the collection of PDF
# objects found in it.
#
# The result is memoized: once @parsed holds any objects it is returned as is.
# Along the way the method:
# - reads the PDF version from the `%PDF-` header (when present) into @version;
# - takes the root from the trailer found by _parse_, or, when none was found,
#   from the merged :XRef dictionaries;
# - decrypts the objects when the root carries an :Encrypt entry;
# - for version 1.5 and up, extracts the objects stored in :ObjStm object
#   streams and removes the :ObjStm and :XRef dictionaries from the result;
# - resolves references and moves the :Info dictionary into @info_object.
#
# Returns the Array stored in @parsed.
#
# Raises RuntimeError when no root (trailer or XRef dictionary) was found.
def parse
	return @parsed unless @parsed.empty?
	@scanner = StringScanner.new @string_to_parse
	@scanner.pos = 0
	if @scanner.scan(/\%PDF\-[\d\-\.]+/)
		@version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
	end

	warn "Starting to parse PDF data."
	@parsed = _parse_

	if @root_object.empty?
		# no trailer was found - fall back to the cross reference stream dictionaries
		@parsed.each do |obj|
			@root_object.merge!(obj) if obj.is_a?(Hash) && obj[:Type] == :XRef
		end
	end
	raise "root is unknown - cannot determine if file is Encrypted" if @root_object.empty?
	warn "Injecting actual values into root object: #{@root_object}."
	PDFOperations.change_references_to_actual_values @parsed, @root_object

	if @root_object[:Encrypt]
		warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
		decryptor = PDFDecrypt.new @parsed, @root_object
		decryptor.decrypt
		#do we really need to apply to @parsed? No, there is no need.
	end
	# @version stays nil when the data has no %PDF- header; skip the 1.5 features then
	if @version && @version >= 1.5 # code placement for object streams
		## search for objects streams
		object_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm}
		unless object_streams.empty?
			warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

			object_streams.each do |o|
				warn "Attempting #{o.select {|k,v| k != :raw_stream_content}}"
				## un-encode (using the correct filter) the object streams
				PDFFilter.inflate_object o
				## extract objects from stream to top level arry @parsed
				@scanner = StringScanner.new o[:raw_stream_content]
				stream_data = _parse_
				# the stream starts with (id, offset) integer pairs; only the ids are kept.
				# Integer (not Fixnum, which newer Rubies no longer define) matches on every Ruby version.
				id_array = []
				while stream_data[0].is_a? Integer
					id_array << stream_data.shift
					stream_data.shift
				end
				# the dictionaries that follow are matched to the ids by position
				while stream_data[0].is_a? Hash
					stream_data[0][:indirect_reference_id] = id_array.shift
					stream_data[0][:indirect_generation_number] = 0
					@parsed << stream_data.shift
				end
			end
			## remove object streams and XREF dictionaries
			@parsed.reject! {|obj| obj.is_a?(Hash) && (obj[:Type] == :ObjStm || obj[:Type] == :XRef)}
		end
	end
	PDFOperations.change_references_to_actual_values @parsed, @root_object
	@info_object = @root_object[:Info]
	if @info_object && @info_object.is_a?(Hash)
		@parsed.delete @info_object
		PDFOperations.change_references_to_actual_values @parsed, @info_object
		PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
	else
		@info_object = {}
	end
	warn "setting parsed collection and returning collection."
	@parsed
end