Class: CombinePDF::PDFParser

Inherits:

Object

Object
CombinePDF::PDFParser

show all

Defined in: lib/combine_pdf/combine_pdf_parser.rb

Overview

This is the Parser class.

It takes PDF data and parses it.

The information is then used to initialize a PDF object.

This is an internal class; you don’t need to use it directly.

Instance Attribute Summary collapse

#info_object ⇒ Object readonly

the info and root objects, as found (if found) in the PDF file.
#parsed ⇒ Object readonly

the array containing all the parsed data (PDF Objects).
#root_object ⇒ Object readonly

the info and root objects, as found (if found) in the PDF file.
#version ⇒ Object readonly

a Float representing the PDF version of the data parsed (if exists).

Instance Method Summary collapse

#_parse_ ⇒ Object

the actual recursive parsing is done here.
#initialize(string) ⇒ PDFParser constructor

when creating a parser, it is important to set the data (String) we wish to parse.
#parse ⇒ Object

parse the data in the new parser (the data already set through the initialize / new method).

Constructor Details

#initialize(string) ⇒ `PDFParser`

when creating a parser, it is important to set the data (String) we wish to parse.

the data is required and it is not possible to set the data at a later stage

string: the data to be parsed, as a String object.

Raises:

(TypeError)

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 42

# Creates a parser for the given PDF data.
#
# The data is required and cannot be set at a later stage; call #parse
# afterwards to actually parse it.
#
# string:: the data to be parsed, as a String object. Unless it is frozen,
#          its encoding is changed in place to ASCII-8BIT (binary).
#
# Raises TypeError unless +string+ is a String.
def initialize(string)
	raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
	# force_encoding mutates its receiver and raises on a frozen String,
	# so a frozen input is copied first (non-frozen input is still used as is).
	string = string.dup if string.frozen?
	@string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
	@literal_strings = []
	@hex_strings = []
	@streams = []
	@parsed = []
	@root_object = {}
	@info_object = {}
	@version = nil
	@scanner = nil
end

Instance Attribute Details

#info_object ⇒ `Object` (readonly)

the info and root objects, as found (if found) in the PDF file.

they are mainly used to determine whether the file is (or was) encrypted and to get more details.



35
36
37

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 35

# Reader for the info object; returns the current value of @info_object.
def info_object; @info_object; end

#parsed ⇒ `Object` (readonly)

the array containing all the parsed data (PDF Objects)



29
30
31

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for the parsed data; returns the current value of @parsed.
def parsed; @parsed; end

#root_object ⇒ `Object` (readonly)

the info and root objects, as found (if found) in the PDF file.

they are mainly used to determine whether the file is (or was) encrypted and to get more details.



35
36
37

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 35

# Reader for the root object; returns the current value of @root_object.
def root_object; @root_object; end

#version ⇒ `Object` (readonly)

a Float representing the PDF version of the data parsed (if exists).



31
32
33

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 31

# Reader for the PDF version; returns the current value of @version.
def version; @version; end

Instance Method Details

#_parse_ ⇒ `Object`

the actual recursive parsing is done here.

this is an internal function, but it was left exposed for possible future features.

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 129

# The actual recursive parsing is done here.
#
# Reads tokens from @scanner (a StringScanner, which must already be set)
# starting at its current position and collects the resulting Ruby objects:
#
# * arrays become Arrays and dictionaries become Hashes (both through a recursive call);
# * names become Symbols, numbers become Integers or Floats;
# * literal and hex strings become binary Strings;
# * a stream is stored under :raw_stream_content in the preceding dictionary;
# * `endobj` and `R` wrap the preceding values with
#   :indirect_reference_id / :indirect_generation_number.
#
# When an `xref` section with a trailer is met, the trailer dictionary is
# stored in @root_object.
#
# Returns the Array of objects collected until the closing `]` / `>>` of the
# current nesting level, or until the end of the data.
def _parse_
	out = []
	while @scanner.rest?
		case
		##########################################
		## parse an Array
		##########################################
		when @scanner.scan(/\[/)
			out << _parse_
		##########################################
		## parse a Dictionary
		##########################################
		when @scanner.scan(/<</)
			data = _parse_
			obj = {}
			# the parsed data is a flat key, value, key, value... list
			obj[data.shift] = data.shift while data[0]
			out << obj
		##########################################
		## return content of array or dictionary
		##########################################
		when @scanner.scan(/\]/), @scanner.scan(/>>/)
			return out
		##########################################
		## parse a Stream
		##########################################
		when @scanner.scan(/stream[\r]?[\n]/)
			str = @scanner.scan_until(/endstream/)
			if str
				# removes "endstream" (9 bytes) and only one EOL char (\n or \r)
				str = str[0...-10]
			else
				# no endstream keyword: use whatever data is left instead of failing on nil
				warn "Stream is missing its endstream keyword!"
				str = @scanner.rest
				@scanner.terminate
			end
			if out.last.is_a? Hash
				out.last[:raw_stream_content] = str
			else
				warn "Stream not attached to dictionary!"
				out << str.force_encoding(Encoding::ASCII_8BIT)
			end
		##########################################
		## parse an Object after finished
		##########################################
		when @scanner.scan(/endobj/)
			# the values are popped in reverse order: content, generation number, object id
			if out.last.is_a? Hash
				out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
			else
				out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
			end
		##########################################
		## parse a Hex String
		##########################################
		when str = @scanner.scan(/<[0-9a-fA-F\s]*>/)
			# hex digits may be upper or lower case and may be separated by white space;
			# pack('H*') pads an odd number of digits with a trailing zero.
			out << [str[1..-2].gsub(/\s/, '')].pack('H*')
		##########################################
		## parse a Literal String
		##########################################
		when @scanner.scan(/\(/)
			raw = String.new
			depth = 1 # nesting level of unescaped parentheses
			while depth > 0
				chunk = @scanner.scan_until(/[\(\)]/)
				unless chunk
					# no closing parenthesis left: take the remaining data and stop,
					# otherwise the scanner would never advance and this loop would never end.
					warn "Unknown error parsing string at #{@scanner.pos}!"
					raw << @scanner.rest
					@scanner.terminate
					break
				end
				raw << chunk
				# a parenthesis is escaped only when preceded by an odd number of backslashes
				backslashes = 0
				backslashes += 1 while raw[-2 - backslashes] == "\\"
				next if backslashes.odd?
				if chunk.end_with?('(')
					depth += 1
				else
					depth -= 1
				end
			end
			raw = raw[0..-2] if depth == 0 # drop the closing parenthesis
			# now converting the PDF formatted string to a regular string
			bytes = raw.bytes
			decoded = []
			until bytes.empty?
				byte = bytes.shift
				case byte
				when 13 # eol - \r
					# An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
					# shall be treated as a byte value of (0Ah),
					# irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
					bytes.shift if bytes[0] == 10
					decoded << 10
				when 10 # eol - \n
					bytes.shift if bytes[0] == 13
					decoded << 10
				when 92 # "\\".ord == 92
					escaped = bytes.shift
					case escaped
					when 110 # n
						decoded << 10 # new line
					when 114 # r
						decoded << 13 # CR
					when 116 # t
						decoded << 9 # tab
					when 98 # b
						decoded << 8 # backspace
					when 102 # f
						decoded << 12 # form feed
					when 48..55 # octal notation for a byte: one to three digits
						value = escaped - 48
						2.times do
							break unless bytes[0] && bytes[0].between?(48, 55)
							value = (value << 3) + (bytes.shift - 48)
						end
						decoded << (value & 0xFF) # high-order overflow is ignored
					when 10 # escaped new line, ignore
						bytes.shift if bytes[0] == 13
					when 13 # escaped new line (or double notation for new line), ignore
						bytes.shift if bytes[0] == 10
					when nil
						# a trailing backslash with nothing to escape, ignore
					else
						# unknown escape: the backslash is dropped, the character is kept
						decoded << escaped
					end
				else
					decoded << byte
				end
			end
			out << decoded.pack('C*')
		##########################################
		## Parse a comment
		##########################################
		when @scanner.scan(/\%/)
			# is a comment, skip until new line (or until the end of the data, if there is no new line)
			@scanner.skip_until(/[\n\r]+/) || @scanner.terminate
		##########################################
		## Parse a Name
		##########################################
		# a name is a slash followed by any characters that aren't white space or delimiters
		when str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+/)
			# "#xx" sequences are replaced with the byte they encode
			out << ( str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) {|a| a[1..2].hex.chr } ).to_sym
		##########################################
		## Parse a Number
		##########################################
		when str = @scanner.scan(/[\+\-\.\d]+/)
			out << (str.include?('.') ? str.to_f : str.to_i)
		##########################################
		## Parse an Object Reference
		##########################################
		when @scanner.scan(/R/)
			# the generation number and the object id were parsed just before the R
			out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
		##########################################
		## Parse Bool - true and after false
		##########################################
		when @scanner.scan(/true/)
			out << true
		when @scanner.scan(/false/)
			out << false
		##########################################
		## Parse NULL - null
		##########################################
		when @scanner.scan(/null/)
			out << nil
		##########################################
		## XREF - check for encryption... anything else?
		##########################################
		when @scanner.scan(/xref/)
			##########
			## get root object to check for encryption
			## (only when a trailer was found; if neither "trailer" nor "%EOF" exists, nothing is skipped)
			if @scanner.scan_until(/(trailer)|(\%EOF)/) && @scanner.matched[-1] == 'r'
				if @scanner.skip_until(/<</)
					data = _parse_
					@root_object = {}
					@root_object[data.shift] = data.shift while data[0]
				end
				##########
				## skip until end of segment, marked by %%EOF
				@scanner.skip_until(/\%\%EOF/)
			end
		when @scanner.scan(/[\s]+/), @scanner.scan(/obj[\s]*/)
			# white space or the "obj" keyword, do nothing
			nil
		else
			# unknown data, always advance
			@scanner.pos = @scanner.pos + 1
		end
	end
	out
end

#parse ⇒ `Object`

parse the data in the new parser (the data already set through the initialize / new method)

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 56

# Parses the data given to the initializer and returns the Array of parsed
# objects (also available afterwards through #parsed).
#
# The work is done only once: when @parsed is not empty it is returned as is.
#
# The steps are:
# * read the version from the "%PDF-" header, if there is one;
# * parse all the objects using _parse_;
# * if no trailer dictionary was found, merge the XRef stream dictionaries into the root object;
# * replace references with their actual values (PDFOperations);
# * decrypt the data (PDFDecrypt) if the root object has an :Encrypt entry;
# * for version 1.5 and above, extract the objects stored inside object streams;
# * set @info_object from the root object's :Info entry (or an empty Hash).
#
# Raises a RuntimeError if no root object could be found.
def parse
	return @parsed unless @parsed.empty?
	@scanner = StringScanner.new @string_to_parse
	@scanner.pos = 0
	if @scanner.scan(/\%PDF\-[\d\-\.]+/)
		@version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
	end

	warn "Starting to parse PDF data."
	@parsed = _parse_

	if @root_object.empty?
		# no trailer was found, use the cross reference streams' dictionaries instead
		@parsed.each do |obj|
			@root_object.merge!(obj) if obj.is_a?(Hash) && obj[:Type] == :XRef
		end
	end
	raise "root is unknown - cannot determine if file is Encrypted" if @root_object.empty?
	warn "Injecting actual values into root object: #{@root_object}."
	PDFOperations.change_references_to_actual_values @parsed, @root_object

	if @root_object[:Encrypt]
		warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
		decryptor = PDFDecrypt.new @parsed, @root_object
		decryptor.decrypt
	end
	# @version is nil when the data has no "%PDF-" header
	if @version && @version >= 1.5 # code placement for object streams
		## search for objects streams
		object_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm}
		unless object_streams.empty?
			warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

			object_streams.each do |o|
				warn "Attempting #{o.select {|k,v| k != :raw_stream_content}}"
				## un-encode (using the correct filter) the object streams
				PDFFilter.inflate_object o
				## extract objects from stream to top level array @parsed
				@scanner = StringScanner.new o[:raw_stream_content]
				stream_data = _parse_
				# the stream starts with pairs of integers (object id, offset); only the ids are kept.
				# Integer is used since Fixnum no longer exists in current Ruby versions.
				id_array = []
				while stream_data[0].is_a? Integer
					id_array << stream_data.shift
					stream_data.shift
				end
				# the objects follow, in the same order as their ids
				while stream_data[0].is_a? Hash
					stream_data[0][:indirect_reference_id] = id_array.shift
					stream_data[0][:indirect_generation_number] = 0
					@parsed << stream_data.shift
				end
			end
			## remove object streams and XREF dictionaries
			@parsed.reject! {|obj| obj.is_a?(Hash) && (obj[:Type] == :ObjStm || obj[:Type] == :XRef)}
		end
	end
	PDFOperations.change_references_to_actual_values @parsed, @root_object
	@info_object = @root_object[:Info]
	if @info_object && @info_object.is_a?(Hash)
		@parsed.delete @info_object
		PDFOperations.change_references_to_actual_values @parsed, @info_object
		PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
	else
		@info_object = {}
	end
	warn "setting parsed collection and returning collection."
	@parsed
end

Class: CombinePDF::PDFParser

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ PDFParser

Instance Attribute Details

#info_object ⇒ Object (readonly)

#parsed ⇒ Object (readonly)

#root_object ⇒ Object (readonly)

#version ⇒ Object (readonly)