Class: CombinePDF::PDFParser

Inherits:

Object

Object
CombinePDF::PDFParser

show all

Defined in:: lib/combine_pdf/combine_pdf_parser.rb

Overview

This is the Parser class. It takes PDF data and parses it, returning an array of data. That array can be used to initialize a PDF object. The Parser class doesn’t involve itself with the file version.

Instance Attribute Summary collapse

#info_object ⇒ Object readonly

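(No description written; the following appears to be a commented-out constant from the source file that the documentation generator picked up.) LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # "\\n".bytes = [92, 110]; "\n".ord = 10 · 114 => 13, # r · 116 => 9, # t · 98 => 8, # b · 102 => 255, # f · 40 => 40, # ( · 41 => 41, # ) · 92 => 92 # \ }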
#parsed ⇒ Object readonly

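(No description written; the following appears to be a commented-out constant from the source file that the documentation generator picked up.) LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # "\\n".bytes = [92, 110]; "\n".ord = 10 · 114 => 13, # r · 116 => 9, # t · 98 => 8, # b · 102 => 255, # f · 40 => 40, # ( · 41 => 41, # ) · 92 => 92 # \ }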
#root_object ⇒ Object readonly

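(No description written; the following appears to be a commented-out constant from the source file that the documentation generator picked up.) LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # "\\n".bytes = [92, 110]; "\n".ord = 10 · 114 => 13, # r · 116 => 9, # t · 98 => 8, # b · 102 => 255, # f · 40 => 40, # ( · 41 => 41, # ) · 92 => 92 # \ }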
#version ⇒ Object readonly

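(No description written; the following appears to be a commented-out constant from the source file that the documentation generator picked up.) LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # "\\n".bytes = [92, 110]; "\n".ord = 10 · 114 => 13, # r · 116 => 9, # t · 98 => 8, # b · 102 => 255, # f · 40 => 40, # ( · 41 => 41, # ) · 92 => 92 # \ }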

Instance Method Summary collapse

#initialize(string) ⇒ PDFParser constructor

A new instance of PDFParser.
#parse ⇒ Object

Constructor Details

#initialize(string) ⇒ `PDFParser`

Returns a new instance of PDFParser.

Raises:

(TypeError)

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 30

# Prepares a parser for the given PDF data. No parsing happens here; call #parse for that.
#
# The data is handled as raw bytes (ASCII-8BIT). The encoding is changed on a private copy,
# so the caller's string keeps its own encoding and a frozen string is accepted as well.
#
# @param string [String] the raw PDF data.
# @raise [TypeError] if +string+ is not a String.
def initialize(string)
	raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
	# force_encoding works in place (and fails on frozen strings), so re-tag a copy instead
	@string_to_parse = string.dup.force_encoding(Encoding::ASCII_8BIT)
	@literal_strings = []
	@hex_strings = []
	@streams = []
	# the next four are exposed through readers and are filled in by #parse
	@parsed = []
	@root_object = {}
	@info_object = {}
	@version = nil
	# the StringScanner is created by #parse
	@scanner = nil
end

Instance Attribute Details

#info_object ⇒ `Object` (readonly)

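Note: no description was written for this attribute. The text below appears to be a commented-out constant from the source file that the documentation generator picked up:

    LITERAL_STRING_REPLACEMENT_HASH = {
      110 => 10,  # "\\n".bytes = [92, 110]; "\n".ord = 10
      114 => 13,  # r
      116 => 9,   # t
      98  => 8,   # b
      102 => 255, # f
      40  => 40,  # (
      41  => 41,  # )
      92  => 92   # \
    }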



29
30
31

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Returns the current value of @info_object.
#
# The constructor sets it to an empty Hash. #parse replaces it with the :Info entry of the
# root object when that entry is a Hash (after deleting the PRIVATE_HASH_KEYS from it), and
# otherwise resets it to an empty Hash.
def info_object
  @info_object
end

#parsed ⇒ `Object` (readonly)

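Note: no description was written for this attribute. The text below appears to be a commented-out constant from the source file that the documentation generator picked up:

    LITERAL_STRING_REPLACEMENT_HASH = {
      110 => 10,  # "\\n".bytes = [92, 110]; "\n".ord = 10
      114 => 13,  # r
      116 => 9,   # t
      98  => 8,   # b
      102 => 255, # f
      40  => 40,  # (
      41  => 41,  # )
      92  => 92   # \
    }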



29
30
31

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Returns the current value of @parsed, the collection of parsed objects.
#
# The constructor sets it to an empty Array. #parse assigns it the result of _parse_, appends
# the objects it extracts from object streams, removes the :ObjStm / :XRef dictionaries when
# object streams were found, and returns the same collection.
def parsed
  @parsed
end

#root_object ⇒ `Object` (readonly)

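Note: no description was written for this attribute. The text below appears to be a commented-out constant from the source file that the documentation generator picked up:

    LITERAL_STRING_REPLACEMENT_HASH = {
      110 => 10,  # "\\n".bytes = [92, 110]; "\n".ord = 10
      114 => 13,  # r
      116 => 9,   # t
      98  => 8,   # b
      102 => 255, # f
      40  => 40,  # (
      41  => 41,  # )
      92  => 92   # \
    }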



29
30
31

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Returns the current value of @root_object.
#
# The constructor sets it to an empty Hash. If it is still empty after the initial _parse_
# call, #parse merges every :XRef dictionary it finds into it.
# NOTE(review): presumably _parse_ normally fills this in while scanning - confirm there.
def root_object
  @root_object
end

#version ⇒ `Object` (readonly)

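Note: no description was written for this attribute. The text below appears to be a commented-out constant from the source file that the documentation generator picked up:

    LITERAL_STRING_REPLACEMENT_HASH = {
      110 => 10,  # "\\n".bytes = [92, 110]; "\n".ord = 10
      114 => 13,  # r
      116 => 9,   # t
      98  => 8,   # b
      102 => 255, # f
      40  => 40,  # (
      41  => 41,  # )
      92  => 92   # \
    }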



29
30
31

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Returns the current value of @version.
#
# The constructor sets it to nil. #parse sets it to a Float taken from the "%PDF-x.y" header
# when the data starts with one; otherwise it stays nil.
def version
  @version
end

Instance Method Details

#parse ⇒ `Object`

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 43

# Parses the PDF data handed to the constructor and returns the collection of parsed objects.
#
# The work is done once: when @parsed already holds objects, it is returned as is.
# Otherwise the method
# * reads the version from the "%PDF-x.y" header, when the data starts with one;
# * collects the objects using _parse_;
# * falls back to the :XRef dictionaries when no root object data was collected;
# * attempts decryption when the root object has an :Encrypt entry;
# * unpacks :ObjStm object streams into the top-level collection (done for version 1.5 and
#   up, and also when no version header was found, since the version is then unknown);
# * fills @info_object from the :Info entry of the root object.
#
# Progress messages are written using +warn+.
#
# @return [Array] the parsed objects (the same collection #parsed returns).
# @raise [RuntimeError] if no root object data was found.
def parse
	return @parsed unless @parsed.empty?
	@scanner = StringScanner.new @string_to_parse
	@scanner.pos = 0
	# the header looks like "%PDF-1.4"; only its numeric part is kept, as a Float
	if @scanner.scan(/\%PDF\-[\d\-\.]+/)
		@version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
	end

	warn "Starting to parse PDF data."
	@parsed = _parse_

	if @root_object.empty?
		# no root data was collected while parsing - use the cross reference stream dictionaries
		xref_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :XRef}
		xref_streams.each {|xref_dictionary| @root_object.merge! xref_dictionary}
	end
	raise "root is unknown - cannot determine if file is Encrypted" if @root_object.empty?
	warn "Injecting actual values into root object: #{@root_object}."
	PDFOperations.change_references_to_actual_values @parsed, @root_object

	if @root_object[:Encrypt]
		warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
		decryptor = PDFDecrypt.new @parsed, @root_object
		decryptor.decrypt
		# the result of decrypt is deliberately not assigned back to @parsed
	end

	# Object streams exist since PDF 1.5. @version is nil when the data had no "%PDF-" header;
	# the version is unknown then, so the streams are searched for anyway (comparing nil to
	# 1.5 would raise a NoMethodError).
	if @version.nil? || @version >= 1.5
		object_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm}
		unless object_streams.empty?
			warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

			object_streams.each do |o|
				warn "Attempting #{o.reject {|k, _v| k == :raw_stream_content}}"
				## un-encode (using the correct filter) the object stream
				PDFFilter.inflate_object o
				## extract the objects from the stream into the top level array @parsed
				@scanner = StringScanner.new o[:raw_stream_content]
				stream_data = _parse_
				# The stream data starts with pairs of integers. The first of each pair is kept
				# as the object's id, the second is dropped.
				# Integer is used instead of Fixnum: Fixnum is deprecated since Ruby 2.4 and
				# no longer exists in Ruby 3.2.
				id_array = []
				while stream_data[0].is_a? Integer
					id_array << stream_data.shift
					stream_data.shift
				end
				# The Hash objects that follow get the collected ids, in order, and generation 0.
				# NOTE(review): this stops at the first object that is not a Hash.
				while stream_data[0].is_a? Hash
					stream_data[0][:indirect_reference_id] = id_array.shift
					stream_data[0][:indirect_generation_number] = 0
					@parsed << stream_data.shift
				end
			end
			## remove the object streams and the XREF dictionaries from the collection
			@parsed.reject! {|obj| obj.is_a?(Hash) && (obj[:Type] == :ObjStm || obj[:Type] == :XRef)}
		end
	end

	# resolve references again, now that the objects from the object streams are in place
	PDFOperations.change_references_to_actual_values @parsed, @root_object
	@info_object = @root_object[:Info]
	if @info_object.is_a?(Hash)
		@parsed.delete @info_object
		PDFOperations.change_references_to_actual_values @parsed, @info_object
		PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
	else
		@info_object = {}
	end
	warn "setting parsed collection and returning collection."
	@parsed
end

Class: CombinePDF::PDFParser

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ PDFParser

Instance Attribute Details

#info_object ⇒ Object (readonly)

#parsed ⇒ Object (readonly)

#root_object ⇒ Object (readonly)

#version ⇒ Object (readonly)

Instance Method Details

#parse ⇒ Object

#initialize(string) ⇒ `PDFParser`

#info_object ⇒ `Object` (readonly)

#parsed ⇒ `Object` (readonly)

#root_object ⇒ `Object` (readonly)

#version ⇒ `Object` (readonly)

#parse ⇒ `Object`