Class: CombinePDF::PDFParser

Inherits:
Object
  • Object
show all
Defined in:
lib/combine_pdf/combine_pdf_parser.rb

Overview

This is the Parser class. It takes PDF data and parses it, returning an array of data. That array can be used to initialize a PDF object. The Parser class doesn’t involve itself with the file version.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ PDFParser

Returns a new instance of PDFParser.

Raises:

  • (TypeError)


30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 30

# Creates a parser for the given raw PDF data. Nothing is parsed here; the
# instance only records the data and resets its state. Call #parse to do the
# actual work.
#
# The data is handled as binary. The parser keeps its own copy tagged as
# ASCII-8BIT, so the caller's string keeps its encoding and may be frozen.
#
# @param string [String] the raw PDF data.
# @raise [TypeError] if +string+ is not a String.
def initialize(string)
	raise TypeError, "couldn't parse and data, expecting type String" unless string.is_a? String
	# dup first: force_encoding re-tags its receiver in place, which would
	# change the caller's object and raise FrozenError on a frozen string.
	@string_to_parse = string.dup.force_encoding(Encoding::ASCII_8BIT)
	@literal_strings = []
	@hex_strings = []
	@streams = []
	@parsed = []      # filled by #parse
	@root_object = {} # stays empty until a root dictionary is found
	@info_object = {}
	@version = nil    # set by #parse when the data starts with a %PDF- header
	@scanner = nil    # created by #parse
end

Instance Attribute Details

#info_object ⇒ Object (readonly)

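NOTE: the text below is a commented-out constant from the source file that was picked up as this attribute's description; it does not document the attribute.

LITERAL_STRING_REPLACEMENT_HASH = {
  110 => 10,  # "\\n".bytes = [92, 110], "\n".ord = 10
  114 => 13,  # r
  116 => 9,   # t
  98 => 8,    # b
  102 => 255, # f
  40 => 40,   # (
  41 => 41,   # )
  92 => 92    # \
}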



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for @info_object. It is an empty Hash after construction. #parse sets
# it to the root object's :Info entry when that entry is a Hash (with the
# PRIVATE_HASH_KEYS removed), and to an empty Hash otherwise.
def info_object
  @info_object
end

#parsed ⇒ Object (readonly)

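NOTE: the text below is a commented-out constant from the source file that was picked up as this attribute's description; it does not document the attribute.

LITERAL_STRING_REPLACEMENT_HASH = {
  110 => 10,  # "\\n".bytes = [92, 110], "\n".ord = 10
  114 => 13,  # r
  116 => 9,   # t
  98 => 8,    # b
  102 => 255, # f
  40 => 40,   # (
  41 => 41,   # )
  92 => 92    # \
}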



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for @parsed, the Array of parsed objects. It is empty after
# construction and is filled (and returned) by #parse.
def parsed
  @parsed
end

#root_object ⇒ Object (readonly)

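NOTE: the text below is a commented-out constant from the source file that was picked up as this attribute's description; it does not document the attribute.

LITERAL_STRING_REPLACEMENT_HASH = {
  110 => 10,  # "\\n".bytes = [92, 110], "\n".ord = 10
  114 => 13,  # r
  116 => 9,   # t
  98 => 8,    # b
  102 => 255, # f
  40 => 40,   # (
  41 => 41,   # )
  92 => 92    # \
}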



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for @root_object. It is an empty Hash after construction. If it is
# still empty once #parse has scanned the data, #parse merges every parsed
# dictionary whose :Type is :XRef into it, and raises when it remains empty.
# NOTE(review): presumably the scanning step fills it from the file trailer;
# that code is not shown here - confirm.
def root_object
  @root_object
end

#version ⇒ Object (readonly)

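NOTE: the text below is a commented-out constant from the source file that was picked up as this attribute's description; it does not document the attribute.

LITERAL_STRING_REPLACEMENT_HASH = {
  110 => 10,  # "\\n".bytes = [92, 110], "\n".ord = 10
  114 => 13,  # r
  116 => 9,   # t
  98 => 8,    # b
  102 => 255, # f
  40 => 40,   # (
  41 => 41,   # )
  92 => 92    # \
}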



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for @version. It is nil after construction. #parse sets it to a Float
# (e.g. 1.5) taken from the "%PDF-x.y" header when the data starts with one;
# otherwise it stays nil.
def version
  @version
end

Instance Method Details

#parse ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 43

# Parses the PDF data handed to the constructor and returns the resulting
# objects.
#
# The result is memoized: once @parsed is non-empty, later calls return it
# without parsing again. Along the way this method sets @version (from the
# "%PDF-x.y" header, when present), makes sure @root_object is known, resolves
# references through PDFOperations, runs PDFDecrypt when the root object has an
# :Encrypt entry, unpacks object streams and fills @info_object.
#
# @return [Array] the parsed objects (the same Array exposed by #parsed).
# @raise [RuntimeError] if no root object could be determined.
def parse
	return @parsed unless @parsed.empty?
	@scanner = StringScanner.new @string_to_parse
	# A missing header is tolerated: @version simply stays nil.
	if @scanner.scan(/\%PDF\-[\d\-\.]+/)
		@version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
	end

	warn "Starting to parse PDF data."
	@parsed = _parse_

	# No root object found while scanning: fall back to the XRef dictionaries.
	if @root_object.empty?
		xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
		xref_streams.each { |xref_dictionary| @root_object.merge! xref_dictionary }
	end
	raise "root is unknown - cannot determine if file is Encrypted" if @root_object.empty?
	warn "Injecting actual values into root object: #{@root_object}."
	PDFOperations.change_references_to_actual_values @parsed, @root_object

	if @root_object[:Encrypt]
		warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
		decryptor = PDFDecrypt.new @parsed, @root_object
		# The decryptor's return value is not applied to @parsed.
		decryptor.decrypt
	end

	# Object streams are a PDF 1.5 feature. When the header was missing the
	# version is unknown (nil), so look for them anyway rather than comparing
	# nil with a number.
	if @version.nil? || @version >= 1.5
		object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
		unless object_streams.empty?
			warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

			object_streams.each do |o|
				warn "Attempting #{o.reject { |k, _v| k == :raw_stream_content }}"
				# Un-encode the stream (using the correct filter), then parse its
				# content with a scanner of its own.
				PDFFilter.inflate_object o
				@scanner = StringScanner.new o[:raw_stream_content]
				stream_data = _parse_
				# The stream data starts with pairs of integers. The first of each
				# pair is kept as the object id; the second (presumably the
				# object's offset inside the stream) is discarded.
				# Integer rather than Fixnum: Fixnum no longer exists in Ruby 3.2+.
				id_array = []
				while stream_data[0].is_a? Integer
					id_array << stream_data.shift
					stream_data.shift
				end
				# Promote the leading Hash entries to top-level objects. Extraction
				# stops at the first entry that is not a Hash.
				while stream_data[0].is_a? Hash
					stream_data[0][:indirect_reference_id] = id_array.shift
					stream_data[0][:indirect_generation_number] = 0
					@parsed << stream_data.shift
				end
			end
			# The object streams and XRef dictionaries are containers only;
			# drop them now that their content has been extracted.
			@parsed.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :ObjStm || obj[:Type] == :XRef) }
		end
	end

	# Resolve references again so the extracted objects are linked in as well.
	PDFOperations.change_references_to_actual_values @parsed, @root_object
	@info_object = @root_object[:Info]
	if @info_object.is_a?(Hash)
		@parsed.delete @info_object
		PDFOperations.change_references_to_actual_values @parsed, @info_object
		PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
	else
		@info_object = {}
	end
	warn "setting parsed collection and returning collection."
	@parsed
end