Class: CombinePDF::PDFParser
- Inherits:
-
Object
- Object
- CombinePDF::PDFParser
- Defined in:
- lib/combine_pdf/combine_pdf_parser.rb
Overview
This is the Parser class. It takes PDF data and parses it, returning an array of data. That array can be used to initialize a PDF object. The Parser class doesn’t involve itself with the file version.
Instance Attribute Summary collapse
-
#info_object ⇒ Object
readonly
The Info dictionary located by #parse (the root object's :Info entry), or an empty Hash when none was found or #parse has not run yet.
-
#parsed ⇒ Object
readonly
The Array of parsed objects built by #parse; empty until #parse has run.
-
#root_object ⇒ Object
readonly
The Hash holding the document's root data; starts empty and, if still empty after parsing, is filled by #parse from any XRef dictionaries it finds.
-
#version ⇒ Object
readonly
The PDF version as a Float, read by #parse from the "%PDF-" header; nil until #parse has run or when no header was matched.
Instance Method Summary collapse
-
#initialize(string) ⇒ PDFParser
constructor
A new instance of PDFParser.
- #parse ⇒ Object
Constructor Details
#initialize(string) ⇒ PDFParser
Returns a new instance of PDFParser.
30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 30
#
# Prepares a parser for the given raw PDF data. Nothing is parsed here;
# the instance only stores the data and resets its working state.
#
# @param string [String] the raw PDF data; it is re-tagged as binary
#   (ASCII-8BIT) so later scanning works on bytes
# @raise [TypeError] if +string+ is not a String
def initialize(string)
  raise TypeError, "couldn't parse and data, expecting type String" unless string.is_a?(String)

  # force_encoding mutates its receiver and raises FrozenError on a frozen
  # String (e.g. a literal under `# frozen_string_literal: true`), so work
  # on a copy in that case only. Unfrozen input is re-tagged in place, as
  # before.
  string = string.dup if string.frozen?
  @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)

  # Scratch collections, all starting out empty.
  @literal_strings = []
  @hex_strings = []
  @streams = []

  # Results exposed through the read-only attributes.
  @parsed = []
  @root_object = {}
  @info_object = {}
  @version = nil

  # Assigned a StringScanner by #parse.
  @scanner = nil
end
Instance Attribute Details
#info_object ⇒ Object (readonly)
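Returns the Info dictionary located by #parse (the root object's :Info entry), or an empty Hash when none was found or #parse has not run yet. Note: the description originally generated for this attribute was a commented-out constant, apparently picked up from the source file by mistake, and does not describe the attribute. It read: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, 114 => 13, 116 => 9, 98 => 8, 102 => 255, 40 => 40, 41 => 41, 92 => 92 }, where the keys are the byte values of the characters n, r, t, b, f, "(", ")" and "\" respectively.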
29 30 31 |
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29
#
# Read-only accessor for the Info dictionary.
#
# @return [Hash] the value currently stored in @info_object
def info_object
  @info_object
end
#parsed ⇒ Object (readonly)
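Returns the Array of parsed objects built by #parse; it is empty until #parse has run. Note: the description originally generated for this attribute was a commented-out constant, apparently picked up from the source file by mistake, and does not describe the attribute. It read: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, 114 => 13, 116 => 9, 98 => 8, 102 => 255, 40 => 40, 41 => 41, 92 => 92 }, where the keys are the byte values of the characters n, r, t, b, f, "(", ")" and "\" respectively.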
29 30 31 |
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29
#
# Read-only accessor for the collection of parsed objects.
#
# @return [Array] the value currently stored in @parsed
def parsed
  @parsed
end
#root_object ⇒ Object (readonly)
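Returns the Hash holding the document's root data. It starts as an empty Hash; if it is still empty after parsing, #parse merges in every XRef dictionary it finds. Note: the description originally generated for this attribute was a commented-out constant, apparently picked up from the source file by mistake, and does not describe the attribute. It read: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, 114 => 13, 116 => 9, 98 => 8, 102 => 255, 40 => 40, 41 => 41, 92 => 92 }, where the keys are the byte values of the characters n, r, t, b, f, "(", ")" and "\" respectively.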
29 30 31 |
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29
#
# Read-only accessor for the root object.
#
# @return [Hash] the value currently stored in @root_object
def root_object
  @root_object
end
#version ⇒ Object (readonly)
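Returns the PDF version as a Float, read by #parse from the "%PDF-" header at the start of the data; nil until #parse has run or when no header was matched. Note: the description originally generated for this attribute was a commented-out constant, apparently picked up from the source file by mistake, and does not describe the attribute. It read: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, 114 => 13, 116 => 9, 98 => 8, 102 => 255, 40 => 40, 41 => 41, 92 => 92 }, where the keys are the byte values of the characters n, r, t, b, f, "(", ")" and "\" respectively.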
29 30 31 |
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29
#
# Read-only accessor for the PDF version.
#
# @return [Float, nil] the value currently stored in @version
def version
  @version
end
Instance Method Details
#parse ⇒ Object
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 43
#
# Parses the stored PDF data and returns the collection of parsed objects.
#
# The result is cached: once @parsed is non-empty, later calls return it
# without parsing again. Along the way this method sets @version (from the
# "%PDF-" header, when present), fills @root_object from XRef dictionaries
# if it is still empty after the first pass, asks PDFDecrypt to decrypt
# when the root has an :Encrypt entry, unpacks object streams (:ObjStm)
# into the top-level collection, and sets @info_object.
#
# @return [Array] the parsed objects (also available through #parsed)
# @raise [RuntimeError] if no root object could be determined
def parse
  return @parsed unless @parsed.empty?

  @scanner = StringScanner.new(@string_to_parse)
  @scanner.pos = 0
  # Parenthesised so the regexp is not an ambiguous first argument.
  if @scanner.scan(/\%PDF\-[\d\-\.]+/)
    @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
  end

  warn "Starting to parse PDF data."
  @parsed = _parse_

  # If the first pass did not populate the root, build it from every XRef
  # dictionary found among the parsed objects.
  if @root_object.empty?
    @parsed.each do |obj|
      @root_object.merge!(obj) if obj.is_a?(Hash) && obj[:Type] == :XRef
    end
  end
  raise "root is unknown - cannot determine if file is Encrypted" if @root_object.empty?

  warn "Injecting actual values into root object: #{@root_object}."
  PDFOperations.change_references_to_actual_values(@parsed, @root_object)

  if @root_object[:Encrypt]
    warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
    decryptor = PDFDecrypt.new(@parsed, @root_object)
    decryptor.decrypt
    # do we really need to apply to @parsed? No, there is no need.
  end

  # Object streams are looked for when the header says 1.5 or later. When no
  # header was matched @version is nil; comparing nil with >= would raise
  # NoMethodError, so an unknown version is scanned as well (a no-op if the
  # data holds no :ObjStm objects).
  if @version.nil? || @version >= 1.5
    object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
    unless object_streams.empty?
      warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

      object_streams.each do |stream|
        warn "Attempting #{stream.select {|k,v| k != :raw_stream_content}}"
        # Un-encode the stream (using the correct filter), then parse its
        # content with a fresh scanner.
        PDFFilter.inflate_object(stream)
        @scanner = StringScanner.new(stream[:raw_stream_content])
        stream_data = _parse_

        # The stream content starts with integer pairs: the first of each
        # pair is kept as an object id, the second is discarded. Integer is
        # used because Fixnum no longer exists in current Ruby versions.
        object_ids = []
        while stream_data[0].is_a?(Integer)
          object_ids << stream_data.shift
          stream_data.shift
        end

        # The Hashes that follow are tagged with the collected ids, in
        # order, and moved to the top-level collection.
        while stream_data[0].is_a?(Hash)
          extracted = stream_data.shift
          extracted[:indirect_reference_id] = object_ids.shift
          extracted[:indirect_generation_number] = 0
          @parsed << extracted
        end
      end

      # Remove the object streams and the XRef dictionaries.
      @parsed.reject! do |obj|
        obj.is_a?(Hash) && (obj[:Type] == :ObjStm || obj[:Type] == :XRef)
      end
    end
  end

  PDFOperations.change_references_to_actual_values(@parsed, @root_object)

  @info_object = @root_object[:Info]
  if @info_object.is_a?(Hash)
    @parsed.delete(@info_object)
    PDFOperations.change_references_to_actual_values(@parsed, @info_object)
    PRIVATE_HASH_KEYS.each { |key| @info_object.delete(key) }
  else
    @info_object = {}
  end

  warn "setting parsed collection and returning collection."
  @parsed
end