Class: CombinePDF::PDFParser

Inherits:
Object
  • Object
show all
Defined in:
lib/combine_pdf/combine_pdf_parser.rb

Overview

This is the Parser class. It takes PDF data and parses it, returning an array of data. That array can be used to initialize a PDF object. The Parser class doesn’t involve itself with the file version.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ PDFParser

Returns a new instance of PDFParser.

Raises:

  • (TypeError)


30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 30

# Creates a parser for the given raw PDF data. Nothing is parsed here;
# call #parse to do the actual work.
#
# The data is re-tagged as binary (ASCII-8BIT) on a private copy, so the
# caller's string keeps its own encoding and frozen strings are accepted.
#
# @param string [String] the raw PDF data.
# @raise [TypeError] if +string+ is not a String.
def initialize(string)
  raise TypeError, "couldn't parse and data, expecting type String" unless string.is_a? String
  # dup first: force_encoding changes its receiver in place and raises on a frozen string.
  @string_to_parse = string.dup.force_encoding(Encoding::ASCII_8BIT)
  # scratch collections and results, all empty until #parse runs
  @literal_strings = []
  @hex_strings = []
  @streams = []
  @parsed = []
  @root_object = {}
  @info_object = {}
  @version = nil
  @scanner = nil
end

Instance Attribute Details

#info_objectObject (readonly)

LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for @info_object.
# It is an empty Hash after construction; #parse replaces it with the root
# object's :Info entry when that entry is a Hash, and resets it to an empty
# Hash otherwise.
def info_object
  @info_object
end

#parsedObject (readonly)

LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for @parsed, the collection of parsed objects.
# It is an empty Array after construction and is filled by #parse.
def parsed
  @parsed
end

#root_objectObject (readonly)

LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for @root_object.
# It is an empty Hash after construction. During #parse, if it is still empty
# after the low-level parsing pass, any XRef dictionaries found among the
# parsed objects are merged into it.
# NOTE(review): presumably the low-level parser (_parse_, not shown here) is
# what normally populates it - confirm.
def root_object
  @root_object
end

#versionObject (readonly)

LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }



29
30
31
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Reader for @version.
# It is nil after construction; #parse sets it to the Float read from the
# "%PDF-" header when the data starts with one, and leaves it nil otherwise.
def version
  @version
end

Instance Method Details

#parseObject



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/combine_pdf/combine_pdf_parser.rb', line 43

# Parses the PDF data given to the constructor and returns the collection of
# parsed objects.
#
# Steps, in order:
# 1. read the version number from the "%PDF-" header, when the data starts with one;
# 2. run the low-level parser (_parse_) over the whole data;
# 3. if no root object is known at that point, merge every XRef dictionary
#    found among the parsed objects into the root object;
# 4. call PDFOperations.change_references_to_actual_values on the collection,
#    and hand the file to PDFDecrypt when the root object has an :Encrypt entry;
# 5. for version 1.5 and up, unpack object streams (:ObjStm) into the top-level
#    collection and, if any were found, drop the :ObjStm and :XRef dictionaries;
# 6. take the root object's :Info entry out of the collection and keep it as
#    the info object.
#
# The result is memoized: once @parsed is non-empty it is returned untouched.
# Progress messages are written with Kernel#warn.
#
# @return [Array] the parsed objects.
# @raise [RuntimeError] if no root object could be determined.
def parse
  return @parsed unless @parsed.empty?

  @scanner = StringScanner.new(@string_to_parse)
  @scanner.pos = 0
  if @scanner.scan(/\%PDF\-[\d\-\.]+/)
    @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
  end

  warn "Starting to parse PDF data."
  @parsed = _parse_

  # no root found while parsing: fall back to the XRef dictionaries
  if @root_object.empty?
    @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
           .each { |xref_dictionary| @root_object.merge!(xref_dictionary) }
  end
  raise "root is unknown - cannot determine if file is Encrypted" if @root_object.empty?
  warn "Injecting actual values into root object: #{@root_object}."
  PDFOperations.change_references_to_actual_values(@parsed, @root_object)

  if @root_object[:Encrypt]
    warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
    # the decryptor's return value is not needed here; @parsed itself is kept
    PDFDecrypt.new(@parsed, @root_object).decrypt
  end

  # @version stays nil when the data has no "%PDF-" header; skip the
  # object-stream pass in that case instead of failing on nil >= 1.5.
  if @version && @version >= 1.5
    object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
    unless object_streams.empty?
      warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

      object_streams.each do |stream|
        warn "Attempting #{stream.select { |key, _value| key != :raw_stream_content }}"
        # un-encode the object stream (using the correct filter)
        PDFFilter.inflate_object(stream)
        # parse the stream's content and lift its objects to the top-level array
        @scanner = StringScanner.new(stream[:raw_stream_content])
        stream_data = _parse_
        # The content starts with pairs of integers. The first of each pair is
        # kept as the object id; the second is discarded (per the PDF spec it
        # is the object's byte offset inside the stream).
        # Integer (not the removed Fixnum) keeps this working on current Rubies.
        object_ids = []
        while stream_data[0].is_a?(Integer)
          object_ids << stream_data.shift
          stream_data.shift
        end
        # only the leading run of Hash objects is extracted, ids assigned in order
        while stream_data[0].is_a?(Hash)
          extracted = stream_data.shift
          extracted[:indirect_reference_id] = object_ids.shift
          extracted[:indirect_generation_number] = 0
          @parsed << extracted
        end
      end

      # remove the object streams and the XRef dictionaries from the collection
      @parsed.reject! do |obj|
        obj.is_a?(Hash) && (obj[:Type] == :ObjStm || obj[:Type] == :XRef)
      end
    end
  end

  PDFOperations.change_references_to_actual_values(@parsed, @root_object)
  @info_object = @root_object[:Info]
  if @info_object.is_a?(Hash)
    @parsed.delete(@info_object)
    PDFOperations.change_references_to_actual_values(@parsed, @info_object)
    PRIVATE_HASH_KEYS.each { |key| @info_object.delete(key) }
  else
    @info_object = {}
  end
  warn "setting parsed collection and returning collection."
  @parsed
end