Class: CombinePDF::PDFParser

Inherits:

Object

Object
CombinePDF::PDFParser

show all

Defined in:: lib/combine_pdf/combine_pdf_parser.rb

Overview

This is the Parser class. It takes PDF data and parses it, returning an array of data. That array can be used to initialize a PDF object. The Parser class doesn’t involve itself with the file version.

Instance Attribute Summary collapse

#info_object ⇒ Object readonly

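The document information Hash: empty until #parse runs, then the Hash stored under :Info in the root object (or an empty Hash when there is none). Note: the text originally attached to this attribute is a commented-out constant from the source file, not a description: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }.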
#parsed ⇒ Object readonly

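The Array of parsed objects: empty until #parse runs, then the same collection that #parse returns. Note: the text originally attached to this attribute is a commented-out constant from the source file, not a description: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }.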
#root_object ⇒ Object readonly

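The root Hash: starts empty; if it is still empty after the data has been parsed, #parse merges every parsed :XRef dictionary into it. Note: the text originally attached to this attribute is a commented-out constant from the source file, not a description: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }.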
#version ⇒ Object readonly

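The PDF version as a Float, read by #parse from the "%PDF-x.y" header; nil until #parse runs or when the data has no such header. Note: the text originally attached to this attribute is a commented-out constant from the source file, not a description: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }.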

Instance Method Summary collapse

#initialize(string) ⇒ PDFParser constructor

A new instance of PDFParser.
#parse ⇒ Object

Constructor Details

#initialize(string) ⇒ `PDFParser`

Returns a new instance of PDFParser.

Raises:

(TypeError)

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 30

# Creates a parser for the given raw PDF data.
#
# The data is copied before being tagged as binary (ASCII-8BIT), so the
# caller's string keeps its own encoding and frozen strings are accepted.
# Nothing is parsed here; call #parse to do the actual work.
#
# @param string [String] the raw PDF data.
# @raise [TypeError] if +string+ is not a String.
def initialize(string)
  raise TypeError, "couldn't parse any data, expecting type String" unless string.is_a?(String)

  # dup first: force_encoding mutates its receiver and raises on frozen strings.
  @string_to_parse = string.dup.force_encoding(Encoding::ASCII_8BIT)
  # NOTE(review): the next three collections are not referenced by the code
  # visible here; presumably they are scratch space for the private parsing
  # routine - confirm before relying on them.
  @literal_strings = []
  @hex_strings = []
  @streams = []
  # Results exposed through the read-only attributes; filled in by #parse.
  @parsed = []
  @root_object = {}
  @info_object = {}
  @version = nil
  # The StringScanner is created lazily by #parse.
  @scanner = nil
end

Instance Attribute Details

#info_object ⇒ `Object` (readonly)

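Returns the document information Hash: empty until #parse runs, then the Hash stored under :Info in the root object (or an empty Hash when there is none). Note: the text originally attached to this attribute is a commented-out constant from the source file, not a description: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }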



29
30
31

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Read-only accessor: returns whatever is currently stored in @info_object.
def info_object; @info_object; end

#parsed ⇒ `Object` (readonly)

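Returns the Array of parsed objects: empty until #parse runs, then the same collection that #parse returns. Note: the text originally attached to this attribute is a commented-out constant from the source file, not a description: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }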



29
30
31

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Read-only accessor: returns whatever is currently stored in @parsed.
def parsed; @parsed; end

#root_object ⇒ `Object` (readonly)

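Returns the root Hash: starts empty; if it is still empty after the data has been parsed, #parse merges every parsed :XRef dictionary into it. Note: the text originally attached to this attribute is a commented-out constant from the source file, not a description: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }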



29
30
31

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Read-only accessor: returns whatever is currently stored in @root_object.
def root_object; @root_object; end

#version ⇒ `Object` (readonly)

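Returns the PDF version as a Float, read by #parse from the "%PDF-x.y" header; nil until #parse runs or when the data has no such header. Note: the text originally attached to this attribute is a commented-out constant from the source file, not a description: LITERAL_STRING_REPLACEMENT_HASH = { 110 => 10, # “\n”.bytes = [92, 110] “n”.ord = 10 114 => 13, #r 116 => 9, #t 98 => 8, #b 102 => 255, #f 40 => 40, #( 41 => 41, #) 92 => 92 #\ }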



29
30
31

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 29

# Read-only accessor: returns whatever is currently stored in @version.
def version; @version; end

Instance Method Details

#parse ⇒ `Object`

# File 'lib/combine_pdf/combine_pdf_parser.rb', line 43

# Parses the PDF data handed to the constructor and returns the collection
# of parsed objects.
#
# The result is memoized: once @parsed is non-empty it is returned as-is.
# Otherwise the method:
# 1. reads the version from a leading "%PDF-x.y" header, if present
#    (@version stays nil when the data has no such header);
# 2. delegates the actual parsing to _parse_;
# 3. if the root object is still empty, merges every parsed :XRef
#    dictionary into it;
# 4. hands the objects to PDFDecrypt when the root object has an :Encrypt entry;
# 5. for version 1.5 and above, unpacks :ObjStm object streams into the
#    top-level collection and drops the :ObjStm and :XRef dictionaries
#    (this cleanup only happens when at least one object stream was found);
# 6. takes the :Info Hash out of the collection and stores it in @info_object
#    (an empty Hash when the root object has no :Info Hash).
#
# @return [Array] the parsed objects (also available through #parsed).
# @raise [RuntimeError] if no root object could be determined.
def parse
  return @parsed unless @parsed.empty?

  @scanner = StringScanner.new(@string_to_parse)
  @scanner.pos = 0
  if @scanner.scan(/\%PDF\-[\d\-\.]+/)
    @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
  end

  warn "Starting to parse PDF data."
  @parsed = _parse_

  if @root_object == {}
    # No root was set while parsing: fall back to the XRef dictionaries.
    xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
    xref_streams.each do |xref_dictionary|
      @root_object.merge! xref_dictionary
    end
  end
  raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
  warn "Injecting actual values into root object: #{@root_object}."
  PDFOperations.change_references_to_actual_values @parsed, @root_object

  if @root_object[:Encrypt]
    warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
    decryptor = PDFDecrypt.new @parsed, @root_object
    decryptor.decrypt
    # do we really need to apply to @parsed? No, there is no need.
  end

  # @version is nil when the data did not start with a "%PDF-" header;
  # guard the comparison instead of failing with NoMethodError.
  if @version && @version >= 1.5 # code placement for object streams
    ## search for objects streams
    object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
    unless object_streams.empty?
      warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

      object_streams.each do |o|
        warn "Attempting #{o.select {|k,v| k != :raw_stream_content}}"
        ## un-encode (using the correct filter) the object streams
        PDFFilter.inflate_object o
        ## extract objects from stream to top level array @parsed
        @scanner = StringScanner.new o[:raw_stream_content]
        stream_data = _parse_
        # The stream data starts with integer pairs: the first of each pair is
        # kept as an object id, the second is discarded.
        # Integer replaces Fixnum, which was removed in Ruby 3.2.
        id_array = []
        while stream_data[0].is_a? Integer
          id_array << stream_data.shift
          stream_data.shift
        end
        # The Hashes that follow are matched to the collected ids in order.
        while stream_data[0].is_a? Hash
          stream_data[0][:indirect_reference_id] = id_array.shift
          stream_data[0][:indirect_generation_number] = 0
          @parsed << stream_data.shift
        end
      end
      ## remove object streams and XREF dictionaries
      @parsed.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :ObjStm || obj[:Type] == :XRef) }
    end
  end

  # Resolve references again, now that any object-stream contents are top-level.
  PDFOperations.change_references_to_actual_values @parsed, @root_object
  @info_object = @root_object[:Info]
  if @info_object && @info_object.is_a?(Hash)
    @parsed.delete @info_object
    PDFOperations.change_references_to_actual_values @parsed, @info_object
    PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
  else
    @info_object = {}
  end
  warn "setting parsed collection and returning collection."
  @parsed
end

Class: CombinePDF::PDFParser

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ PDFParser

Instance Attribute Details

#info_object ⇒ Object (readonly)

#parsed ⇒ Object (readonly)

#root_object ⇒ Object (readonly)

#version ⇒ Object (readonly)

Instance Method Details

#parse ⇒ Object

#initialize(string) ⇒ `PDFParser`

#info_object ⇒ `Object` (readonly)

#parsed ⇒ `Object` (readonly)

#root_object ⇒ `Object` (readonly)

#version ⇒ `Object` (readonly)

#parse ⇒ `Object`