Class: PDF::Reader::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader/parser.rb

Overview

An internal PDF::Reader class that reads objects from the PDF file and converts them into usable Ruby objects (hashes, arrays, true, false, etc.).

Instance Method Summary collapse

Constructor Details

#initialize(buffer, xref) ⇒ Parser

Create a new parser around a PDF::Reader::Buffer object

buffer - a PDF::Reader::Buffer object that contains PDF data

xref - the cross-reference object for the file (not a byte offset); it must respond to #object, which #stream calls to resolve a stream's Length entry



36
37
38
39
# File 'lib/pdf/reader/parser.rb', line 36

# Builds a parser on top of an existing buffer.
#
# buffer - the object tokens are read from; stored in @buffer
# xref   - stored in @xref; #stream calls @xref.object on it
def initialize(buffer, xref)
  @buffer, @xref = buffer, xref
end

Instance Method Details

#array ⇒ Object

Reads a PDF array from the buffer and converts it to a Ruby Array.



86
87
88
89
90
91
92
93
94
95
96
# File 'lib/pdf/reader/parser.rb', line 86

# Collects parsed tokens into a Ruby Array until the closing "]" Token is
# read. Expects the opening "[" to have been consumed already (parse_token
# calls this after reading "[").
#
# Only a Token equal to "]" ends the array; any other value that compares
# equal to "]" (for example a parsed string) is stored as an element.
def array
  items = []

  until (element = parse_token).is_a?(Token) && element == "]"
    items << element
  end

  items
end

#dictionary ⇒ Object

Reads a PDF dictionary from the buffer and converts it to a Ruby Hash.



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/pdf/reader/parser.rb', line 69

# Builds a Ruby Hash from alternating key/value tokens until the closing
# ">>" Token is read. Expects the opening "<<" to have been consumed already
# (parse_token calls this after reading "<<").
#
# Raises MalformedPDFError when a key is not a Name. A value that is the ">>"
# Token (a key with no value) is rejected through Error.str_assert_not.
def dictionary
  entries = {}

  loop do
    name = parse_token
    break if name.is_a?(Token) && name == ">>"

    unless name.is_a?(Name)
      raise MalformedPDFError, "PDF malformed, dictionary key is not a name"
    end

    entry = parse_token
    Error.str_assert_not(entry, ">>") if entry.is_a?(Token)
    entries[name] = entry
  end

  entries
end

#hex_string ⇒ Object

Reads a PDF hex string from the buffer and converts it to a Ruby String



99
100
101
102
103
104
105
# File 'lib/pdf/reader/parser.rb', line 99

# Reads a PDF hex string from the buffer and converts it to a Ruby String.
# Expects the opening "<" to have been consumed already (parse_token calls
# this after reading "<").
#
# Returns a String with one byte per pair of hex digits. An odd number of
# digits is padded with a trailing "0"; an empty hex string ("<>") yields "".
#
# The closing delimiter is checked through Error.str_assert.
def hex_string
  str = @buffer.token

  # "<>" is an empty string: the token after "<" is already the closing ">",
  # so there is no second token to check.
  return "" if str == ">"

  Error.str_assert(@buffer.token, ">")

  # Work on a copy so the token handed out by the buffer is never mutated.
  # White space inside a hex string carries no meaning and is dropped before
  # the digits are paired up.
  digits = str.gsub(/\s+/, "")
  digits += "0" if digits.size.odd?
  digits.scan(/../).map { |pair| pair.hex.chr }.join
end

#object(id, gen) ⇒ Object

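Reads an entire PDF object from the buffer and returns it as a Ruby object: the parsed value when the object ends with "endobj", or the result of #stream when it is followed by "stream".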

id - the object ID to return

gen - the object revision (generation) number to return



162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/pdf/reader/parser.rb', line 162

# Reads one complete "<id> <gen> obj ... endobj" definition from the buffer.
#
# id  - the object ID the definition is expected to carry
# gen - the generation number the definition is expected to carry
#
# Returns the parsed object body, or the result of #stream when the body is
# followed by the "stream" keyword.
#
# The id, gen and "obj" header tokens are checked through Error.assert_equal
# and Error.str_assert. Raises MalformedPDFError when the body is followed by
# anything other than "endobj" or "stream".
def object(id, gen)
  Error.assert_equal(parse_token, id)
  Error.assert_equal(parse_token, gen)
  Error.str_assert(parse_token, "obj")

  body    = parse_token
  trailer = parse_token

  if "endobj" == trailer
    body
  elsif "stream" == trailer
    stream(body)
  else
    raise MalformedPDFError, "PDF malformed, unexpected token #{trailer}"
  end
end

#parse_token(operators = {}) ⇒ Object

Reads the next token from the underlying buffer and converts it to an appropriate object

operators - a hash of supported operators to read from the underlying buffer.



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/pdf/reader/parser.rb', line 45

# Reads the next token from the buffer and turns it into a Ruby value.
#
# operators - a Hash whose keys are the operator names that should come back
#             as Token objects instead of being treated as numbers
#
# Returns, in order of precedence:
# * whatever Reference.from_buffer yields, when that is truthy
# * a Name for "/", a Hash for "<<", an Array for "[", a String for "(" or "<"
# * true, false or nil for the matching keywords
# * a Token for structural keywords/delimiters and for known operators
# * token.to_f for everything else
def parse_token(operators = {})
  reference = Reference.from_buffer(@buffer)
  return reference if reference

  tok = @buffer.token

  case tok
  when "/"     then Name.new(@buffer.token)
  when "<<"    then dictionary
  when "["     then array
  when "("     then string
  when "<"     then hex_string
  when "true"  then true
  when "false" then false
  when "null"  then nil
  when "obj", "endobj", "stream", "endstream", ">>", "]", ">"
    Token.new(tok)
  else
    operators.has_key?(tok) ? Token.new(tok) : tok.to_f
  end
end

#stream(dict) ⇒ Object

Decodes the contents of a PDF Stream and returns it as a Ruby String.

Raises:

MalformedPDFError - if the stream dictionary has no 'Length' entry



178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/pdf/reader/parser.rb', line 178

# Reads the body of a PDF stream from the buffer, runs it through the filters
# named in the stream dictionary and returns the result.
#
# dict - the stream dictionary (a Hash). 'Length' is required; 'Filter' and
#        'DecodeParms' are optional and may each be a single value or an
#        Array with one entry per filter.
#
# Returns the decoded data, or a PDF::Reader::CMap built from it when the
# data contains both "begincmap" and "endcmap".
#
# Raises MalformedPDFError if the dictionary has no 'Length' entry. The
# trailing "endstream" and "endobj" tokens are checked through
# Error.str_assert.
def stream(dict)
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?('Length')

  # Length goes through @xref.object before it is used as the read size
  # (presumably so an indirect reference is resolved - confirm against XRef).
  data = @buffer.read(@xref.object(dict['Length']))
  Error.str_assert(parse_token, "endstream")
  Error.str_assert(parse_token, "endobj")

  if dict.has_key?('Filter')
    # A single DecodeParms dictionary has to be wrapped by hand: Kernel#Array
    # would turn the Hash into a list of [key, value] pairs, and the filter
    # would receive one such pair instead of its parameter dictionary.
    parms   = dict['DecodeParms']
    options = parms.is_a?(Hash) ? [parms] : Array(parms)

    Array(dict['Filter']).each_with_index do |filter, index|
      data = Filter.new(filter, options[index]).filter(data)
    end
  end

  # this stream is a cmap
  data = PDF::Reader::CMap.new(data) if data.include?("begincmap") && data.include?("endcmap")

  data
end

#string ⇒ Object

Reads a PDF String from the buffer and converts it to a Ruby String



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/pdf/reader/parser.rb', line 108

# Reads a PDF literal string from the buffer and converts it to a Ruby String.
# Expects the opening "(" to have been consumed already (parse_token calls
# this after reading "(").
#
# Balanced, unescaped parentheses are kept as part of the string. Backslash
# escapes are decoded:
# * \n \r \t \b \f \( \) \\ map to the corresponding character
# * \ddd (one to three octal digits) maps to that byte; only the low eight
#   bits are kept
# * a backslash before any other character is dropped and the character kept
# * a backslash at the very end of the buffered text, or directly before an
#   end-of-line character, contributes nothing
#
# NOTE(review): the comments below assume @buffer.raw is the unread text and
# @buffer.head(n, false) removes and returns its first n characters - that is
# how the two are used here, but confirm against PDF::Reader::Buffer.
def string
  str = ""
  count = 1 # nesting depth; starts at 1 for the "(" already consumed

  while count != 0
    @buffer.ready_token(false, false)
    i = @buffer.raw.index(/[\\\(\)]/)

    if i.nil?
      # nothing special in the buffered text: take all of it and keep going
      str << @buffer.raw + "\n"
      @buffer.raw.replace("")
      next
    end

    # copy the plain text in front of the special character
    str << @buffer.head(i, false)
    to_remove = 1

    case @buffer.raw[0, 1]
    when "("
      str << "("
      count += 1
    when ")"
      count -= 1
      # the final ")" closes the string and is not part of it
      str << ")" unless count == 0
    when "\\"
      to_remove = 2
      escaped = @buffer.raw[1, 1]

      case escaped
      when ""   then to_remove = 1
      when "n"  then str << "\n"
      when "r"  then str << "\r"
      when "t"  then str << "\t"
      when "b"  then str << "\b"
      when "f"  then str << "\f"
      when "("  then str << "("
      when ")"  then str << ")"
      when "\\" then str << "\\"
      when "\n", "\r"
        # line continuation: backslash and end-of-line are both discarded
      else
        # \A pins the match to the backslash at the front of the buffer; only
        # 0-7 are octal digits, and only the low eight bits fit in one byte.
        if (m = @buffer.raw.match(/\A\\([0-7]{1,3})/))
          to_remove = m[0].size
          str << (m[1].oct & 0xff).chr
        else
          # unknown escape: ignore the backslash, keep the character
          str << escaped
        end
      end
    end

    @buffer.head(to_remove, false)
  end

  str
end