Class: AcroThat::ObjectResolver

Inherits:
Object
  • Object
show all
Defined in:
lib/acro_that/object_resolver.rb

Overview

Parses xref (tables and streams) and exposes object bodies uniformly, including objects embedded in /ObjStm. Also gives you the trailer and /Root.

Defined Under Namespace

Classes: Entry

Instance Method Summary collapse

Constructor Details

#initialize(bytes) ⇒ ObjectResolver

Returns a new instance of ObjectResolver.



9
10
11
12
13
14
# File 'lib/acro_that/object_resolver.rb', line 9

# Builds a resolver over a PDF held in memory and indexes it immediately.
#
# Starts with an empty entry table and an empty object-stream cache, then
# parses the cross-reference data, so construction raises if that parsing
# raises.
#
# @param bytes [String] the PDF content (presumably binary; it is searched
#   with String/Regexp methods elsewhere in this class — confirm encoding)
def initialize(bytes)
  @bytes = bytes
  @entries = {}       # [object number, generation] => Entry
  @objstm_cache = {}  # object stream ref => parsed stream contents
  parse_cross_reference
end

Instance Method Details

#apply_png_predictor(data, columns) ⇒ Object



263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/acro_that/object_resolver.rb', line 263

# Reverses PNG row filtering on already-inflated stream data.
#
# Each row is one filter-type byte followed by +columns+ data bytes. Filter
# types 1-4 (Sub, Up, Average, Paeth) are undone with a one-byte "left"
# distance; type 0 and any unknown type are copied through unchanged.
# Trailing bytes that do not form a complete row are ignored.
#
# @param data [String] filtered bytes
# @param columns [Integer] number of data bytes per row
# @return [String] the unfiltered bytes, rows concatenated
def apply_png_predictor(data, columns)
  stride = columns + 1 # filter byte + payload
  row_count = data.bytesize / stride
  decoded = []
  above = [0] * columns # the row before the first one is all zeros

  row_count.times do |row_idx|
    base = row_idx * stride
    filter = data.getbyte(base)
    raw = Array.new(columns) { |col| data.getbyte(base + 1 + col) }

    current =
      case filter
      when 1 # Sub: add the byte to the left
        raw.each_with_object([]) do |byte, acc|
          acc << ((byte + (acc.last || 0)) & 0xFF)
        end
      when 2 # Up: add the byte directly above
        raw.zip(above).map { |byte, up| (byte + up) & 0xFF }
      when 3 # Average: add floor((left + up) / 2)
        raw.each_with_index.each_with_object([]) do |(byte, col), acc|
          acc << ((byte + (((acc.last || 0) + above[col]) / 2)) & 0xFF)
        end
      when 4 # Paeth: add whichever neighbour the Paeth function picks
        raw.each_with_index.each_with_object([]) do |(byte, col), acc|
          up_left = col.zero? ? 0 : above[col - 1]
          acc << ((byte + paeth_predictor(acc.last || 0, above[col], up_left)) & 0xFF)
        end
      else # None (0) or an unknown filter: pass through
        raw
      end

    decoded.concat(current)
    above = current
  end

  decoded.pack("C*")
end

#balanced_from(str, start_idx) ⇒ Object



330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# File 'lib/acro_that/object_resolver.rb', line 330

# Finds the end of a "<< ... >>" dictionary by counting nested delimiters.
#
# Scans forward from +start_idx+; every "<<" opens a level and every ">>"
# closes one. The scan is purely textual: delimiters inside strings or
# stream data are counted too.
#
# @param str [String] text to scan
# @param start_idx [Integer] index to start scanning from
# @return [Integer] index just past the ">>" that brings the depth back to zero
# @raise [RuntimeError] "unterminated dict" if the end of +str+ is reached first
def balanced_from(str, start_idx)
  nesting = 0
  cursor = start_idx
  limit = str.length

  until cursor >= limit
    case str[cursor, 2]
    when "<<"
      nesting += 1
      cursor += 2
    when ">>"
      nesting -= 1
      cursor += 2
      return cursor if nesting.zero?
    else
      cursor += 1
    end
  end

  raise "unterminated dict"
end

#clear_cache ⇒ Object

Clear the object stream cache to free memory



53
54
55
# File 'lib/acro_that/object_resolver.rb', line 53

# Empties the cache of parsed object streams so their memory can be freed.
# The cache hash itself is kept (and returned); load_objstm fills it again
# for any container whose key is missing.
def clear_cache
  @objstm_cache.clear
end

#decode_stream_data(dict_src, stream_chunk) ⇒ Object



239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# File 'lib/acro_that/object_resolver.rb', line 239

# Extracts a stream's payload and undoes FlateDecode / PNG prediction.
#
# The payload is everything after the "stream" keyword's line ending, with
# a trailing "endstream..." removed when present. It is inflated when the
# dictionary names FlateDecode as its filter, either directly
# ("/Filter /FlateDecode", "/Filter/FlateDecode") or as the first element of
# a filter array ("/Filter [/FlateDecode]"). Other filters, and filters after
# the first array element, are not decoded.
#
# When /DecodeParms (a dictionary, or an array starting with one) carries a
# /Predictor in 10..15, PNG prediction is reversed using /Columns
# (default 1).
#
# @param dict_src [String] source text of the stream dictionary
# @param stream_chunk [String] text containing the "stream" keyword and payload
# @return [String] decoded stream bytes
# @raise [RuntimeError] "stream keyword missing" if no stream keyword is found
def decode_stream_data(dict_src, stream_chunk)
  s_match = /\bstream\r?\n/.match(stream_chunk)
  raise "stream keyword missing" unless s_match

  body = stream_chunk[s_match.end(0)..].sub(/\bendstream\b.*/m, "")

  # Accept both the bare-name and the array spelling of the filter entry.
  data = if dict_src =~ %r{/Filter\s*(?:\[\s*)?/FlateDecode}
           Zlib::Inflate.inflate(body)
         else
           body
         end

  # Apply PNG predictor if present (DecodeParms may be wrapped in an array too)
  if dict_src =~ %r{/DecodeParms\s*(?:\[\s*)?<<[^>]*/Predictor\s+(\d+)}
    predictor = ::Regexp.last_match(1).to_i
    if predictor.between?(10, 15) # PNG predictors
      columns = dict_src =~ %r{/Columns\s+(\d+)} ? ::Regexp.last_match(1).to_i : 1
      data = apply_png_predictor(data, columns)
    end
  end

  data
end

#each_object ⇒ Object



46
47
48
49
50
# File 'lib/acro_that/object_resolver.rb', line 46

# Iterates over every known object reference together with its body.
#
# @yieldparam ref [Array] the key stored in the entry table ([number, generation])
# @yieldparam body the value returned by object_body for that ref (may be nil)
# @return [Enumerator] when called without a block; otherwise the result of
#   iterating the entry table
def each_object
  # Without a block, hand back an Enumerator instead of failing on yield.
  return enum_for(:each_object) unless block_given?

  @entries.each_key { |ref| yield(ref, object_body(ref)) }
end

#find_startxref(bytes) ⇒ Object



348
349
350
351
352
353
354
355
356
357
358
359
360
# File 'lib/acro_that/object_resolver.rb', line 348

# Reads the offset that follows the last "startxref" keyword.
#
# The number is always parsed as decimal, so leading zeros are harmless
# (Kernel#Integer would treat "0123" as octal and reject "09"). A trailing
# "%%EOF" is not required.
#
# @param bytes [String, nil] PDF content
# @return [Integer, nil] the offset, or nil when +bytes+ is nil/empty, has no
#   "startxref", or the last "startxref" is not followed by digits
def find_startxref(bytes)
  return nil if bytes.nil? || bytes.empty?

  keyword_at = bytes.rindex("startxref")
  return nil unless keyword_at

  # Only the text from the last keyword onward matters.
  bytes[keyword_at..][/\Astartxref\s+(\d+)/, 1]&.to_i
end

#JSON_like_array(tok) ⇒ Object



234
235
236
237
# File 'lib/acro_that/object_resolver.rb', line 234

# Splits a flat PDF array token such as "[1 2 1]" into its elements.
#
# The first and last characters (the brackets) are dropped and the rest is
# split on whitespace. Elements made only of digits become Integers; every
# other element stays a String. Padding inside the brackets ("[ 1 2 1 ]")
# does not produce empty elements.
#
# @param tok [String] array token including its surrounding brackets
# @return [Array<Integer, String>]
def JSON_like_array(tok)
  # Argument-less split ignores leading whitespace, unlike split(/\s+/),
  # which would yield a leading "" and shift every element by one.
  tok[1..-2].split.map { |t| t.match?(/\A\d+\z/) ? t.to_i : t }
end

#load_objstm(container_ref) ⇒ Object



362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
# File 'lib/acro_that/object_resolver.rb', line 362

# Parses an object stream (/ObjStm) container and caches the result.
#
# Does nothing when the container is already cached. Otherwise the
# container's body is fetched, its dictionary and stream located, the stream
# decoded, and the result of AcroThat::ObjStm.parse (given /N and /First
# from the dictionary) is stored under +container_ref+.
#
# @param container_ref [Array] ref of the object stream, e.g. [number, 0]
# @raise [RuntimeError] if the container has no body, or its stream keyword
#   or endstream keyword cannot be found
def load_objstm(container_ref)
  return if @objstm_cache.key?(container_ref)

  container = object_body(container_ref)
  raise "Object stream #{container_ref.inspect} not found in xref table" unless container

  # Dictionary source; scanning starts at 0 when no "<<" is present.
  dict_from = container.index("<<") || 0
  dict_to = balanced_from(container, dict_from)
  dict_src = container[dict_from...dict_to]

  stream_at = container.index(/\bstream\r?\n/m, dict_to)
  raise "objstm stream missing" unless stream_at

  endstream_at = container.index(/\bendstream\b/m, stream_at)
  raise "objstm end missing" unless endstream_at

  # NOTE(review): the inclusive range keeps the first character of
  # "endstream" in the slice — confirm this is intended.
  decoded = decode_stream_data(dict_src, container[stream_at..endstream_at])

  @objstm_cache[container_ref] = AcroThat::ObjStm.parse(
    decoded,
    n: DictScan.value_token_after("/N", dict_src).to_i,
    first: DictScan.value_token_after("/First", dict_src).to_i
  )
end

#object_body(ref) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/acro_that/object_resolver.rb', line 57

# Returns the source text of an object, wherever it is stored.
#
# For an object stored directly in the file, this is the text between the
# "obj" keyword (leading whitespace/control bytes skipped) and the next
# "endobj". For an object held in an object stream, the container is loaded
# (and cached) and the body is read from the parsed stream.
#
# @param ref [Array] [object number, generation]
# @return [String, nil] nil when the ref is unknown, or when "obj"/"endobj"
#   cannot be found for an in-file object
def object_body(ref)
  entry = @entries[ref]
  return nil if entry.nil?

  case entry.type
  when :in_file
    header = /\bobj\b/m.match(@bytes, entry.offset)
    return nil unless header

    # Step over bytes <= 0x20 (whitespace/control) following "obj".
    body_start = header.end(0)
    loop do
      byte = @bytes.getbyte(body_start)
      break unless byte && byte <= 0x20

      body_start += 1
    end

    body_end = @bytes.index(/\bendobj\b/m, body_start)
    return nil unless body_end

    @bytes[body_start...body_end]
  when :in_objstm
    container = [entry.objstm_num, 0]
    load_objstm(container)
    @objstm_cache[container][entry.objstm_index][:body]
  end
end

#paeth_predictor(a, b, c) ⇒ Object



315
316
317
318
319
320
321
322
323
324
325
326
327
328
# File 'lib/acro_that/object_resolver.rb', line 315

# Paeth predictor: picks whichever neighbour is closest to a + b - c.
#
# Ties are resolved in the order left, up, up-left.
#
# @param a [Integer] left neighbour
# @param b [Integer] upper neighbour
# @param c [Integer] upper-left neighbour
# @return [Integer] one of +a+, +b+ or +c+
def paeth_predictor(a, b, c)
  estimate = a + b - c
  # min_by keeps the first of equally-distant candidates, which gives the
  # required a, b, c precedence.
  [a, b, c].min_by { |candidate| (estimate - candidate).abs }
end

#parse_classic_xref(start) ⇒ Object



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/acro_that/object_resolver.rb', line 111

# Parses a classic (table-style) cross-reference section and its trailer.
#
# In-use ("n") records are registered as in-file entries keyed by
# [object number, generation]; refs that are already known are left alone and
# free ("f") records are ignored. When a "trailer" dictionary follows, it is
# stored as the explicit trailer, the remembered xref-stream dictionary is
# cleared, and the dictionary source is returned.
#
# @param start [Integer] offset at (or after) the "xref" keyword
# @return [String, nil] trailer dictionary source, or nil when no trailer
#   dictionary is found after the table
# @raise [RuntimeError] "xref not found" / "bad xref record"
def parse_classic_xref(start)
  pos = @bytes.rindex("xref", start)
  raise "xref not found" unless pos

  i = pos + 4

  loop do
    # Subsection header "<first> <count>". \G pins the match to position i;
    # without it the search could run ahead and pick up digits from inside
    # the trailer dictionary, moving i past the trailer.
    m = /\G\s*(\d+)\s+(\d+)/.match(@bytes, i)
    break unless m

    first = m[1].to_i
    count = m[2].to_i
    i = m.end(0)

    count.times do |k|
      # Skip whitespace/newlines before reading the 20-byte record
      i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D, 0x20].include?(ch)

      rec = @bytes[i, 20]
      raise "bad xref record" unless rec && rec.bytesize == 20

      # Fixed layout: 10-digit offset, space, 5-digit generation, space, type
      off = rec[0, 10].to_i
      gen = rec[11, 5].to_i
      typ = rec[17, 1]
      i += 20
      # consume line ending(s)
      i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D].include?(ch)

      ref = [first + k, gen]
      @entries[ref] ||= Entry.new(type: :in_file, offset: off) if typ == "n"
      # (ignore 'f' free entries)
    end

    break if @bytes[i, 7] == "trailer"
  end

  tpos = @bytes.index("trailer", i)
  if tpos
    dpos = @bytes.index("<<", tpos)
    if dpos
      dend = balanced_from(@bytes, dpos)
      # NOTE(review): both assignments are unconditional, so a section
      # parsed later replaces what an earlier one stored — confirm intended.
      @last_xref_stream_dict = nil
      @trailer_explicit = @bytes[dpos...dend]
      return @trailer_explicit
    end
  end

  # No trailer found (might be at an intermediate xref in the chain)
  nil
end

#parse_cross_reference ⇒ Object

— internals ———————————————————–



76
77
78
79
# File 'lib/acro_that/object_resolver.rb', line 76

# Entry point for indexing: looks up the startxref offset and parses the
# cross-reference data found there.
#
# @raise [RuntimeError] "startxref not found" when no offset can be read
def parse_cross_reference
  xref_offset = find_startxref(@bytes)
  raise "startxref not found" unless xref_offset

  parse_xref_at_offset(xref_offset)
end

#parse_xref_at_offset(offset) ⇒ Object



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/acro_that/object_resolver.rb', line 81

# Parses the cross-reference section at +offset+ and follows its /Prev chain.
#
# If "xref" is literally at the offset, the section is a classic table; its
# trailer may name an /XRefStm stream whose entries are merged in. Otherwise
# the offset is treated as the header of an xref stream object. In both
# cases a positive /Prev value is followed recursively.
#
# Offsets already visited during the current traversal are skipped, so a
# file whose /Prev pointers form a loop cannot cause unbounded recursion.
#
# @param offset [Integer] byte offset of the xref table or xref stream object
def parse_xref_at_offset(offset)
  # The visited set lives only for one top-level traversal, so a later,
  # independent call starts fresh.
  outermost = @xref_offsets_seen.nil?
  @xref_offsets_seen ||= {}
  return nil if @xref_offsets_seen[offset]

  @xref_offsets_seen[offset] = true

  if @bytes[offset, 4] == "xref"
    # 1) Classic table
    dict = parse_classic_xref(offset)

    # 2) Classic trailers may include /XRefStm <offset> to an xref stream
    #    with compressed entries
    if dict
      xrefstm_tok = DictScan.value_token_after("/XRefStm", dict)
      if xrefstm_tok && (ofs = xrefstm_tok.to_i).positive?
        parse_xref_stream_at(ofs) # merge entries from xref stream (type 0/1/2)
      end
    end
  else
    # Direct xref stream case (offset points to the xref stream obj header)
    dict = parse_xref_stream_at(offset)
  end

  # 3) Follow /Prev pointer if present
  if dict
    prev_tok = DictScan.value_token_after("/Prev", dict)
    if prev_tok && (prev_ofs = prev_tok.to_i).positive?
      parse_xref_at_offset(prev_ofs)
    end
  end
ensure
  @xref_offsets_seen = nil if outermost
end

#parse_xref_stream_at(header_ofs) ⇒ Object



158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/acro_that/object_resolver.rb', line 158

# Parses the cross-reference stream object located at (or shortly after)
# +header_ofs+ and merges its records into the entry table.
#
# The first dictionary seen is remembered for later use as the trailer. The
# stream object itself is registered as an in-file entry when its records
# did not already cover it.
#
# @param header_ofs [Integer] offset of the "<num> <gen> obj" header
# @return [String, nil] the stream dictionary source (so the caller can check
#   /Prev), or nil when the dictionary has no /W
# @raise [RuntimeError] when the header, dictionary, stream keyword or
#   endstream keyword cannot be found
def parse_xref_stream_at(header_ofs)
  header = /\A(\d+)\s+(\d+)\s+obj\b/m.match(@bytes[header_ofs, 50])
  if header.nil?
    # The offset may land on whitespace; look a little further ahead.
    window = @bytes[header_ofs, 256]
    header = /(\d+)\s+(\d+)\s+obj\b/m.match(window)
    raise "xref stream header not found" unless header

    header_ofs += header.begin(0)
  end
  stream_ref = [header[1].to_i, header[2].to_i]

  dict_start = @bytes.index("<<", header_ofs + header[0].length)
  raise "xref stream dict missing" unless dict_start

  dict_stop = balanced_from(@bytes, dict_start)
  dict_src = @bytes[dict_start...dict_stop]
  @last_xref_stream_dict ||= dict_src # first one parsed wins

  stream_at = @bytes.index(/\bstream\r?\n/m, dict_stop)
  raise "xref stream body missing" unless stream_at

  endstream_at = @bytes.index(/\bendstream\b/m, stream_at)
  raise "xref stream end missing" unless endstream_at

  raw = decode_stream_data(dict_src, @bytes[stream_at..endstream_at])

  # Without /W the records cannot be split into fields; give up quietly.
  widths_tok = DictScan.value_token_after("/W", dict_src)
  return nil unless widths_tok

  widths = JSON_like_array(widths_tok)
  index_tok = DictScan.value_token_after("/Index", dict_src)
  ranges = if index_tok
             JSON_like_array(index_tok)
           else
             # No /Index: a single range starting at 0 and covering /Size objects
             [0, DictScan.value_token_after("/Size", dict_src).to_i]
           end

  parse_xref_stream_records(raw, widths, ranges)

  # Make sure the stream object itself can be looked up; header_ofs is used
  # as its offset.
  @entries[stream_ref] = Entry.new(type: :in_file, offset: header_ofs) unless @entries.key?(stream_ref)

  dict_src
end

#parse_xref_stream_records(raw, w, index) ⇒ Object



200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/acro_that/object_resolver.rb', line 200

# Decodes the fixed-width records of a cross-reference stream into entries.
#
# Each record has three big-endian fields whose byte widths come from +w+.
# Type 1 records become in-file entries (field 2 = offset); type 2 records
# become object-stream entries (field 2 = container number, field 3 = index
# within it); type 0 (free) and unknown types are skipped. When the first
# width is 0 the type field is absent from the data and is taken to be 1.
# Refs that are already known are left untouched. Every ref is registered
# with generation 0.
#
# @param raw [String] decoded stream data
# @param w [Array] the three field widths from /W
# @param index [Array] flat list of (first object number, count) pairs
def parse_xref_stream_records(raw, w, index)
  w0, w1, w2 = w
  type_absent = w0.to_i.zero?
  s = StringScanner.new(raw)

  index.each_slice(2) do |first_obj, count|
    first_obj = first_obj.to_i
    count.to_i.times do |k|
      # Out of data: with an implicit type there is nothing left to mark a
      # record as free, so stop instead of registering offset-0 entries.
      break if s.eos?

      t  = type_absent ? 1 : read_int(s, w0)
      f1 = read_int(s, w1)
      f2 = read_int(s, w2)
      ref = [first_obj + k, 0]
      case t
      when 1 then @entries[ref] ||= Entry.new(type: :in_file, offset: f1)
      when 2 then @entries[ref] ||= Entry.new(type: :in_objstm, objstm_num: f1, objstm_index: f2)
      end
    end
  end
end

#read_int(scanner, width) ⇒ Object



220
221
222
223
224
225
226
227
228
229
230
231
232
# File 'lib/acro_that/object_resolver.rb', line 220

# Reads a big-endian unsigned integer of +width+ bytes from the scanner.
#
# Returns 0 without moving the scanner when the width is zero or when fewer
# than +width+ bytes remain.
#
# @param scanner [StringScanner] positioned at the field to read
# @param width [Integer, #to_i] field width in bytes
# @return [Integer]
def read_int(scanner, width)
  nbytes = width.to_i
  return 0 if nbytes.zero?

  chunk = scanner.peek(nbytes)
  return 0 if chunk.nil? || chunk.bytesize != nbytes

  scanner.pos += nbytes
  chunk.unpack("C*").reduce(0) { |acc, byte| (acc << 8) | byte }
end

#root_ref ⇒ Object



16
17
18
19
20
21
# File 'lib/acro_that/object_resolver.rb', line 16

# Extracts the /Root indirect reference from the trailer dictionary.
#
# Both numbers are read as decimal, so leading zeros are accepted
# (Kernel#Integer would read "010" as octal and reject "08").
#
# @return [Array(Integer, Integer), nil] [object number, generation], or nil
#   when the trailer has no "/Root <num> <gen> R" entry
def root_ref
  found = %r{/Root\s+(\d+)\s+(\d+)\s+R}.match(trailer_dict)
  return nil unless found

  [found[1].to_i, found[2].to_i]
end

#trailer_dict ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/acro_that/object_resolver.rb', line 23

# Returns the trailer dictionary source, memoized after the first call.
#
# Priority order:
# 1. Explicit trailer recorded while parsing a classic xref table
# 2. Dictionary of an xref stream recorded during parsing
# 3. Fallback: the last "trailer << ... >>" in the bytes that precede the
#    offset reported by find_startxref
#
# @return [String] dictionary source including the surrounding "<<" / ">>"
# @raise [RuntimeError] "trailer not found" when the fallback finds no
#   "trailer" keyword, or no "<<" after it
def trailer_dict
  @trailer_dict ||= @trailer_explicit || @last_xref_stream_dict || begin
    start = find_startxref(@bytes) || 0
    head = @bytes[0...start]
    idx = head.rindex("trailer")
    raise "trailer not found" unless idx

    # A keyword with no dictionary after it counts as "not found" too,
    # rather than handing nil to balanced_from.
    dict_at = head.index("<<", idx)
    raise "trailer not found" unless dict_at

    head[dict_at...balanced_from(head, dict_at)]
  end
end