Class: AcroThat::ObjectResolver
- Inherits:
-
Object
- Object
- AcroThat::ObjectResolver
- Defined in:
- lib/acro_that/object_resolver.rb
Overview
Parses xref (tables and streams) and exposes object bodies uniformly, including objects embedded in /ObjStm. Also gives you the trailer and /Root.
Defined Under Namespace
Classes: Entry
Instance Method Summary collapse
- #apply_png_predictor(data, columns) ⇒ Object
- #balanced_from(str, start_idx) ⇒ Object
-
#clear_cache ⇒ Object
Clear the object stream cache to free memory.
- #decode_stream_data(dict_src, stream_chunk) ⇒ Object
- #each_object ⇒ Object
- #find_startxref(bytes) ⇒ Object
-
#initialize(bytes) ⇒ ObjectResolver
constructor
A new instance of ObjectResolver.
- #JSON_like_array(tok) ⇒ Object
- #load_objstm(container_ref) ⇒ Object
- #object_body(ref) ⇒ Object
- #paeth_predictor(a, b, c) ⇒ Object
- #parse_classic_xref(start) ⇒ Object
-
#parse_cross_reference ⇒ Object
Internal entry point: locates the startxref offset and parses the cross-reference data it points to.
- #parse_xref_at_offset(offset) ⇒ Object
- #parse_xref_stream_at(header_ofs) ⇒ Object
- #parse_xref_stream_records(raw, w, index) ⇒ Object
- #read_int(scanner, width) ⇒ Object
- #root_ref ⇒ Object
- #trailer_dict ⇒ Object
Constructor Details
#initialize(bytes) ⇒ ObjectResolver
Returns a new instance of ObjectResolver.
9 10 11 12 13 14 |
# File 'lib/acro_that/object_resolver.rb', line 9
# Builds a resolver over the given PDF bytes and indexes them immediately by
# parsing the cross-reference data.
#
# @param bytes [String] the PDF contents to index
def initialize(bytes)
  @objstm_cache = {}
  @entries = {}
  @bytes = bytes
  parse_cross_reference
end
Instance Method Details
#apply_png_predictor(data, columns) ⇒ Object
263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 |
# File 'lib/acro_that/object_resolver.rb', line 263
# Reverses PNG row filtering. Every row is one filter-type byte followed by
# +columns+ data bytes; the neighbour to the "left" is the previous byte of
# the same row. Trailing bytes that do not form a whole row are ignored.
#
# @param data [String] filtered bytes
# @param columns [Integer] number of data bytes per row (excluding the filter byte)
# @return [String] the unfiltered bytes, packed as unsigned chars
def apply_png_predictor(data, columns)
  stride = columns + 1 # filter byte + payload
  row_count = data.bytesize / stride
  above = [0] * columns
  decoded = []

  row_count.times do |row_idx|
    base = row_idx * stride
    filter = data.getbyte(base)
    raw = Array.new(columns) { |col| data.getbyte(base + 1 + col) }

    current =
      case filter
      when 1 # Sub: add the already-decoded byte to the left
        raw.each_with_index.with_object([]) do |(byte, col), acc|
          left = col.zero? ? 0 : acc[col - 1]
          acc << ((byte + left) & 0xFF)
        end
      when 2 # Up: add the byte directly above
        raw.zip(above).map { |byte, up| (byte + up) & 0xFF }
      when 3 # Average of left and up (integer division)
        raw.each_with_index.with_object([]) do |(byte, col), acc|
          left = col.zero? ? 0 : acc[col - 1]
          acc << ((byte + ((left + above[col]) / 2)) & 0xFF)
        end
      when 4 # Paeth
        raw.each_with_index.with_object([]) do |(byte, col), acc|
          left = col.zero? ? 0 : acc[col - 1]
          up_left = col.zero? ? 0 : above[col - 1]
          acc << ((byte + paeth_predictor(left, above[col], up_left)) & 0xFF)
        end
      else
        # Filter 0 (None) and unknown filter types pass the row through untouched
        raw
      end

    decoded.concat(current)
    above = current
  end

  decoded.pack("C*")
end
#balanced_from(str, start_idx) ⇒ Object
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 |
# File 'lib/acro_that/object_resolver.rb', line 330
# Scans +str+ from +start_idx+ and returns the index just past the ">>" that
# closes the first dictionary opened at or after that position. Nested
# dictionaries are balanced.
#
# Once inside a dictionary, hex strings (<...>) and literal strings ((...))
# are skipped as units, so a ">" closing a hex string right before ">>" or a
# ">>" inside a literal string is not mistaken for a dictionary delimiter.
#
# @param str [String] text containing the dictionary
# @param start_idx [Integer] index at which scanning begins
# @return [Integer] index one past the closing ">>"
# @raise [RuntimeError] "unterminated dict" when no balancing ">>" is found
def balanced_from(str, start_idx)
  depth = 0
  pos = start_idx
  len = str.length

  while pos < len
    pair = str[pos, 2]
    if pair == "<<"
      depth += 1
      pos += 2
    elsif pair == ">>"
      depth -= 1
      pos += 2
      return pos if depth.zero?
    elsif depth.positive? && pair.start_with?("<")
      # Hex string: jump past its closing '>'
      closing = str.index(">", pos + 1)
      pos = closing ? closing + 1 : len
    elsif depth.positive? && pair.start_with?("(")
      # Literal string: honour nested parentheses and backslash escapes
      nesting = 0
      while pos < len
        case str[pos]
        when "\\" then pos += 1 # skip the escaped character too
        when "(" then nesting += 1
        when ")" then nesting -= 1
        end
        pos += 1
        break if nesting.zero?
      end
    else
      pos += 1
    end
  end

  raise "unterminated dict"
end
#clear_cache ⇒ Object
Clear the object stream cache to free memory
53 54 55 |
# File 'lib/acro_that/object_resolver.rb', line 53
# Empties the object-stream cache in place so the decoded streams can be
# garbage-collected.
#
# @return [Hash] the (now empty) cache
def clear_cache
  @objstm_cache.clear
end
#decode_stream_data(dict_src, stream_chunk) ⇒ Object
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 |
# File 'lib/acro_that/object_resolver.rb', line 239
# Extracts and decodes the payload of a stream.
#
# The payload is everything after the "stream" keyword line, up to (but not
# including) the "endstream" keyword when one is present in +stream_chunk+.
# A FlateDecode filter is inflated; it is recognised both as a bare name
# ("/Filter /FlateDecode", "/Filter/FlateDecode") and as the first element of
# a filter array ("/Filter [/FlateDecode]"). When /DecodeParms declares a PNG
# predictor (10..15), the rows are unfiltered using /Columns (default 1).
#
# @param dict_src [String] source text of the stream dictionary
# @param stream_chunk [String] bytes starting at or before the "stream" keyword
# @return [String] decoded stream data
# @raise [RuntimeError] "stream keyword missing" when no stream keyword is found
def decode_stream_data(dict_src, stream_chunk)
  s_match = /\bstream\r?\n/.match(stream_chunk) or raise "stream keyword missing"
  body = stream_chunk[s_match.end(0)..].sub(/\bendstream\b.*/m, "")

  # Accept the name form and the array form of the filter entry
  flate = %r{/Filter\s*(?:\[\s*)?/FlateDecode}.match?(dict_src)
  data = flate ? Zlib::Inflate.inflate(body) : body

  predictor_match = %r{/DecodeParms\s*<<[^>]*/Predictor\s+(\d+)}.match(dict_src)
  if predictor_match && predictor_match[1].to_i.between?(10, 15) # PNG predictors
    columns_match = %r{/Columns\s+(\d+)}.match(dict_src)
    columns = columns_match ? columns_match[1].to_i : 1
    data = apply_png_predictor(data, columns)
  end

  data
end
#each_object ⇒ Object
46 47 48 49 50 |
# File 'lib/acro_that/object_resolver.rb', line 46
# Iterates over every known object reference, yielding the reference and the
# body resolved for it by #object_body.
#
# @yieldparam ref [Array] key from the entry table
# @yieldparam body [String, nil] result of object_body(ref)
# @return [Enumerator] when called without a block
def each_object
  # Without a block, hand back an Enumerator instead of failing on yield
  return enum_for(:each_object) unless block_given?

  @entries.each_key do |ref|
    yield(ref, object_body(ref))
  end
end
#find_startxref(bytes) ⇒ Object
348 349 350 351 352 353 354 355 356 357 358 359 360 |
# File 'lib/acro_that/object_resolver.rb', line 348
# Extracts the number that follows the "startxref" keyword.
#
# A well-formed tail ("startxref <n> %%EOF" at the very end of the input) is
# tried first; otherwise the last "startxref" occurrence is used. The digits
# are always read as decimal, so zero-padded values such as "0123" are not
# interpreted as octal.
#
# @param bytes [String, nil] file contents
# @return [Integer, nil] the offset, or nil when it cannot be determined
def find_startxref(bytes)
  return nil if bytes.nil? || bytes.empty?

  strict = /startxref\s+(\d+)\s*%%EOF\s*\z/m.match(bytes)
  return strict[1].to_i if strict

  last = bytes.rindex("startxref")
  return nil unless last

  bytes[last..][/startxref\s+(\d+)/m, 1]&.to_i
end
#JSON_like_array(tok) ⇒ Object
234 235 236 237 |
# File 'lib/acro_that/object_resolver.rb', line 234
# Splits a bracketed, whitespace-separated token such as "[1 2 1]" into its
# elements. The first and last characters (the brackets) are dropped;
# all-digit elements become Integers and everything else stays a String.
# Whitespace just inside the brackets ("[ 1 2 1 ]") does not produce an
# empty leading element.
#
# @param tok [String] array token including its surrounding brackets
# @return [Array<Integer, String>]
def JSON_like_array(tok)
  tok[1..-2]
    .split(/\s+/)
    .reject(&:empty?) # split keeps a leading "" when the input starts with whitespace
    .map { |item| item.match?(/\A\d+\z/) ? item.to_i : item }
end
#load_objstm(container_ref) ⇒ Object
362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 |
# File 'lib/acro_that/object_resolver.rb', line 362
# Decodes the object stream identified by +container_ref+ and stores the
# parsed result in the object-stream cache. Does nothing when that container
# is already cached.
#
# @param container_ref [Array] reference of the object stream
# @raise [RuntimeError] when the container body cannot be resolved, or when
#   its stream / endstream keywords are missing
def load_objstm(container_ref)
  return if @objstm_cache.key?(container_ref)

  body = object_body(container_ref)
  raise "Object stream #{container_ref.inspect} not found in xref table" unless body

  dict_start = body.index("<<") || 0
  dict_end = balanced_from(body, dict_start)
  dict_src = body[dict_start...dict_end]

  s_pos = body.index(/\bstream\r?\n/m, dict_end) or raise "objstm stream missing"
  e_pos = body.index(/\bendstream\b/m, s_pos) or raise "objstm end missing"
  # Exclusive range: e_pos is the first character of "endstream" and must not
  # leak into the stream data (it would end up in unfiltered streams).
  data = body[s_pos...e_pos]
  raw = decode_stream_data(dict_src, data)

  n = DictScan.value_token_after("/N", dict_src).to_i
  first = DictScan.value_token_after("/First", dict_src).to_i
  @objstm_cache[container_ref] = AcroThat::ObjStm.parse(raw, n: n, first: first)
end
#object_body(ref) ⇒ Object
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/acro_that/object_resolver.rb', line 57
# Returns the body text of the object registered under +ref+.
#
# For in-file entries this is the text between the "obj" keyword (leading
# whitespace skipped) and the next "endobj". For entries stored inside an
# object stream, the container is loaded on demand and the body is taken
# from the cached parse result.
#
# @param ref [Array] key into the entry table
# @return [String, nil] the body, or nil when the reference is unknown, the
#   obj/endobj keywords are missing, or the object-stream slot does not exist
def object_body(ref)
  entry = @entries[ref]

  case entry&.type
  when :in_file
    hdr = /\bobj\b/m.match(@bytes, entry.offset) or return nil
    body_start = hdr.end(0)
    # Skip whitespace/control bytes between "obj" and the first body byte
    body_start += 1 while (ch = @bytes.getbyte(body_start)) && ch <= 0x20
    body_end = @bytes.index(/\bendobj\b/m, body_start) or return nil
    @bytes[body_start...body_end]
  when :in_objstm
    container = [entry.objstm_num, 0]
    load_objstm(container)
    # An index past the end of the parsed stream yields nil rather than
    # raising NoMethodError on nil.
    slot = @objstm_cache[container][entry.objstm_index]
    slot && slot[:body]
  end
end
#paeth_predictor(a, b, c) ⇒ Object
315 316 317 318 319 320 321 322 323 324 325 326 327 328 |
# File 'lib/acro_that/object_resolver.rb', line 315
# Paeth predictor: of the three neighbours, returns the one closest to the
# estimate (left + up - up_left). Ties are resolved in the order left, up,
# up-left.
#
# @param a [Integer] left neighbour
# @param b [Integer] upper neighbour
# @param c [Integer] upper-left neighbour
# @return [Integer] the selected neighbour value
def paeth_predictor(a, b, c)
  estimate = a + b - c
  # min_by keeps the first of equally-near candidates, giving the a/b/c tie order
  [a, b, c].min_by { |candidate| (estimate - candidate).abs }
end
#parse_classic_xref(start) ⇒ Object
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/acro_that/object_resolver.rb', line 111
# Parses a classic cross-reference table and the trailer dictionary after it.
#
# In-use ("n") records are added to the entry table unless the reference is
# already present; free ("f") records are ignored. When a trailer dictionary
# is found it is stored as the explicit trailer and any remembered xref-stream
# dictionary is cleared.
#
# @param start [Integer] offset at (or after) the "xref" keyword
# @return [String, nil] trailer dictionary source, or nil when none follows
# @raise [RuntimeError] "xref not found" / "bad xref record"
def parse_classic_xref(start)
  pos = @bytes.rindex("xref", start) or raise "xref not found"
  i = pos + 4

  loop do
    # \G anchors the subsection header at the current position. Unanchored,
    # the pattern could skip past "trailer" (e.g. when it is indented) and
    # match numbers inside the trailer dictionary, after which the trailer
    # itself is no longer found.
    m = /\G\s*(\d+)\s+(\d+)/m.match(@bytes, i) or break
    first = m[1].to_i
    count = m[2].to_i
    i = m.end(0)

    count.times do |k|
      # Skip whitespace/newlines before reading the 20-byte record
      i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D, 0x20].include?(ch)
      rec = @bytes[i, 20]
      raise "bad xref record" unless rec && rec.bytesize == 20

      off = rec[0, 10].to_i
      gen = rec[11, 5].to_i
      typ = rec[17, 1]
      i += 20
      # Consume line ending(s)
      i += 1 while (ch = @bytes.getbyte(i)) && [0x0A, 0x0D].include?(ch)

      # 'f' (free) records are ignored; existing entries are never overwritten
      @entries[[first + k, gen]] ||= Entry.new(type: :in_file, offset: off) if typ == "n"
    end

    break if @bytes[i, 7] == "trailer"
  end

  tpos = @bytes.index("trailer", i)
  dpos = tpos && @bytes.index("<<", tpos)
  # No trailer found (might be at an intermediate xref in the chain)
  return nil unless dpos

  dend = balanced_from(@bytes, dpos)
  @last_xref_stream_dict = nil
  @trailer_explicit = @bytes[dpos...dend]
end
#parse_cross_reference ⇒ Object
Internal entry point: locates the startxref offset and parses the cross-reference data it points to.
76 77 78 79 |
# File 'lib/acro_that/object_resolver.rb', line 76
# Looks up the startxref offset in the file bytes and parses the
# cross-reference data starting there.
#
# @raise [RuntimeError] "startxref not found" when no offset can be determined
def parse_cross_reference
  xref_offset = find_startxref(@bytes)
  raise "startxref not found" unless xref_offset

  parse_xref_at_offset(xref_offset)
end
#parse_xref_at_offset(offset) ⇒ Object
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/acro_that/object_resolver.rb', line 81
# Parses the cross-reference section at +offset+ and then follows its /Prev
# pointer, if any.
#
# If "xref" is literally at the offset, a classic table is parsed; a positive
# /XRefStm value in its trailer is parsed as an additional xref stream.
# Otherwise the offset is treated as the header of an xref stream object.
#
# Offsets already handled in the current chain are skipped, so a /Prev cycle
# in a malformed file terminates instead of recursing without bound.
#
# @param offset [Integer] position of the cross-reference section
# @param visited [Hash] offsets already parsed in this chain (internal use)
def parse_xref_at_offset(offset, visited = {})
  return if visited[offset]

  visited[offset] = true

  dict =
    if @bytes[offset, 4] == "xref"
      trailer = parse_classic_xref(offset)
      # Classic trailers may include /XRefStm <offset> pointing to an xref stream
      xrefstm_tok = DictScan.value_token_after("/XRefStm", trailer) if trailer
      parse_xref_stream_at(xrefstm_tok.to_i) if xrefstm_tok && xrefstm_tok.to_i.positive?
      trailer
    else
      # Direct xref stream case (offset points to the xref stream obj header)
      parse_xref_stream_at(offset)
    end

  prev_tok = DictScan.value_token_after("/Prev", dict) if dict
  return unless prev_tok

  prev_ofs = prev_tok.to_i
  parse_xref_at_offset(prev_ofs, visited) if prev_ofs.positive?
end
#parse_xref_stream_at(header_ofs) ⇒ Object
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
# File 'lib/acro_that/object_resolver.rb', line 158
# Parses the cross-reference stream whose object header ("<num> <gen> obj")
# is at, or shortly after, +header_ofs+ and merges its records into the
# entry table.
#
# The first xref-stream dictionary seen is remembered for later use as the
# trailer. The stream object itself is registered as an in-file entry when
# its own records did not already do so.
#
# @param header_ofs [Integer] offset of the xref stream object header
# @return [String, nil] the stream dictionary source (so the caller can
#   follow /Prev), or nil when the dictionary has no /W entry
# @raise [RuntimeError] when the header, dictionary, stream or endstream
#   keyword cannot be found
def parse_xref_stream_at(header_ofs)
  # Expect "<num> <gen> obj" at header_ofs
  m = /\A(\d+)\s+(\d+)\s+obj\b/m.match(@bytes[header_ofs, 50])
  unless m
    # Sometimes header_ofs might land on whitespace; search forward a bit
    m = /(\d+)\s+(\d+)\s+obj\b/m.match(@bytes[header_ofs, 256]) or raise "xref stream header not found"
    header_ofs += m.begin(0)
  end
  obj_ref = [m[1].to_i, m[2].to_i]

  dpos = @bytes.index("<<", header_ofs + m[0].length) or raise "xref stream dict missing"
  dend = balanced_from(@bytes, dpos)
  dict_src = @bytes[dpos...dend]
  @last_xref_stream_dict ||= dict_src # Keep first one for trailer_dict

  spos = @bytes.index(/\bstream\r?\n/m, dend) or raise "xref stream body missing"
  epos = @bytes.index(/\bendstream\b/m, spos) or raise "xref stream end missing"
  # Exclusive range: epos is the first character of "endstream" and must not
  # be handed to the decoder as stream data.
  raw = decode_stream_data(dict_src, @bytes[spos...epos])

  # W is mandatory in xref streams; if missing, bail (don't crash)
  w_tok = DictScan.value_token_after("/W", dict_src)
  return nil unless w_tok

  w = JSON_like_array(w_tok)
  idx_tok = DictScan.value_token_after("/Index", dict_src)
  index = idx_tok ? JSON_like_array(idx_tok) : [0, DictScan.value_token_after("/Size", dict_src).to_i]
  parse_xref_stream_records(raw, w, index)

  # Make sure the xref stream object itself can be found by object_body,
  # using the header position as its offset.
  @entries[obj_ref] = Entry.new(type: :in_file, offset: header_ofs) unless @entries.key?(obj_ref)

  dict_src # Return dict for /Prev checking
end
#parse_xref_stream_records(raw, w, index) ⇒ Object
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 |
# File 'lib/acro_that/object_resolver.rb', line 200
# Decodes the fixed-width records of a cross-reference stream and adds the
# in-use ones to the entry table (existing entries are never overwritten).
#
# Each record has three big-endian fields whose byte widths come from +w+:
# * type 0 - free object, skipped
# * type 1 - uncompressed object: field 2 is the byte offset, field 3 the
#   generation number
# * type 2 - compressed object: field 2 is the object-stream number, field 3
#   the index inside that stream (generation is 0)
# A zero-width type field means the type is absent and defaults to 1.
#
# @param raw [String] decoded stream data
# @param w [Array] the three field widths (/W)
# @param index [Array] flat list of (first object number, count) pairs (/Index)
def parse_xref_stream_records(raw, w, index)
  w0, w1, w2 = w
  scanner = StringScanner.new(raw)
  type_absent = w0.to_i.zero?

  index.each_slice(2) do |first_obj, count|
    count.to_i.times do |k|
      type = type_absent ? 1 : read_int(scanner, w0)
      f1 = read_int(scanner, w1)
      f2 = read_int(scanner, w2)
      num = first_obj.to_i + k

      case type
      when 1 then @entries[[num, f2]] ||= Entry.new(type: :in_file, offset: f1)
      when 2 then @entries[[num, 0]] ||= Entry.new(type: :in_objstm, objstm_num: f1, objstm_index: f2)
      end
    end
  end
end
#read_int(scanner, width) ⇒ Object
220 221 222 223 224 225 226 227 228 229 230 231 232 |
# File 'lib/acro_that/object_resolver.rb', line 220
# Reads +width+ bytes from the scanner as a big-endian unsigned integer and
# advances the scanner past them.
#
# Returns 0 without moving the scanner when the width is zero or fewer than
# +width+ bytes remain.
#
# @param scanner [StringScanner] positioned at the field to read
# @param width [Integer, #to_i] field width in bytes
# @return [Integer]
def read_int(scanner, width)
  size = width.is_a?(Integer) ? width : width.to_i
  return 0 if size.zero?

  chunk = scanner.peek(size)
  return 0 if !chunk || chunk.bytesize != size

  scanner.pos += size
  chunk.each_byte.inject(0) { |acc, byte| (acc << 8) | byte }
end
#root_ref ⇒ Object
16 17 18 19 20 21 |
# File 'lib/acro_that/object_resolver.rb', line 16
# Extracts the indirect reference stored under /Root in the trailer
# dictionary.
#
# The numbers are read as decimal, so a zero-padded object number is neither
# interpreted as octal nor rejected.
#
# @return [Array<Integer>, nil] [object number, generation], or nil when the
#   trailer has no "/Root <num> <gen> R" entry
def root_ref
  found = %r{/Root\s+(\d+)\s+(\d+)\s+R}.match(trailer_dict)
  return nil unless found

  [found[1].to_i, found[2].to_i]
end
#trailer_dict ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/acro_that/object_resolver.rb', line 23
# Returns the source text of the trailer dictionary (memoized).
#
# Priority order:
# 1. Explicit trailer from a classic xref table
# 2. Dictionary of the first xref stream that was parsed
# 3. Fallback: the last "trailer << ... >>" located before the last
#    "startxref" keyword (or anywhere in the file when that keyword is absent)
#
# @return [String] dictionary source, including the enclosing << >>
# @raise [RuntimeError] "trailer not found" when the fallback finds neither a
#   trailer keyword nor a dictionary after it
def trailer_dict
  @trailer_dict ||=
    if @trailer_explicit
      @trailer_explicit
    elsif @last_xref_stream_dict
      @last_xref_stream_dict
    else
      # Bound the search by the position of the startxref keyword itself; the
      # number that follows it is the xref offset, which precedes the trailer.
      limit = @bytes.rindex("startxref") || @bytes.length
      head = @bytes[0...limit]
      idx = head.rindex("trailer")
      raise "trailer not found" unless idx

      dict_start = head.index("<<", idx)
      raise "trailer not found" unless dict_start

      head[dict_start...balanced_from(head, dict_start)]
    end
end