Module: CorpPdf::DictScan

Defined in:
lib/corp_pdf/dict_scan.rb

Class Method Summary collapse

Class Method Details

.add_ref_to_array(array_body, ref) ⇒ Object



361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
# File 'lib/corp_pdf/dict_scan.rb', line 361

# Append an indirect reference to the end of a PDF array token.
#
# @param array_body [String] array source text, e.g. "[1 0 R 2 0 R]"
# @param ref [Array(Integer, Integer)] object number and generation number
# @return [String] the array text with "<num> <gen> R" added as last element
def add_ref_to_array(array_body, ref)
  num, gen = ref
  ref_token = "#{num} #{gen} R"
  trimmed = array_body.strip

  # An array that holds nothing but whitespace ("[]", "[ ]", "[\n]") is empty;
  # emit a clean single-element array instead of "[  1 0 R]".
  if trimmed.start_with?("[") && trimmed.end_with?("]") && trimmed[1...-1].strip.empty?
    return "[#{ref_token}]"
  end

  if trimmed.end_with?("]")
    # Drop the closing bracket (and any whitespace in front of it) so that
    # exactly one space separates the last element from the new reference.
    without_closing = array_body.rstrip.chomp("]").rstrip
    return "#{without_closing} #{ref_token}]"
  end

  # Fallback for text that is not bracket-terminated: just append.
  "#{array_body} #{ref_token}"
end

.add_ref_to_inline_array(dict_body, key, ref) ⇒ Object



392
393
394
395
396
397
398
399
400
401
# File 'lib/corp_pdf/dict_scan.rb', line 392

# Add an indirect reference to the inline array stored under +key+.
#
# @param dict_body [String] dictionary source text
# @param key [String] dictionary key including the leading slash, e.g. "/Kids"
# @param ref [Array(Integer, Integer)] object number and generation number
# @return [String, nil] updated dictionary text, or nil when +key+ is absent
#   or its value is not an inline array
def add_ref_to_inline_array(dict_body, key, ref)
  return nil unless dict_body.include?(key)

  # Extract the inline array token after key, then rebuild
  arr_tok = value_token_after(key, dict_body)
  return nil unless arr_tok&.start_with?("[")

  # Locate the token that actually follows the key. Replacing the first
  # textual occurrence (String#sub) would edit the wrong entry when an
  # identical array appears earlier, e.g. "<< /Annots [] /Kids [] >>".
  key_end = dict_body.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})&.end(0) || 0
  start = dict_body.index(arr_tok, key_end)
  return nil unless start

  updated = dict_body.dup
  updated[start, arr_tok.length] = add_ref_to_array(arr_tok, ref)
  updated
end

.appearance_choice_for(new_value, dict_src) ⇒ Object



343
344
345
346
347
348
349
350
351
352
353
354
# File 'lib/corp_pdf/dict_scan.rb', line 343

# Pick the appearance state name that corresponds to +new_value+.
#
# Detection is purely textual: a state is reported only when +dict_src+
# contains "/AP" and the state name occurs somewhere in the text.
#
# @param new_value [Boolean, Symbol, String] true / :Yes / "Yes" ask for
#   "/Yes"; false / :Off / "Off" ask for "/Off"
# @param dict_src [String] dictionary source text
# @return [String, nil] "/Yes", "/Off", or nil when nothing applies
def appearance_choice_for(new_value, dict_src)
  return nil unless dict_src.include?("/AP")

  state =
    if [true, :Yes, "Yes"].include?(new_value)
      "/Yes"
    elsif [false, :Off, "Off"].include?(new_value)
      "/Off"
    end

  state if state && dict_src.include?(state)
end

.decode_pdf_string(token) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/corp_pdf/dict_scan.rb', line 100

# Decode a PDF string token into a UTF-8 Ruby string.
#
# Literal strings "( ... )" are unescaped with unescape_literal; hex strings
# "< ... >" are converted from hex digits (an odd digit count is padded with
# a trailing "0"). In both cases a leading FE FF byte-order mark selects
# UTF-16BE decoding; otherwise the bytes are treated as binary and converted
# to UTF-8 with replacement characters for anything that cannot be mapped.
#
# @param token [String, nil] raw token text
# @return [String, nil] decoded text; the stripped token itself for anything
#   that is not a string token (names, numbers, refs, ...); nil for nil input
def decode_pdf_string(token)
  return nil unless token

  t = token.strip

  raw =
    if t.start_with?("(") && t.end_with?(")")
      unescape_literal(t[1..-2])
    elsif t.start_with?("<") && t.end_with?(">")
      hex = t[1..-2].gsub(/\s+/, "")
      hex << "0" if hex.length.odd?
      [hex].pack("H*")
    end

  # Fallback: return token as-is (names, numbers, refs, etc.)
  return t unless raw

  bytes = raw.b
  if bytes.bytesize >= 2 && bytes.getbyte(0) == 0xFE && bytes.getbyte(1) == 0xFF
    # Replace malformed UTF-16 (lone surrogates, truncated code units) instead
    # of raising, so one damaged string cannot abort processing of a document.
    bytes.byteslice(2, bytes.bytesize - 2)
         .force_encoding("UTF-16BE")
         .encode("UTF-8", invalid: :replace, undef: :replace)
  else
    bytes.encode("UTF-8", invalid: :replace, undef: :replace)
  end
end

.each_dictionary(str) ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/corp_pdf/dict_scan.rb', line 33

# Yield every top-level "<< ... >>" dictionary found in +str+.
#
# Nested dictionaries are part of the yielded text of their outermost
# dictionary and are not yielded separately. Scanning stops at the first
# dictionary that is never closed.
#
# While inside a dictionary, hex strings ("<AB>") and literal strings
# ("(a>>b)") are skipped as a unit so that a ">" or ">>" belonging to a string
# is not mistaken for a dictionary delimiter (e.g. "<</V <AB>>>").
#
# @param str [String] PDF source text
# @yieldparam dict [String] dictionary source including the outer "<<" / ">>"
# @return [nil]
def each_dictionary(str)
  hex_chars = "0123456789abcdefABCDEF \t\r\n\f\0"

  # Index just past the ">" closing the hex string that opens at +from+, or
  # nil when the text there is not a well-formed hex string.
  hex_end = lambda do |from|
    k = from + 1
    k += 1 while k < str.length && hex_chars.include?(str[k])
    str[k] == ">" ? k + 1 : nil
  end

  # Index just past the ")" closing the literal string that opens at +from+
  # (honouring backslash escapes and nested parentheses), or nil when the
  # string is unterminated.
  literal_end = lambda do |from|
    level = 0
    k = from
    while k < str.length
      c = str[k]
      if c == "\\"
        k += 2
        next
      end
      level += 1 if c == "("
      if c == ")"
        level -= 1
        return k + 1 if level.zero?
      end
      k += 1
    end
    nil
  end

  i = 0
  while (open = str.index("<<", i))
    depth = 0
    j = open
    found = nil
    while j < str.length
      pair = str[j, 2]
      if pair == "<<"
        depth += 1
        j += 2
      elsif pair == ">>"
        depth -= 1
        j += 2
        if depth.zero?
          found = str[open...j]
          break
        end
      elsif str[j] == "<"
        # Malformed hex string: fall back to stepping one character.
        j = hex_end.call(j) || (j + 1)
      elsif str[j] == "("
        # Unterminated literal: fall back to stepping one character.
        j = literal_end.call(j) || (j + 1)
      else
        j += 1
      end
    end
    break unless found

    yield found
    i = j
  end
end

.encode_pdf_name(name) ⇒ Object

Encode a string as a PDF name, escaping special characters with hex encoding. PDF names must escape: # ( ) < > [ ] { } / % and control characters. Example: “(Two Hr) Priority 2” becomes “/#28Two Hr#29 Priority 2”.



161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/corp_pdf/dict_scan.rb', line 161

# Build a PDF name token ("/...") from +name+.
#
# A leading "/" on the input is dropped and re-added. The text is passed
# through transliterate_to_ascii, then every byte that is one of
# # ( ) < > [ ] { } / %, a control character (0x00-0x1F, 0x7F) or a
# non-ASCII byte (0x80-0xFF) is written as "#" plus two uppercase hex digits.
# All other bytes, including the space character, are emitted unchanged.
# NOTE(review): ISO 32000-1 §7.3.5 expects white space inside a name to be
# written as #20 — confirm whether leaving spaces raw is intentional.
#
# @param name [#to_s] name text, with or without a leading slash
# @return [String] encoded name token starting with "/"
def encode_pdf_name(name)
  raw = name.to_s
  raw = raw[1..] if raw.start_with?("/")

  ascii_name = transliterate_to_ascii(raw)
  specials = '#()<>[]{}/%'

  body = ascii_name.each_byte.map do |byte|
    needs_hex = byte < 0x20 || byte >= 0x7F || specials.include?(byte.chr)
    needs_hex ? "##{byte.to_s(16).upcase.rjust(2, '0')}" : byte.chr
  end.join

  "/#{body}"
end

.encode_pdf_string(val) ⇒ Object



134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/corp_pdf/dict_scan.rb', line 134

# Encode a Ruby value as a PDF token.
#
# * true / false  -> "true" / "false"
# * Symbol        -> "/<symbol>" (no name escaping is applied)
# * String        -> literal string "( ... )" when the transliterated text is
#                    ASCII-only, otherwise a UTF-16BE hex string with BOM
# * anything else -> value.to_s
#
# @param val [Object] value to encode
# @return [String] PDF token text
def encode_pdf_string(val)
  case val
  when true then "true"
  when false then "false"
  when Symbol
    "/#{val}"
  when String
    # Transliterate special characters to ASCII to avoid encoding issues
    ascii_val = transliterate_to_ascii(val)

    if ascii_val.ascii_only?
      # Escape backslash and parentheses first, then line breaks. A raw CR
      # inside a literal string is read back as an end-of-line marker, so it
      # is escaped as well as LF.
      escaped = ascii_val.gsub(/([\\()])/, '\\\\\\1').gsub("\n", '\\n').gsub("\r", '\\r')
      "(#{escaped})"
    else
      # Ensure string is in UTF-8 before encoding to UTF-16BE
      utf8_str = ascii_val.encode("UTF-8", invalid: :replace, undef: :replace)
      utf16 = utf8_str.encode("UTF-16BE")
      # Join BOM and payload as raw bytes: interpolating a UTF-8 literal with
      # a UTF-16BE string raises Encoding::CompatibilityError.
      bytes = "\xFE\xFF".b + utf16.b
      "<#{bytes.unpack1('H*')}>"
    end
  else
    val.to_s
  end
end

.format_pdf_key(key) ⇒ Object

Format a metadata key as a PDF dictionary key (ensure it starts with /)



188
189
190
191
# File 'lib/corp_pdf/dict_scan.rb', line 188

# Turn a metadata key into a PDF dictionary key.
#
# @param key [#to_s] key with or without a leading slash
# @return [String] the key text, prefixed with "/" when it had none
def format_pdf_key(key)
  text = key.to_s
  return text if text.start_with?("/")

  "/#{text}"
end

.format_pdf_value(value) ⇒ Object

Format a metadata value appropriately for PDF



194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/corp_pdf/dict_scan.rb', line 194

# Format a Ruby value as PDF source text.
#
# * nil            -> "null" (an empty string would leave the key without a
#                     value and corrupt the enclosing dictionary)
# * Symbol         -> "/<symbol>", matching encode_pdf_string
# * Integer, Float -> value.to_s
# * String         -> used verbatim when it already starts with "(", "<" or
#                     "/"; otherwise encoded via encode_pdf_string
# * Array          -> "[item item ...]" with each item formatted recursively
# * Hash           -> "<<\n  /Key value\n...\n>>" with keys passed through
#                     format_pdf_key and values formatted recursively
# * anything else  -> value.to_s (covers true / false)
#
# @param value [Object] value to format
# @return [String] PDF source text
def format_pdf_value(value)
  case value
  when nil
    "null"
  when Symbol
    "/#{value}"
  when Integer, Float
    value.to_s
  when String
    # If it looks like a PDF token already (string, hex string/dict, name), use as-is
    if value.start_with?("(", "<", "/")
      value
    else
      encode_pdf_string(value)
    end
  when Array
    "[#{value.map { |v| format_pdf_value(v) }.join(' ')}]"
  when Hash
    entries = value.map do |k, v|
      "  #{format_pdf_key(k)} #{format_pdf_value(v)}"
    end
    "<<\n#{entries.join("\n")}\n>>"
  else
    value.to_s
  end
end

.is_multiline_field?(dict_body) ⇒ Boolean

Check if a field is multiline by checking /Ff flag bit 12 (0x1000)

Returns:

  • (Boolean)


417
418
419
420
421
422
423
424
425
426
# File 'lib/corp_pdf/dict_scan.rb', line 417

# Report whether the /Ff entry of a field dictionary has the 0x1000 bit set.
#
# The /Ff token is read with value_token_after and converted with to_i.
#
# @param dict_body [String, nil] field dictionary source text
# @return [Boolean] true when the 0x1000 bit is set; false when +dict_body+
#   is nil/false or no /Ff token is found
def is_multiline_field?(dict_body)
  flags_token = dict_body && value_token_after("/Ff", dict_body)
  return false unless flags_token

  # 0x1000 is the mask the original code documents as the multiline flag.
  (flags_token.to_i & 0x1000) != 0
end

.is_page?(body) ⇒ Boolean

Check if a body represents a page object (not /Type/Pages)

Returns:

  • (Boolean)


410
411
412
413
414
# File 'lib/corp_pdf/dict_scan.rb', line 410

# Check whether +body+ describes a single page object ("/Type /Page") rather
# than a page-tree node ("/Type /Pages").
#
# A plain substring test for "/Type /Page" cannot be used because it is also
# a prefix of "/Type /Pages"; the negative lookahead excludes that case.
#
# @param body [String, nil] object source text
# @return [Boolean]
def is_page?(body)
  return false unless body

  body.match?(%r{/Type\s*/Page(?!s)\b})
end

.is_widget?(body) ⇒ Boolean

Returns:

  • (Boolean)


403
404
405
406
407
# File 'lib/corp_pdf/dict_scan.rb', line 403

# Check whether +body+ contains a "/Subtype /Widget" entry (whitespace between
# the key and the value is optional).
#
# @param body [String, nil] object source text
# @return [Boolean] always a strict true/false
def is_widget?(body)
  return false unless body

  body.match?(%r{/Subtype\s*/Widget})
end

.parse_box(body, box_type) ⇒ Object

Parse a box array (MediaBox, CropBox, ArtBox, BleedBox, TrimBox, etc.). Returns a hash with keys :llx, :lly, :urx, :ury, or nil if not found/invalid.



430
431
432
433
434
435
436
437
438
439
# File 'lib/corp_pdf/dict_scan.rb', line 430

# Parse an inline box array such as "/MediaBox [0 0 612 792]".
#
# @param body [String, nil] object source text
# @param box_type [String] key name without the leading slash, e.g. "MediaBox"
# @return [Hash{Symbol => Float}, nil] { llx:, lly:, urx:, ury: } or nil when
#   the key has no inline array or the array does not hold exactly four numbers
def parse_box(body, box_type)
  return nil unless body

  # The /m flag lets the array span several lines, e.g. "[0 0\n612 792]".
  box_match = body.match(%r{/#{box_type}\s*\[(.*?)\]}m)
  return nil unless box_match

  box_values = box_match[1].scan(/[-+]?\d*\.?\d+/).map(&:to_f)
  return nil unless box_values.length == 4

  llx, lly, urx, ury = box_values
  { llx: llx, lly: lly, urx: urx, ury: ury }
end

.remove_appearance_stream(dict_body) ⇒ Object

Remove /AP (appearance stream) entry from a dictionary



442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
# File 'lib/corp_pdf/dict_scan.rb', line 442

# Remove the /AP entry (key and value) from a dictionary.
#
# Supported value forms:
# * inline dictionary  "/AP << ... >>" (nested dictionaries are balanced)
# * indirect reference "/AP 12 0 R"
# * any other single token recognised by value_token_after
#
# Whitespace following the removed value is dropped as well; text preceding
# the key is left untouched.
#
# @param dict_body [String, nil] dictionary source text
# @return [String, nil] the text without the /AP entry, or +dict_body+
#   unchanged when there is no /AP key, its value cannot be delimited, or the
#   result would no longer contain both "<<" and ">>"
def remove_appearance_stream(dict_body)
  return dict_body unless dict_body&.include?("/AP")

  ap_match = dict_body.match(%r{/AP(?=[\s(<\[/])})
  return dict_body unless ap_match

  value_start = ap_match.end(0)
  value_start += 1 while value_start < dict_body.length && dict_body[value_start] =~ /\s/
  return dict_body if value_start >= dict_body.length

  value_end =
    if dict_body[value_start, 2] == "<<"
      # Inline dictionary: walk to the ">>" that balances the opening "<<".
      balanced_end = nil
      depth = 0
      i = value_start
      while i < dict_body.length
        pair = dict_body[i, 2]
        if pair == "<<"
          depth += 1
          i += 2
        elsif pair == ">>"
          depth -= 1
          i += 2
          if depth.zero?
            balanced_end = i
            break
          end
        else
          i += 1
        end
      end
      # nil when unbalanced: removing just the key would leave a dangling value.
      balanced_end
    elsif (ref_match = dict_body.match(/\G\d+\s+\d+\s+R\b/, value_start))
      # Indirect reference: value_token_after would report only the object
      # number, which would leave "<gen> R" behind in the dictionary.
      ref_match.end(0)
    else
      ap_tok = value_token_after("/AP", dict_body)
      ap_tok && (value_start + ap_tok.length)
    end
  return dict_body unless value_end

  # Skip trailing whitespace after the value
  value_end += 1 while value_end < dict_body.length && dict_body[value_end] =~ /\s/

  result = "#{dict_body[0...ap_match.begin(0)]}#{dict_body[value_end..]}"

  # Return the original if the dictionary delimiters did not survive.
  return dict_body unless result.include?("<<") && result.include?(">>")

  result
end

.remove_ref_from_array(array_body, ref) ⇒ Object



356
357
358
359
# File 'lib/corp_pdf/dict_scan.rb', line 356

# Delete every occurrence of an indirect reference from a PDF array token.
#
# After the reference text is removed, whitespace directly after "[" and
# directly before "]" is trimmed; whitespace between remaining elements is
# left as it is.
#
# @param array_body [String] array source text
# @param ref [Array(Integer, Integer)] object number and generation number
# @return [String] array text without the reference
def remove_ref_from_array(array_body, ref)
  obj_num, gen_num = ref
  ref_pattern = /\b#{obj_num}\s+#{gen_num}\s+R\b/

  without_ref = array_body.gsub(ref_pattern, "")
  left_trimmed = without_ref.gsub(/\[\s+/, "[")
  left_trimmed.gsub(/\s+\]/, "]")
end

.remove_ref_from_inline_array(dict_body, key, ref) ⇒ Object



382
383
384
385
386
387
388
389
390
# File 'lib/corp_pdf/dict_scan.rb', line 382

# Remove an indirect reference from the inline array stored under +key+.
#
# @param dict_body [String] dictionary source text
# @param key [String] dictionary key including the leading slash, e.g. "/Kids"
# @param ref [Array(Integer, Integer)] object number and generation number
# @return [String, nil] updated dictionary text, or nil when +key+ is absent
#   or its value is not an inline array
def remove_ref_from_inline_array(dict_body, key, ref)
  return nil unless dict_body.include?(key)

  # Extract the inline array token after key, then rebuild
  arr_tok = value_token_after(key, dict_body)
  return nil unless arr_tok&.start_with?("[")

  # Locate the token that actually follows the key. Replacing the first
  # textual occurrence (String#sub) would edit the wrong entry when an
  # identical array appears earlier in the dictionary.
  key_end = dict_body.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})&.end(0) || 0
  start = dict_body.index(arr_tok, key_end)
  return nil unless start

  updated = dict_body.dup
  updated[start, arr_tok.length] = remove_ref_from_array(arr_tok, ref)
  updated
end

.replace_key_value(dict_src, key, new_token) ⇒ Object



297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# File 'lib/corp_pdf/dict_scan.rb', line 297

# Replace the value of +key+ inside a single dictionary source string
# ("<< ... >>"), inserting the key via upsert_key_value when it is missing.
#
# The replacement is position based: the text before and after the old value
# is preserved exactly. Inline dictionary values and indirect references are
# replaced as a whole, although value_token_after reports only "<<" or the
# object number for them.
#
# @param dict_src [String] dictionary source text
# @param key [String] dictionary key including the leading slash
# @param new_token [String] replacement value token
# @return [String] updated dictionary text; +dict_src+ unchanged when the old
#   value cannot be delimited or the result would lose "<<" / ">>"
def replace_key_value(dict_src, key, new_token)
  key_match = dict_src.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  return upsert_key_value(dict_src, key, new_token) unless key_match

  # Get the existing value token
  tok = value_token_after(key, dict_src)
  return upsert_key_value(dict_src, key, new_token) unless tok

  # Skip whitespace after key
  value_start = key_match.end(0)
  value_start += 1 while value_start < dict_src.length && dict_src[value_start] =~ /\s/

  # Verify the token matches at this position; otherwise fall back to upsert
  unless value_start < dict_src.length && dict_src[value_start, tok.length] == tok
    return upsert_key_value(dict_src, key, new_token)
  end

  value_length = tok.length
  if tok == "<<"
    # Inline dictionary: extend the span to the ">>" balancing the opener so
    # the old dictionary contents are not left behind.
    depth = 0
    i = value_start
    close = nil
    while i < dict_src.length
      pair = dict_src[i, 2]
      if pair == "<<"
        depth += 1
        i += 2
      elsif pair == ">>"
        depth -= 1
        i += 2
        if depth.zero?
          close = i
          break
        end
      else
        i += 1
      end
    end
    return dict_src unless close

    value_length = close - value_start
  elsif (ref_match = dict_src.match(/\G\d+\s+\d+\s+R\b/, value_start))
    # Indirect reference "12 0 R": replace all three parts, not just "12".
    value_length = ref_match.end(0) - value_start
  end

  # Replace using precise string slicing - this preserves everything exactly
  before = dict_src[0...value_start]
  after = dict_src[(value_start + value_length)..]
  result = "#{before}#{new_token}#{after}"

  # Dictionary corrupted - return original
  return dict_src unless result.include?("<<") && result.include?(">>")

  result
end

.strip_stream_bodies(pdf) ⇒ Object

— low-level string helpers ————————————————-



29
30
31
# File 'lib/corp_pdf/dict_scan.rb', line 29

# Replace the contents of every "stream ... endstream" section with a fixed
# placeholder so later text scans do not run over stream data.
#
# Matching is case-insensitive, spans lines, and requires "stream" to be
# followed by LF or CRLF.
#
# @param pdf [String] PDF source text
# @return [String] copy of +pdf+ with stream bodies replaced
def strip_stream_bodies(pdf)
  placeholder = "stream\nENDSTREAM_STRIPPED\nendstream"
  pdf.gsub(/stream\r?\n.*?endstream/mi, placeholder)
end

.transliterate_to_ascii(str) ⇒ Object

Transliterate a string to ASCII, converting special characters to their ASCII equivalents. Example: “María Valentina” -> “Maria Valentina”.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/corp_pdf/dict_scan.rb', line 12

# Convert +str+ to ASCII text.
#
# The input is first re-encoded to UTF-8 (invalid/undefined sequences are
# replaced), then handed to I18n.transliterate with the :en locale and an
# empty replacement string. If that call raises a StandardError, the UTF-8
# text is encoded to ASCII with replacement characters instead.
#
# @param str [Object] text to convert
# @return [Object] the converted String, or +str+ itself when it is not a String
def transliterate_to_ascii(str)
  return str unless str.is_a?(String)

  normalized = str.encode("UTF-8", invalid: :replace, undef: :replace)

  begin
    I18n.transliterate(normalized, locale: :en, replacement: "")
  rescue StandardError
    # Transliteration unavailable or failed: degrade to a plain ASCII encode.
    normalized.encode("ASCII", invalid: :replace, undef: :replace)
  end
end

.unescape_literal(s) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/corp_pdf/dict_scan.rb', line 61

# Resolve the backslash escapes of a PDF literal string body (the text between
# the outer parentheses).
#
# Handled escapes:
# * \n \r \t \b \f       -> the corresponding control character
# * \\ \( \)             -> the character itself
# * \d, \dd, \ddd        -> the byte with that octal value (digits 0-7 only;
#                           values above 0o377 keep their low eight bits
#                           instead of raising RangeError)
# * backslash + EOL      -> nothing (line continuation; EOL is LF, CR or CRLF)
# * backslash + other    -> the other character (the backslash is dropped)
#
# A lone backslash at the very end of the input is dropped.
#
# @param s [String] literal string body
# @return [String] unescaped text
def unescape_literal(s)
  out = +""
  i = 0
  while i < s.length
    ch = s[i]
    if ch != "\\"
      out << ch
      i += 1
      next
    end

    i += 1
    break if i >= s.length

    esc = s[i]
    case esc
    when "n" then out << "\n"
    when "r" then out << "\r"
    when "t" then out << "\t"
    when "b" then out << "\b"
    when "f" then out << "\f"
    when "\\", "(", ")" then out << esc
    when "\n"
      # Line continuation: the escaped line break is not part of the string.
    when "\r"
      # Line continuation with CR or CRLF.
      i += 1 if s[i + 1] == "\n"
    when /[0-7]/
      digits = esc.dup
      while digits.length < 3 && i + 1 < s.length && s[i + 1] =~ /[0-7]/
        i += 1
        digits << s[i]
      end
      out << (digits.to_i(8) & 0xFF).chr
    else
      out << esc
    end
    i += 1
  end
  out
end

.upsert_key_value(dict_src, key, token) ⇒ Object



338
339
340
341
# File 'lib/corp_pdf/dict_scan.rb', line 338

# Insert "key token" immediately after the first "<<" of +dict_src+.
#
# The block form of String#sub is used so that backslashes in +key+ or
# +token+ are copied literally rather than read as replacement escapes.
# Text without any "<<" is returned unchanged (as a copy).
#
# @param dict_src [String] dictionary source text
# @param key [String] dictionary key including the leading slash
# @param token [String] value token
# @return [String] dictionary text with the new entry added at the front
def upsert_key_value(dict_src, key, token)
  dict_src.sub("<<") { "<<#{key} #{token}" }
end

.value_token_after(key, dict_src) ⇒ Object



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# File 'lib/corp_pdf/dict_scan.rb', line 223

# Return the value token that follows the first occurrence of +key+.
#
# +key+ must be followed by whitespace or one of ( < [ / to count as a match.
# What is returned depends on the first character of the value:
# * "("  literal string up to the balancing ")" (escapes honoured)
# * "<<" only the two-character opener "<<" (the dictionary body is NOT
#        included)
# * "<"  hex string up to and including the next ">"
# * "["  array up to the balancing "]"; literal strings inside the array are
#        skipped as a unit so a bracket inside "(a]b)" does not end the array
# * "/"  name up to the next whitespace, delimiter or "/"
# * else the leading run of non-delimiter characters; for an indirect
#        reference such as "12 0 R" this is just "12"
#
# @param key [String] dictionary key including the leading slash
# @param dict_src [String] dictionary source text
# @return [String, nil] the token, or nil when the key is missing, nothing
#   follows it, or a string/array is unterminated
def value_token_after(key, dict_src)
  # Find key followed by delimiter (whitespace, (, <, [, /)
  match = dict_src.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  return nil unless match

  i = match.end(0)
  i += 1 while i < dict_src.length && dict_src[i] =~ /\s/
  return nil if i >= dict_src.length

  # Index just past the ")" closing the literal string that opens at +from+,
  # or nil when it is unterminated.
  literal_end = lambda do |from|
    depth = 0
    j = from
    while j < dict_src.length
      ch = dict_src[j]
      if ch == "\\"
        j += 2
        next
      end
      depth += 1 if ch == "("
      if ch == ")"
        depth -= 1
        return j + 1 if depth.zero?
      end
      j += 1
    end
    nil
  end

  case dict_src[i]
  when "("
    stop = literal_end.call(i)
    stop ? dict_src[i...stop] : nil
  when "<"
    if dict_src[i, 2] == "<<"
      "<<"
    else
      j = dict_src.index(">", i)
      j ? dict_src[i..j] : nil
    end
  when "["
    # Array token - find matching closing bracket
    depth = 0
    j = i
    while j < dict_src.length
      ch = dict_src[j]
      if ch == "("
        # Jump over a complete literal string; an unterminated one is
        # scanned character by character like any other text.
        stop = literal_end.call(j)
        if stop
          j = stop
          next
        end
      elsif ch == "["
        depth += 1
      elsif ch == "]"
        depth -= 1
        return dict_src[i..j] if depth.zero?
      end
      j += 1
    end
    nil
  when "/"
    # PDF name token - extract until whitespace or delimiter
    j = i
    while j < dict_src.length
      ch = dict_src[j]
      break if ch =~ /[\s<>\[\]()]/ || (ch == "/" && j > i)

      j += 1
    end
    j > i ? dict_src[i...j] : nil
  else
    # atom
    m = %r{\A([^\s<>\[\]()/%]+)}.match(dict_src[i..])
    m ? m[1] : nil
  end
end