Module: CorpPdf::DictScan
- Defined in:
- lib/corp_pdf/dict_scan.rb
Class Method Summary collapse
- .add_ref_to_array(array_body, ref) ⇒ Object
- .add_ref_to_inline_array(dict_body, key, ref) ⇒ Object
- .appearance_choice_for(new_value, dict_src) ⇒ Object
- .decode_pdf_string(token) ⇒ Object
- .each_dictionary(str) ⇒ Object
-
.encode_pdf_name(name) ⇒ Object
Encode a string as a PDF name, escaping special characters with hex encoding. PDF names must escape: # ( ) < > [ ] { } / % and control characters. Example: “(Two Hr) Priority 2” becomes “/#28Two Hr#29 Priority 2”.
- .encode_pdf_string(val) ⇒ Object
-
.format_pdf_key(key) ⇒ Object
Format a metadata key as a PDF dictionary key (ensure it starts with /).
-
.format_pdf_value(value) ⇒ Object
Format a metadata value appropriately for PDF.
-
.is_multiline_field?(dict_body) ⇒ Boolean
Check if a field is multiline by checking /Ff flag bit 12 (0x1000).
-
.is_page?(body) ⇒ Boolean
Check if a body represents a page object (not /Type/Pages).
- .is_widget?(body) ⇒ Boolean
-
.parse_box(body, box_type) ⇒ Object
Parse a box array (MediaBox, CropBox, ArtBox, BleedBox, TrimBox, etc.). Returns a hash with keys :llx, :lly, :urx, :ury, or nil if not found/invalid.
-
.remove_appearance_stream(dict_body) ⇒ Object
Remove /AP (appearance stream) entry from a dictionary.
- .remove_ref_from_array(array_body, ref) ⇒ Object
- .remove_ref_from_inline_array(dict_body, key, ref) ⇒ Object
- .replace_key_value(dict_src, key, new_token) ⇒ Object
-
.strip_stream_bodies(pdf) ⇒ Object
Low-level string helper: replaces the body of every stream…endstream section with a placeholder.
-
.transliterate_to_ascii(str) ⇒ Object
Transliterate a string to ASCII, converting special characters to their ASCII equivalents. Example: “María Valentina” -> “Maria Valentina”.
- .unescape_literal(s) ⇒ Object
- .upsert_key_value(dict_src, key, token) ⇒ Object
- .value_token_after(key, dict_src) ⇒ Object
Class Method Details
.add_ref_to_array(array_body, ref) ⇒ Object
361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 |
# File 'lib/corp_pdf/dict_scan.rb', line 361
#
# Append an indirect reference ("num gen R") to the text of a PDF array.
#
# @param array_body [String] array source text, e.g. "[1 0 R 2 0 R]"
# @param ref [Array(Integer, Integer)] object number and generation number
# @return [String] the array text with the reference added as the last item
def add_ref_to_array(array_body, ref)
  obj_num, gen_num = ref
  token = "#{obj_num} #{gen_num} R"
  trimmed = array_body.strip

  if trimmed == "[]"
    # Empty array: the reference becomes the only element.
    "[#{token}]"
  elsif trimmed.end_with?("]")
    # Drop the closing bracket, append the reference, then close again.
    "#{array_body.rstrip.chomp(']')} #{token}]"
  else
    # No closing bracket to work with: simply append.
    "#{array_body} #{token}"
  end
end
.add_ref_to_inline_array(dict_body, key, ref) ⇒ Object
392 393 394 395 396 397 398 399 400 401 |
# File 'lib/corp_pdf/dict_scan.rb', line 392
#
# Append an indirect reference to the inline array stored under +key+.
#
# The array is replaced at the position of the key's own value, so an
# identical array that appears earlier under a different key is left alone.
#
# @param dict_body [String] dictionary source text
# @param key [String] dictionary key including the leading slash, e.g. "/Kids"
# @param ref [Array(Integer, Integer)] object number and generation number
# @return [String, nil] the updated dictionary text, or nil when the key is
#   absent or its value is not an inline array
def add_ref_to_inline_array(dict_body, key, ref)
  return nil unless dict_body.include?(key)

  arr_tok = value_token_after(key, dict_body)
  return nil unless arr_tok&.start_with?("[")

  # Locate the key with the same delimiter rule value_token_after uses, then
  # find the token right after it instead of its first textual occurrence.
  key_match = dict_body.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  tok_start = key_match && dict_body.index(arr_tok, key_match.end(0))
  return nil unless tok_start

  updated = dict_body.dup
  updated[tok_start, arr_tok.length] = add_ref_to_array(arr_tok, ref)
  updated
end
.appearance_choice_for(new_value, dict_src) ⇒ Object
343 344 345 346 347 348 349 350 351 352 353 354 |
# File 'lib/corp_pdf/dict_scan.rb', line 343
#
# Pick the appearance state name ("/Yes" or "/Off") matching +new_value+.
#
# Detection is a plain substring check: the dictionary must mention "/AP",
# and the chosen state name must occur somewhere in the text.
#
# @param new_value [true, false, Symbol, String] true/:Yes/"Yes" or false/:Off/"Off"
# @param dict_src [String] dictionary source text
# @return [String, nil] "/Yes", "/Off", or nil when nothing applies
def appearance_choice_for(new_value, dict_src)
  return nil unless dict_src.include?("/AP")

  if [true, :Yes, "Yes"].include?(new_value)
    dict_src.include?("/Yes") ? "/Yes" : nil
  elsif [false, :Off, "Off"].include?(new_value)
    dict_src.include?("/Off") ? "/Off" : nil
  end
end
.decode_pdf_string(token) ⇒ Object
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/corp_pdf/dict_scan.rb', line 100
#
# Decode a PDF string token into a UTF-8 Ruby string.
#
# Literal strings "( ... )" are unescaped with unescape_literal; hex strings
# "< ... >" are unpacked (an odd trailing digit is padded with "0"). When the
# decoded bytes start with the UTF-16BE byte order mark (FE FF) the rest is
# converted from UTF-16BE, otherwise the bytes are converted from binary.
# Undecodable bytes are replaced instead of raising, so a truncated or
# malformed UTF-16 payload no longer aborts decoding.
#
# @param token [String, nil] raw token text
# @return [String, nil] decoded text; the stripped token itself for anything
#   that is not a string token; nil when +token+ is nil
def decode_pdf_string(token)
  return nil unless token

  t = token.strip
  raw =
    if t.start_with?("(") && t.end_with?(")")
      unescape_literal(t[1..-2]).b
    elsif t.start_with?("<") && t.end_with?(">")
      hex = t[1..-2].gsub(/\s+/, "")
      hex += "0" if hex.length.odd?
      [hex].pack("H*")
    end

  # Fallback: names, numbers, references, etc. are returned as-is.
  return t unless raw

  if raw.bytesize >= 2 && raw.getbyte(0) == 0xFE && raw.getbyte(1) == 0xFF
    raw.byteslice(2, raw.bytesize - 2)
       .force_encoding("UTF-16BE")
       .encode("UTF-8", invalid: :replace, undef: :replace)
  else
    raw.force_encoding("binary").encode("UTF-8", invalid: :replace, undef: :replace)
  end
end
.each_dictionary(str) ⇒ Object
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/corp_pdf/dict_scan.rb', line 33
#
# Yield every outermost "<< ... >>" dictionary found in +str+.
#
# Nested dictionaries are part of the enclosing dictionary's text and are not
# yielded on their own. Scanning stops at the first dictionary that is never
# closed. Delimiters are located with String#index rather than by inspecting
# every character; the order in which "<<" and ">>" are consumed is the same
# as a left-to-right scan.
#
# @param str [String] text to scan
# @yieldparam dict [String] dictionary text including its "<<" and ">>"
# @return [nil, Enumerator] nil after iterating; an Enumerator when no block
#   is given
def each_dictionary(str)
  return enum_for(:each_dictionary, str) unless block_given?

  cursor = 0
  while (open = str.index("<<", cursor))
    depth = 0
    next_open = open
    next_close = str.index(">>", open)
    finish = nil

    # Without any remaining ">>" the dictionary can never be balanced.
    while next_close
      if next_open && next_open < next_close
        depth += 1
        next_open = str.index("<<", next_open + 2)
      else
        depth -= 1
        if depth.zero?
          finish = next_close + 2
          break
        end
        # A "<<" cannot overlap the ">>" just consumed, so next_open is
        # still at or beyond the new scan position and needs no refresh.
        next_close = str.index(">>", next_close + 2)
      end
    end

    break unless finish

    yield str[open...finish]
    cursor = finish
  end
  nil
end
.encode_pdf_name(name) ⇒ Object
Encode a string as a PDF name, escaping special characters with hex encoding. PDF names must escape: # ( ) < > [ ] { } / % and control characters. Example: “(Two Hr) Priority 2” becomes “/#28Two Hr#29 Priority 2”.
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
# File 'lib/corp_pdf/dict_scan.rb', line 161
#
# Encode a value as a PDF name token.
#
# A leading "/" is dropped and re-added, the text is transliterated to ASCII,
# and every reserved character, control byte (0x00-0x1F, 0x7F) and non-ASCII
# byte (0x80-0xFF) is written as "#" followed by two uppercase hex digits.
#
# @param name [#to_s] name text, with or without the leading slash
# @return [String] the encoded name, always starting with "/"
def encode_pdf_name(name)
  raw = name.to_s
  raw = raw[1..] if raw.start_with?("/")

  ascii = transliterate_to_ascii(raw)

  # Characters that must be hex-escaped inside a name.
  reserved = ["#", "(", ")", "<", ">", "[", "]", "{", "}", "/", "%"]

  pieces = ascii.each_byte.map do |byte|
    literal = byte.chr
    needs_hex = reserved.include?(literal) || byte <= 0x1F || byte >= 0x7F
    needs_hex ? format("#%02X", byte) : literal
  end

  "/#{pieces.join}"
end
.encode_pdf_string(val) ⇒ Object
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/corp_pdf/dict_scan.rb', line 134
#
# Encode a Ruby value as a PDF token.
#
# Booleans become "true"/"false", Symbols become names ("/sym"), and Strings
# are transliterated to ASCII and written as a literal string with
# backslash, parentheses, line feed and carriage return escaped. A raw CR is
# escaped because PDF readers treat an unescaped end-of-line inside a literal
# string as a line feed. Text that is still non-ASCII after transliteration
# is written as a hex string in UTF-16BE with a byte order mark.
#
# @param val [Object] value to encode
# @return [String] PDF token text; val.to_s for any other type
def encode_pdf_string(val)
  case val
  when true then "true"
  when false then "false"
  when Symbol
    "/#{val}"
  when String
    ascii_val = transliterate_to_ascii(val)

    if ascii_val.ascii_only?
      escapes = { "\\" => "\\\\", "(" => "\\(", ")" => "\\)", "\n" => "\\n", "\r" => "\\r" }
      "(#{ascii_val.gsub(/[\\()\n\r]/, escapes)})"
    else
      utf8_str = ascii_val.encode("UTF-8", invalid: :replace, undef: :replace)
      # Join as binary: interpolating a UTF-16BE string into a UTF-8 literal
      # raises Encoding::CompatibilityError.
      bytes = "\xFE\xFF".b + utf8_str.encode("UTF-16BE").b
      "<#{bytes.unpack1('H*')}>"
    end
  else
    val.to_s
  end
end
.format_pdf_key(key) ⇒ Object
Format a metadata key as a PDF dictionary key (ensure it starts with /)
188 189 190 191 |
# File 'lib/corp_pdf/dict_scan.rb', line 188
#
# Turn a metadata key into a PDF dictionary key.
#
# @param key [#to_s] key with or without a leading slash
# @return [String] the key text, guaranteed to start with "/"
def format_pdf_key(key)
  text = key.to_s
  return text if text.start_with?("/")

  "/#{text}"
end
.format_pdf_value(value) ⇒ Object
Format a metadata value appropriately for PDF
194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 |
# File 'lib/corp_pdf/dict_scan.rb', line 194
#
# Format a metadata value as PDF source text.
#
# - nil becomes the PDF null object
# - Integers use their decimal form
# - Floats use Float#to_s unless that yields exponent notation, which PDF
#   numbers do not allow; those are written in fixed notation instead
# - Strings that already look like a PDF token (start with "(", "<" or "/")
#   pass through unchanged, other strings go through encode_pdf_string
# - Arrays and Hashes are formatted recursively
#
# @param value [Object] value to format
# @return [String] PDF token text; value.to_s for any other type
def format_pdf_value(value)
  case value
  when nil
    "null"
  when Integer
    value.to_s
  when Float
    text = value.to_s
    # e.g. 1.0e-05 -> "0.00001"; trailing zeros of the fixed form are dropped.
    text.include?("e") ? format("%.10f", value).sub(/\.?0+\z/, "") : text
  when String
    value.start_with?("(", "<", "/") ? value : encode_pdf_string(value)
  when Array
    # Array format: [item1 item2 item3]
    "[#{value.map { |item| format_pdf_value(item) }.join(' ')}]"
  when Hash
    # Dictionary format: << /Key1 value1 /Key2 value2 >>
    entries = value.map { |k, v| " #{format_pdf_key(k)} #{format_pdf_value(v)}" }
    "<<\n#{entries.join("\n")}\n>>"
  else
    value.to_s
  end
end
.is_multiline_field?(dict_body) ⇒ Boolean
Check if a field is multiline by checking /Ff flag bit 12 (0x1000)
417 418 419 420 421 422 423 424 425 426 |
# File 'lib/corp_pdf/dict_scan.rb', line 417
#
# Tell whether a field dictionary has the multiline flag set, i.e. whether
# the integer value of its /Ff entry has the 0x1000 bit set.
#
# @param dict_body [String, nil] dictionary source text
# @return [Boolean] false when the text is nil or has no /Ff value
def is_multiline_field?(dict_body)
  flags_token = dict_body && value_token_after("/Ff", dict_body)
  return false unless flags_token

  flags_token.to_i.anybits?(0x1000)
end
.is_page?(body) ⇒ Boolean
Check if a body represents a page object (not /Type/Pages)
410 411 412 413 414 |
# File 'lib/corp_pdf/dict_scan.rb', line 410
#
# Tell whether an object body describes a single page.
#
# Matches "/Type /Page" with any (or no) whitespace between the two names and
# rejects "/Type /Pages" as well as longer names that merely begin with
# "Page".
#
# @param body [String, nil] object body text
# @return [Boolean]
def is_page?(body)
  return false unless body

  body.match?(%r{/Type\s*/Page(?!s)\b})
end
.is_widget?(body) ⇒ Boolean
403 404 405 406 407 |
# File 'lib/corp_pdf/dict_scan.rb', line 403
#
# Tell whether an object body is a widget annotation, i.e. contains
# "/Subtype" followed by "/Widget" with any (or no) whitespace in between.
#
# @param body [String, nil] object body text
# @return [Boolean]
def is_widget?(body)
  return false unless body

  body.match?(%r{/Subtype\s*/Widget})
end
.parse_box(body, box_type) ⇒ Object
Parse a box array (MediaBox, CropBox, ArtBox, BleedBox, TrimBox, etc.). Returns a hash with keys :llx, :lly, :urx, :ury, or nil if not found/invalid.
430 431 432 433 434 435 436 437 438 439 |
# File 'lib/corp_pdf/dict_scan.rb', line 430
#
# Parse a box array (MediaBox, CropBox, ArtBox, BleedBox, TrimBox, ...).
#
# The array may span several lines; everything up to the first "]" is
# scanned for numbers.
#
# @param body [String, nil] object body text
# @param box_type [String] box name without the leading slash
# @return [Hash{Symbol => Float}, nil] :llx, :lly, :urx and :ury, or nil when
#   the box is missing or does not hold exactly four numbers
def parse_box(body, box_type)
  # [^\]]* instead of .*? so line breaks inside the array are accepted.
  found = %r{/#{box_type}\s*\[([^\]]*)\]}.match(body)
  return nil unless found

  box_values = found[1].scan(/[-+]?\d*\.?\d+/).map(&:to_f)
  return nil unless box_values.length == 4

  llx, lly, urx, ury = box_values
  { llx: llx, lly: lly, urx: urx, ury: ury }
end
.remove_appearance_stream(dict_body) ⇒ Object
Remove /AP (appearance stream) entry from a dictionary
442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 |
# File 'lib/corp_pdf/dict_scan.rb', line 442
#
# Remove the /AP (appearance) entry, key and value, from a dictionary.
#
# The value may be an inline dictionary ("<< ... >>", matched with nesting),
# an indirect reference ("5 0 R", removed as a whole), or any other single
# token as returned by value_token_after. Whitespace following the value is
# removed as well. The input is returned unchanged when there is no /AP key,
# when the value cannot be delimited (for example an unclosed inline
# dictionary), or when the result would no longer contain "<<" and ">>".
#
# @param dict_body [String, nil] dictionary source text
# @return [String, nil] dictionary text without the /AP entry
def remove_appearance_stream(dict_body)
  return dict_body unless dict_body&.include?("/AP")

  # The key must be followed by a delimiter so "/APx" is not mistaken for it.
  ap_match = dict_body.match(%r{/AP(?=[\s(<\[/])})
  return dict_body unless ap_match

  value_start = ap_match.end(0)
  value_start += 1 while value_start < dict_body.length && dict_body[value_start].match?(/\s/)
  return dict_body if value_start >= dict_body.length

  value_end =
    if dict_body[value_start, 2] == "<<"
      # Inline dictionary: walk to the matching ">>".
      dictionary_end = nil
      depth = 0
      i = value_start
      while i < dict_body.length
        pair = dict_body[i, 2]
        if pair == "<<"
          depth += 1
          i += 2
        elsif pair == ">>"
          depth -= 1
          i += 2
          if depth.zero?
            dictionary_end = i
            break
          end
        else
          i += 1
        end
      end
      dictionary_end
    elsif (ref = dict_body[value_start..].match(/\A\d+\s+\d+\s+R\b/))
      # Indirect reference: all three parts belong to the value.
      value_start + ref.end(0)
    else
      ap_tok = value_token_after("/AP", dict_body)
      ap_tok && (value_start + ap_tok.length)
    end
  return dict_body unless value_end

  # Swallow the whitespace that separated the value from the next entry.
  value_end += 1 while value_end < dict_body.length && dict_body[value_end].match?(/\s/)

  result = "#{dict_body[0...ap_match.begin(0)]}#{dict_body[value_end..]}"

  # Keep the original if the dictionary delimiters did not survive.
  return dict_body unless result.include?("<<") && result.include?(">>")

  result
end
.remove_ref_from_array(array_body, ref) ⇒ Object
356 357 358 359 |
# File 'lib/corp_pdf/dict_scan.rb', line 356
#
# Delete every occurrence of an indirect reference from the text of an array.
#
# Whitespace directly inside the brackets is trimmed afterwards; the gap left
# between two remaining items is not collapsed.
#
# @param array_body [String] array source text
# @param ref [Array(Integer, Integer)] object number and generation number
# @return [String] the array text without the reference
def remove_ref_from_array(array_body, ref)
  obj_num, gen_num = ref
  ref_pattern = /\b#{obj_num}\s+#{gen_num}\s+R\b/

  without_ref = array_body.gsub(ref_pattern, "")
  without_ref.gsub(/\[\s+/, "[").gsub(/\s+\]/, "]")
end
.remove_ref_from_inline_array(dict_body, key, ref) ⇒ Object
382 383 384 385 386 387 388 389 390 |
# File 'lib/corp_pdf/dict_scan.rb', line 382
#
# Remove an indirect reference from the inline array stored under +key+.
#
# The array is replaced at the position of the key's own value, so an
# identical array that appears earlier under a different key is left alone.
#
# @param dict_body [String] dictionary source text
# @param key [String] dictionary key including the leading slash, e.g. "/Kids"
# @param ref [Array(Integer, Integer)] object number and generation number
# @return [String, nil] the updated dictionary text, or nil when the key is
#   absent or its value is not an inline array
def remove_ref_from_inline_array(dict_body, key, ref)
  return nil unless dict_body.include?(key)

  arr_tok = value_token_after(key, dict_body)
  return nil unless arr_tok&.start_with?("[")

  # Locate the key with the same delimiter rule value_token_after uses, then
  # find the token right after it instead of its first textual occurrence.
  key_match = dict_body.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  tok_start = key_match && dict_body.index(arr_tok, key_match.end(0))
  return nil unless tok_start

  updated = dict_body.dup
  updated[tok_start, arr_tok.length] = remove_ref_from_array(arr_tok, ref)
  updated
end
.replace_key_value(dict_src, key, new_token) ⇒ Object
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 |
# File 'lib/corp_pdf/dict_scan.rb', line 297
#
# Replace the value stored under +key+ in a single dictionary source string.
#
# The replacement is done by position, so everything outside the old value
# is preserved exactly. Values made of more than one token are replaced as a
# whole: an inline dictionary up to its matching ">>", and an indirect
# reference ("5 0 R") including its generation number and "R". When the key
# or its value cannot be located the entry is inserted with
# upsert_key_value instead.
#
# @param dict_src [String] dictionary source text ("<< ... >>")
# @param key [String] dictionary key including the leading slash
# @param new_token [String] replacement value text
# @return [String] updated dictionary text; the original text when the old
#   value is an unclosed inline dictionary or the result would lose its
#   "<<"/">>" delimiters
def replace_key_value(dict_src, key, new_token)
  # The key must be followed by a delimiter so it is matched as a whole token.
  key_match = dict_src.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  return upsert_key_value(dict_src, key, new_token) unless key_match

  tok = value_token_after(key, dict_src)
  return upsert_key_value(dict_src, key, new_token) unless tok

  value_start = key_match.end(0)
  value_start += 1 while value_start < dict_src.length && dict_src[value_start].match?(/\s/)

  # The token must sit exactly where the key's value begins.
  unless value_start < dict_src.length && dict_src[value_start, tok.length] == tok
    return upsert_key_value(dict_src, key, new_token)
  end

  value_end = value_start + tok.length
  if tok == "<<"
    # value_token_after reports only the opening delimiter of an inline
    # dictionary; extend the span to the matching ">>".
    depth = 1
    closed = false
    while value_end < dict_src.length
      pair = dict_src[value_end, 2]
      if pair == "<<"
        depth += 1
        value_end += 2
      elsif pair == ">>"
        depth -= 1
        value_end += 2
        if depth.zero?
          closed = true
          break
        end
      else
        value_end += 1
      end
    end
    return dict_src unless closed
  elsif (ref = dict_src[value_start..].match(/\A\d+\s+\d+\s+R\b/))
    # Indirect reference: the token is only the object number.
    value_end = value_start + ref.end(0)
  end

  result = "#{dict_src[0...value_start]}#{new_token}#{dict_src[value_end..]}"

  # Keep the original if the dictionary delimiters did not survive.
  return dict_src unless result.include?("<<") && result.include?(">>")

  result
end
.strip_stream_bodies(pdf) ⇒ Object
Low-level string helper: replaces the body of every stream…endstream section with a placeholder.
29 30 31 |
# File 'lib/corp_pdf/dict_scan.rb', line 29
#
# Replace every "stream ... endstream" section with a fixed placeholder.
#
# The keywords are matched case-insensitively and the body may span lines;
# each section ends at the first "endstream" that follows its "stream" line.
#
# @param pdf [String] PDF source text
# @return [String] copy of the text with all stream bodies replaced
def strip_stream_bodies(pdf)
  stream_section = /stream\r?\n.*?endstream/mi
  placeholder = "stream\nENDSTREAM_STRIPPED\nendstream"

  pdf.gsub(stream_section, placeholder)
end
.transliterate_to_ascii(str) ⇒ Object
Transliterate a string to ASCII, converting special characters to their ASCII equivalents. Example: “María Valentina” -> “Maria Valentina”.
12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/corp_pdf/dict_scan.rb', line 12
#
# Convert a string to ASCII, mapping accented characters to plain ones
# (for example "María Valentina" -> "Maria Valentina").
#
# The text is first normalised to UTF-8 with invalid or undefined bytes
# replaced, then passed to I18n.transliterate using the :en locale and an
# empty replacement. If that call raises, the text is instead converted to
# ASCII with Ruby's own replacement of unconvertible characters.
#
# @param str [Object] text to convert
# @return [String, Object] ASCII text; non-String input is returned as given
def transliterate_to_ascii(str)
  return str unless str.is_a?(String)

  normalized = str.encode("UTF-8", invalid: :replace, undef: :replace)

  begin
    I18n.transliterate(normalized, locale: :en, replacement: "")
  rescue StandardError
    # Transliteration unavailable or failed: fall back to a plain conversion.
    normalized.encode("ASCII", invalid: :replace, undef: :replace)
  end
end
.unescape_literal(s) ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/corp_pdf/dict_scan.rb', line 61
#
# Resolve the backslash escapes of a PDF literal string body.
#
# Handles \n \r \t \b \f, escaped backslash and parentheses, octal escapes
# of one to three digits (digits 0-7 only; values above 0o377 keep their low
# eight bits), and line continuations (a backslash directly followed by an
# end-of-line, which produces no output). For any other escaped character
# the backslash is dropped and the character kept. A lone trailing backslash
# is discarded.
#
# @param s [String] text between the outer parentheses of a literal string
# @return [String] the unescaped text
def unescape_literal(s)
  out = +""
  i = 0
  while i < s.length
    ch = s[i]
    if ch != "\\"
      out << ch
      i += 1
      next
    end

    i += 1
    break if i >= s.length

    esc = s[i]
    case esc
    when "n" then out << "\n"
    when "r" then out << "\r"
    when "t" then out << "\t"
    when "b" then out << "\b"
    when "f" then out << "\f"
    when "\\", "(", ")" then out << esc
    when "\r"
      # Line continuation; CR LF counts as a single end-of-line.
      i += 1 if s[i + 1] == "\n"
    when "\n"
      # Line continuation: nothing is emitted.
    when /[0-7]/
      digits = esc.dup
      while digits.length < 3 && i + 1 < s.length && s[i + 1].match?(/[0-7]/)
        i += 1
        digits << s[i]
      end
      # Mask to one byte so three-digit values above 0o377 cannot raise.
      out << (digits.to_i(8) & 0xFF).chr
    else
      out << esc
    end
    i += 1
  end
  out
end
.upsert_key_value(dict_src, key, token) ⇒ Object
338 339 340 341 |
# File 'lib/corp_pdf/dict_scan.rb', line 338
#
# Insert a key/value entry directly after the first "<<" of a dictionary.
#
# @param dict_src [String] dictionary source text
# @param key [String] dictionary key including the leading slash
# @param token [String] value text
# @return [String] copy of the text with "key token" placed right after the
#   opening "<<"; an unchanged copy when the text has no "<<"
def upsert_key_value(dict_src, key, token)
  # One space separates the key from its value.
  opening_with_entry = "<<#{key} #{token}"

  dict_src.sub("<<") { opening_with_entry }
end
.value_token_after(key, dict_src) ⇒ Object
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 |
# File 'lib/corp_pdf/dict_scan.rb', line 223
#
# Return the value token that follows +key+ in a dictionary source string.
#
# The key must be followed by whitespace, "(", "<", "[" or "/". Depending on
# the first character of the value the result is:
# - "("  a literal string up to its balancing ")" (backslash escapes skipped)
# - "<<" just the two-character opening delimiter of an inline dictionary
# - "<"  a hex string up to and including the next ">"
# - "["  an array up to its balancing "]"
# - "/"  a name up to the next whitespace, bracket, parenthesis, angle
#        bracket or slash
# - otherwise the run of characters up to the next whitespace or delimiter
#   (for "5 0 R" this is only "5")
#
# @param key [String] dictionary key including the leading slash
# @param dict_src [String] dictionary source text
# @return [String, nil] the token text, or nil when the key is missing, has
#   no value, or the value is not terminated
def value_token_after(key, dict_src)
  key_hit = dict_src.match(%r{#{Regexp.escape(key)}(?=[\s(<\[/])})
  return nil unless key_hit

  limit = dict_src.length
  start = key_hit.end(0)
  start += 1 while start < limit && dict_src[start] =~ /\s/
  return nil if start >= limit

  case dict_src[start]
  when "("
    nesting = 0
    pos = start
    while pos < limit
      case dict_src[pos]
      when "\\"
        # Skip the backslash together with the character it escapes.
        pos += 2
        next
      when "("
        nesting += 1
      when ")"
        nesting -= 1
        return dict_src[start..pos] if nesting.zero?
      end
      pos += 1
    end
    nil
  when "<"
    return "<<" if dict_src[start, 2] == "<<"

    closer = dict_src.index(">", start)
    closer ? dict_src[start..closer] : nil
  when "["
    nesting = 0
    (start...limit).each do |pos|
      case dict_src[pos]
      when "["
        nesting += 1
      when "]"
        nesting -= 1
        return dict_src[start..pos] if nesting.zero?
      end
    end
    nil
  when "/"
    # The leading slash belongs to the name; a later slash starts a new one.
    stop = start + 1
    stop += 1 while stop < limit && dict_src[stop] !~ %r{[\s<>\[\]()/]}
    dict_src[start...stop]
  else
    # Plain atom such as a number or keyword.
    dict_src[start..][%r{\A[^\s<>\[\]()/%]+}]
  end
end