Class: PDF::Reader::Encoding::MacRomanEncoding

Inherits:
PDF::Reader::Encoding show all
Defined in:
lib/pdf/reader/encoding.rb

Overview

The default encoding for classic Mac OS (version 9 and earlier). See: en.wikipedia.org/wiki/Mac_OS_Roman

Constant Summary

Constants inherited from PDF::Reader::Encoding

UNKNOWN_CHAR

Instance Attribute Summary

Attributes inherited from PDF::Reader::Encoding

#differences

Instance Method Summary collapse

Methods inherited from PDF::Reader::Encoding

factory

Instance Method Details

#to_utf8(str, tounicode = nil) ⇒ Object

convert a MacRomanEncoding string into UTF-8



331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
# File 'lib/pdf/reader/encoding.rb', line 331

# Convert a Mac OS Roman encoded string into UTF-8.
#
# Each byte of +str+ is run through +process_differences+, then bytes in the
# range 0x80..0xFF are translated to their Unicode codepoints using the table
# below. Everything else (bytes below 0x80 and any non-byte entries returned
# by +process_differences+) is passed through untouched and handed to
# +process_glyphnames+. Entries that are still nil afterwards are replaced
# with PDF::Reader::Encoding::UNKNOWN_CHAR.
#
# @param str [String] the raw Mac OS Roman bytes to convert
# @param tounicode [Object, nil] accepted for signature compatibility; this
#   method does not use it
# @return [String] the converted text, tagged as UTF-8 where the running Ruby
#   supports String#force_encoding
def to_utf8(str, tounicode = nil)
  # content of this method borrowed from REXML::Encoding.decode_cp1252
  array_mac = str.unpack('C*')
  array_mac = self.process_differences(array_mac)
  array_enc = []
  array_mac.each do |num|
    case num
      # change necessary characters to equivalent Unicode codepoints
    when 0x80; array_enc << 0x00C4
    when 0x81; array_enc << 0x00C5
    when 0x82; array_enc << 0x00C7
    when 0x83; array_enc << 0x00C9
    when 0x84; array_enc << 0x00D1
    when 0x85; array_enc << 0x00D6
    when 0x86; array_enc << 0x00DC
    when 0x87; array_enc << 0x00E1
    when 0x88; array_enc << 0x00E0
    when 0x89; array_enc << 0x00E2
    when 0x8A; array_enc << 0x00E4
    when 0x8B; array_enc << 0x00E3
    when 0x8C; array_enc << 0x00E5
    when 0x8D; array_enc << 0x00E7
    when 0x8E; array_enc << 0x00E9
    when 0x8F; array_enc << 0x00E8
    when 0x90; array_enc << 0x00EA
    when 0x91; array_enc << 0x00EB
    when 0x92; array_enc << 0x00ED
    when 0x93; array_enc << 0x00EC
    when 0x94; array_enc << 0x00EE
    when 0x95; array_enc << 0x00EF
    when 0x96; array_enc << 0x00F1
    when 0x97; array_enc << 0x00F3
    when 0x98; array_enc << 0x00F2
    when 0x99; array_enc << 0x00F4
    when 0x9A; array_enc << 0x00F6
    when 0x9B; array_enc << 0x00F5
    when 0x9C; array_enc << 0x00FA
    when 0x9D; array_enc << 0x00F9
    when 0x9E; array_enc << 0x00FB
    when 0x9F; array_enc << 0x00FC
    when 0xA0; array_enc << 0x2020
    when 0xA1; array_enc << 0x00B0
    when 0xA2; array_enc << 0x00A2
    when 0xA3; array_enc << 0x00A3
    when 0xA4; array_enc << 0x00A7
    when 0xA5; array_enc << 0x2022
    when 0xA6; array_enc << 0x00B6
    when 0xA7; array_enc << 0x00DF
    when 0xA8; array_enc << 0x00AE
    when 0xA9; array_enc << 0x00A9
    when 0xAA; array_enc << 0x2122
    when 0xAB; array_enc << 0x00B4
    when 0xAC; array_enc << 0x00A8
    when 0xAD; array_enc << 0x2260
    when 0xAE; array_enc << 0x00C6
    when 0xAF; array_enc << 0x00D8
    when 0xB0; array_enc << 0x221E
    when 0xB1; array_enc << 0x00B1
    when 0xB2; array_enc << 0x2264
    when 0xB3; array_enc << 0x2265
    when 0xB4; array_enc << 0x00A5
    when 0xB5; array_enc << 0x00B5
    when 0xB6; array_enc << 0x2202
    when 0xB7; array_enc << 0x2211
    when 0xB8; array_enc << 0x220F
    when 0xB9; array_enc << 0x03C0
    when 0xBA; array_enc << 0x222B
    when 0xBB; array_enc << 0x00AA
    when 0xBC; array_enc << 0x00BA
    when 0xBD; array_enc << 0x03A9
    when 0xBE; array_enc << 0x00E6
    when 0xBF; array_enc << 0x00F8
    when 0xC0; array_enc << 0x00BF
    when 0xC1; array_enc << 0x00A1
    when 0xC2; array_enc << 0x00AC
    when 0xC3; array_enc << 0x221A
    when 0xC4; array_enc << 0x0192
    when 0xC5; array_enc << 0x2248
    when 0xC6; array_enc << 0x2206
    when 0xC7; array_enc << 0x00AB
    when 0xC8; array_enc << 0x00BB
    when 0xC9; array_enc << 0x2026
    when 0xCA; array_enc << 0x00A0
    when 0xCB; array_enc << 0x00C0
    when 0xCC; array_enc << 0x00C3
    when 0xCD; array_enc << 0x00D5
    when 0xCE; array_enc << 0x0152
    when 0xCF; array_enc << 0x0153
    when 0xD0; array_enc << 0x2013
    when 0xD1; array_enc << 0x2014
    when 0xD2; array_enc << 0x201C
    when 0xD3; array_enc << 0x201D
    when 0xD4; array_enc << 0x2018
    when 0xD5; array_enc << 0x2019
    when 0xD6; array_enc << 0x00F7
    when 0xD7; array_enc << 0x25CA
    when 0xD8; array_enc << 0x00FF
    when 0xD9; array_enc << 0x0178
    when 0xDA; array_enc << 0x2044
    when 0xDB; array_enc << 0x20AC
    when 0xDC; array_enc << 0x2039
    when 0xDD; array_enc << 0x203A
    when 0xDE; array_enc << 0xFB01
    when 0xDF; array_enc << 0xFB02
    when 0xE0; array_enc << 0x2021
    when 0xE1; array_enc << 0x00B7
    when 0xE2; array_enc << 0x201A
    when 0xE3; array_enc << 0x201E
    when 0xE4; array_enc << 0x2030
    when 0xE5; array_enc << 0x00C2
    when 0xE6; array_enc << 0x00CA
    when 0xE7; array_enc << 0x00C1
    when 0xE8; array_enc << 0x00CB
    when 0xE9; array_enc << 0x00C8
    when 0xEA; array_enc << 0x00CD
    when 0xEB; array_enc << 0x00CE
    when 0xEC; array_enc << 0x00CF
    when 0xED; array_enc << 0x00CC
    when 0xEE; array_enc << 0x00D3
    when 0xEF; array_enc << 0x00D4
    when 0xF0; array_enc << 0xF8FF
    when 0xF1; array_enc << 0x00D2
    when 0xF2; array_enc << 0x00DA
      # 0xF3 is U+00DB (U with circumflex); U+00D8 (O with stroke) belongs
      # to 0xAF only
    when 0xF3; array_enc << 0x00DB
    when 0xF4; array_enc << 0x00D9
    when 0xF5; array_enc << 0x0131
    when 0xF6; array_enc << 0x02C6
    when 0xF7; array_enc << 0x02DC
    when 0xF8; array_enc << 0x00AF
    when 0xF9; array_enc << 0x02D8
    when 0xFA; array_enc << 0x02D9
    when 0xFB; array_enc << 0x02DA
    when 0xFC; array_enc << 0x00B8
    when 0xFD; array_enc << 0x02DD
    when 0xFE; array_enc << 0x02DB
    when 0xFF; array_enc << 0x02C7
    else
      # everything outside 0x80..0xFF is passed through as-is
      array_enc << num
    end
  end

  # convert any glyph names to unicode codepoints
  array_enc = self.process_glyphnames(array_enc)

  # replace characters that didn't convert to unicode nicely with something valid
  array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }

  # pack all our Unicode codepoints into a UTF-8 string
  ret = array_enc.pack("U*")

  # set the strings encoding correctly under ruby 1.9+
  ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

  ret
end