Class: PDF::Reader::Encoding::WinAnsiEncoding

Inherits:

PDF::Reader::Encoding

Object
PDF::Reader::Encoding
PDF::Reader::Encoding::WinAnsiEncoding

show all

Defined in: lib/pdf/reader/encoding.rb

Constant Summary

Constants inherited from PDF::Reader::Encoding

UNKNOWN_CHAR

Instance Attribute Summary

Attributes inherited from PDF::Reader::Encoding

#differences

Instance Method Summary collapse

#to_utf8(str, tounicode = nil) ⇒ Object

convert a WinAnsiEncoding string into UTF-8.

Methods inherited from PDF::Reader::Encoding

factory

Instance Method Details

#to_utf8(str, tounicode = nil) ⇒ `Object`

convert a WinAnsiEncoding string into UTF-8

# File 'lib/pdf/reader/encoding.rb', line 751

# Convert a WinAnsiEncoding (Windows-1252 / CP1252) byte string into UTF-8.
#
# Each byte of +str+ is read as an unsigned 8-bit value. Bytes in the
# 0x80..0x9F range that CP1252 assigns to printable characters are remapped
# to their Unicode codepoints; every other value is passed through unchanged
# (for bytes outside that range CP1252 agrees with ISO-8859-1, whose byte
# values equal their Unicode codepoints).
#
# str       - the WinAnsi encoded String to convert.
# tounicode - accepted for interface compatibility; not used by this method.
#
# Returns a new String, tagged as UTF-8 on rubies that support
# String#force_encoding.
def to_utf8(str, tounicode = nil)
  # content of this method borrowed from REXML::Encoding.decode_cp1252
  # for further reading:
  # http://www.intertwingly.net/stories/2004/04/14/i18n.html
  cp1252_bytes = str.unpack('C*')

  # NOTE(review): process_differences is defined outside this listing;
  # presumably it applies the per-font differences table - confirm.
  cp1252_bytes = self.process_differences(cp1252_bytes)

  array_enc = cp1252_bytes.map do |num|
    case num
    # characters that were added compared to iso-8859-1
    when 0x80 then 0x20AC # euro sign
    when 0x82 then 0x201A # single low-9 quotation mark
    when 0x83 then 0x0192 # latin small letter f with hook
    when 0x84 then 0x201E # double low-9 quotation mark
    when 0x85 then 0x2026 # horizontal ellipsis
    when 0x86 then 0x2020 # dagger
    when 0x87 then 0x2021 # double dagger
    when 0x88 then 0x02C6 # modifier letter circumflex accent
    when 0x89 then 0x2030 # per mille sign
    when 0x8A then 0x0160 # latin capital letter S with caron
    when 0x8B then 0x2039 # single left-pointing angle quotation mark
    when 0x8C then 0x0152 # latin capital ligature OE
    when 0x8E then 0x017D # latin capital letter Z with caron
    when 0x91 then 0x2018 # left single quotation mark
    when 0x92 then 0x2019 # right single quotation mark
    when 0x93 then 0x201C # left double quotation mark
    when 0x94 then 0x201D # right double quotation mark
    when 0x95 then 0x2022 # bullet
    when 0x96 then 0x2013 # en dash
    when 0x97 then 0x2014 # em dash
    when 0x98 then 0x02DC # small tilde
    when 0x99 then 0x2122 # trade mark sign
    when 0x9A then 0x0161 # latin small letter s with caron
    when 0x9B then 0x203A # single right-pointing angle quotation mark
    when 0x9C then 0x0153 # latin small ligature oe (was wrongly 0x0152)
    when 0x9E then 0x017E # latin small letter z with caron
    when 0x9F then 0x0178 # latin capital letter Y with diaeresis
    else num
    end
  end

  # convert any glyph names to unicode codepoints
  array_enc = self.process_glyphnames(array_enc)

  # replace characters that didn't convert to unicode nicely with something valid
  array_enc.map! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }

  # pack all our Unicode codepoints into a UTF-8 string
  ret = array_enc.pack("U*")

  # set the strings encoding correctly under ruby 1.9+
  ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

  ret
end

Class: PDF::Reader::Encoding::WinAnsiEncoding

Constant Summary

Constants inherited from PDF::Reader::Encoding

Instance Attribute Summary

Attributes inherited from PDF::Reader::Encoding

Instance Method Summary collapse

Methods inherited from PDF::Reader::Encoding

Instance Method Details

#to_utf8(str, tounicode = nil) ⇒ Object

#to_utf8(str, tounicode = nil) ⇒ `Object`