Class: PDF::Reader::Encoding::WinAnsiEncoding

Inherits:

PDF::Reader::Encoding

Object
PDF::Reader::Encoding
PDF::Reader::Encoding::WinAnsiEncoding

show all

Defined in: lib/pdf/reader/encoding.rb

Constant Summary

Constants inherited from PDF::Reader::Encoding

UNKNOWN_CHAR

Instance Attribute Summary

Attributes inherited from PDF::Reader::Encoding

#differences

Instance Method Summary collapse

#to_utf8(str, tounicode = nil) ⇒ Object

Convert a WinAnsiEncoding string into UTF-8.

Methods inherited from PDF::Reader::Encoding

factory

Instance Method Details

#to_utf8(str, tounicode = nil) ⇒ `Object`

convert a WinAnsiEncoding string into UTF-8

# File 'lib/pdf/reader/encoding.rb', line 776

# Convert a WinAnsiEncoding (Windows code page 1252 style) byte string
# into a UTF-8 string.
#
# Each input byte is first passed through process_differences, then
# translated to a Unicode codepoint, passed through process_glyphnames,
# and finally packed into a UTF-8 string.
#
# @param str [String] the raw bytes to convert; read one byte at a time
# @param tounicode [#decode, nil] optional mapping object. When given, it
#   is the only source of codepoints: each byte is looked up with
#   +tounicode.decode(byte)+ and bytes it cannot decode become
#   PDF::Reader::Encoding::UNKNOWN_CHAR. When nil, the built-in table
#   below is used.
# @return [String] the converted text, tagged as UTF-8 where the running
#   Ruby supports string encodings
def to_utf8(str, tounicode = nil)
  # content of this method borrowed from REXML::Encoding.decode_cp1252
  # for further reading:
  # http://www.intertwingly.net/stories/2004/04/14/i18n.html
  byte_codes = process_differences(str.unpack('C*'))

  codepoints = []
  byte_codes.each do |num|
    if tounicode && (code = tounicode.decode(num))
      codepoints << code
    elsif tounicode
      # a mapping was supplied but has no entry for this byte
      codepoints << PDF::Reader::Encoding::UNKNOWN_CHAR
    else
      case num
      # bytes 0x80-0x9F are where this encoding differs from iso-8859-1
      when 0x80 then codepoints << 0x20AC # euro sign
      when 0x82 then codepoints << 0x201A # single low-9 quotation mark
      when 0x83 then codepoints << 0x0192 # latin small letter f with hook
      when 0x84 then codepoints << 0x201E # double low-9 quotation mark
      when 0x85 then codepoints << 0x2026 # horizontal ellipsis
      when 0x86 then codepoints << 0x2020 # dagger
      when 0x87 then codepoints << 0x2021 # double dagger
      when 0x88 then codepoints << 0x02C6 # modifier letter circumflex accent
      when 0x89 then codepoints << 0x2030 # per mille sign
      when 0x8A then codepoints << 0x0160 # latin capital letter S with caron
      when 0x8B then codepoints << 0x2039 # single left-pointing angle quotation mark
      when 0x8C then codepoints << 0x0152 # latin capital ligature OE
      when 0x8E then codepoints << 0x017D # latin capital letter Z with caron
      when 0x91 then codepoints << 0x2018 # left single quotation mark
      when 0x92 then codepoints << 0x2019 # right single quotation mark
      when 0x93 then codepoints << 0x201C # left double quotation mark
      when 0x94 then codepoints << 0x201D # right double quotation mark
      when 0x95 then codepoints << 0x2022 # bullet
      when 0x96 then codepoints << 0x2013 # en dash
      when 0x97 then codepoints << 0x2014 # em dash
      when 0x98 then codepoints << 0x02DC # small tilde
      when 0x99 then codepoints << 0x2122 # trade mark sign
      when 0x9A then codepoints << 0x0161 # latin small letter s with caron
      when 0x9B then codepoints << 0x203A # single right-pointing angle quotation mark
      # 0x9C is the *small* ligature oe (U+0153); it previously collided
      # with 0x8C by mapping to the capital ligature U+0152
      when 0x9C then codepoints << 0x0153 # latin small ligature oe
      when 0x9E then codepoints << 0x017E # latin small letter z with caron
      when 0x9F then codepoints << 0x0178 # latin capital letter Y with diaeresis
      else
        # every other byte is used as its own codepoint
        codepoints << num
      end
    end
  end

  # convert any glyph names to unicode codepoints
  codepoints = process_glyphnames(codepoints)

  # replace characters that didn't convert to unicode nicely with something valid
  codepoints.map! { |c| c || PDF::Reader::Encoding::UNKNOWN_CHAR }

  # pack all our Unicode codepoints into a UTF-8 string
  ret = codepoints.pack("U*")

  # set the strings encoding correctly under ruby 1.9+
  ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

  ret
end

Class: PDF::Reader::Encoding::WinAnsiEncoding

Constant Summary

Constants inherited from PDF::Reader::Encoding

Instance Attribute Summary

Attributes inherited from PDF::Reader::Encoding

Instance Method Summary collapse

Methods inherited from PDF::Reader::Encoding

Instance Method Details

#to_utf8(str, tounicode = nil) ⇒ Object

#to_utf8(str, tounicode = nil) ⇒ `Object`