Class: PDF::Reader::Encoding::IdentityH

Inherits:
PDF::Reader::Encoding show all
Defined in:
lib/pdf/reader/encoding.rb

Constant Summary

Constants inherited from PDF::Reader::Encoding

UNKNOWN_CHAR

Instance Attribute Summary

Attributes inherited from PDF::Reader::Encoding

#differences

Instance Method Summary collapse

Methods inherited from PDF::Reader::Encoding

factory

Instance Method Details

#to_utf8(str, map = nil) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/pdf/reader/encoding.rb', line 107

# Convert +str+ into a UTF-8 string.
#
# str - the source string; it is consumed as a sequence of big-endian,
#       unsigned 16-bit integers (two bytes per code).
# map - optional object responding to #decode(code); its return value is
#       used as the Unicode codepoint for that code.
#
# Returns a new String built from the resulting codepoints. Any code that
# cannot be translated (no map was supplied, or map.decode returned nil or
# false) becomes PDF::Reader::Encoding::UNKNOWN_CHAR.
def to_utf8(str, map = nil)
  # Without a ToUnicode CMap there is no reliable way to turn these codes
  # into Unicode, so every code falls back to the placeholder character
  # when no map is available.
  codepoints = str.unpack("n*").map do |code|
    decoded = map ? map.decode(code) : nil
    decoded || PDF::Reader::Encoding::UNKNOWN_CHAR
  end

  # pack the codepoints into a UTF-8 byte sequence
  utf8 = codepoints.pack("U*")

  # tag the string as UTF-8 on rubies that track string encodings (1.9+)
  utf8.force_encoding("UTF-8") if utf8.respond_to?(:force_encoding)

  utf8
end