Class: PDF::Reader::Encoding::IdentityH

Inherits:
PDF::Reader::Encoding show all
Defined in:
lib/pdf/reader/encoding.rb

Constant Summary

Constants inherited from PDF::Reader::Encoding

UNKNOWN_CHAR

Instance Attribute Summary

Attributes inherited from PDF::Reader::Encoding

#differences

Instance Method Summary collapse

Methods inherited from PDF::Reader::Encoding

factory

Instance Method Details

#to_utf8(str, tounicode = nil) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/pdf/reader/encoding.rb', line 107

# Convert a string of 2-byte character codes into a UTF-8 string.
#
# str       - the raw string. It is read as a sequence of 16-bit big-endian
#             unsigned integers (unpack "n*"); a trailing odd byte does not
#             form a full chunk and is ignored.
# tounicode - optional object responding to decode(num). Its result is used
#             as the Unicode codepoint for that chunk; a nil/false result
#             means "no mapping".
#
# Without a ToUnicode CMap it's impossible to reliably convert this text to
# unicode, so every chunk that cannot be decoded (including every chunk when
# tounicode is nil) becomes PDF::Reader::Encoding::UNKNOWN_CHAR.
#
# Returns the converted String.
def to_utf8(str, tounicode = nil)
  unknown = PDF::Reader::Encoding::UNKNOWN_CHAR

  # One pass: interpret each 2-byte chunk as an int and map it to a codepoint,
  # falling back to the placeholder when there is no CMap or no mapping.
  codepoints = str.unpack("n*").map do |num|
    (tounicode && tounicode.decode(num)) || unknown
  end

  # pack all our Unicode codepoints into a UTF-8 string
  ret = codepoints.pack("U*")

  # set the string's encoding correctly under ruby 1.9+ (the method does not
  # exist on older rubies, hence the respond_to? guard)
  ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

  ret
end