Class: PDF::Reader::Encoding::IdentityH

Inherits:

PDF::Reader::Encoding

Object
PDF::Reader::Encoding
PDF::Reader::Encoding::IdentityH

show all

Defined in: lib/pdf/reader/encoding.rb

Constant Summary

Constants inherited from PDF::Reader::Encoding

UNKNOWN_CHAR

Instance Attribute Summary

Attributes inherited from PDF::Reader::Encoding

#differences

Instance Method Summary collapse

#to_utf8(str, tounicode = nil) ⇒ Object

Methods inherited from PDF::Reader::Encoding

factory

Instance Method Details

#to_utf8(str, tounicode = nil) ⇒ `Object`

# File 'lib/pdf/reader/encoding.rb', line 107

# Convert a string of 2-byte character codes into a UTF-8 string.
#
# str       - the raw text; it is consumed two bytes at a time, each pair
#             being read as an unsigned big-endian integer.
# tounicode - optional object responding to decode(int). A truthy result is
#             taken as the Unicode codepoint for that character code.
#
# Any code that cannot be mapped (no tounicode was supplied, or decode
# returned nil/false) is emitted as PDF::Reader::Encoding::UNKNOWN_CHAR,
# since there is no reliable way to recover the text without a mapping.
#
# Returns the converted String.
def to_utf8(str, tounicode = nil)
  codepoints = str.unpack("n*").map do |char_code|
    mapped = tounicode && tounicode.decode(char_code)
    mapped || PDF::Reader::Encoding::UNKNOWN_CHAR
  end

  utf8 = codepoints.pack("U*")

  # String#force_encoding only exists on ruby 1.9+, so tag the encoding
  # only when the method is available.
  utf8.force_encoding("UTF-8") if utf8.respond_to?(:force_encoding)
  utf8
end

Class: PDF::Reader::Encoding::IdentityH

Constant Summary

Constants inherited from PDF::Reader::Encoding

Instance Attribute Summary

Attributes inherited from PDF::Reader::Encoding

Instance Method Summary collapse

Methods inherited from PDF::Reader::Encoding

Instance Method Details

#to_utf8(str, tounicode = nil) ⇒ Object

#to_utf8(str, tounicode = nil) ⇒ `Object`