Class: PDF::Reader::Encoding

Inherits:

Object

Object
PDF::Reader::Encoding

Defined in: lib/pdf/reader/encoding.rb

Overview

Util class for working with string encodings in PDF files. Mostly used to convert strings of various PDF-dialect encodings into UTF-8.

Constant Summary collapse

CONTROL_CHARS = :nodoc:

[0,1,2,3,4,5,6,7,8,11,12,14,15,16,17,18,19,20,21,22,23,
24,25,26,27,28,29,30,31]

UNKNOWN_CHAR = ▯

0x25AF

Instance Attribute Summary collapse

#unpack ⇒ Object readonly

Returns the value of attribute unpack.

Instance Method Summary collapse

#differences ⇒ Object
#differences=(diff) ⇒ Object

set the differences table for this encoding.
#initialize(enc) ⇒ Encoding constructor

A new instance of Encoding.
#int_to_name(glyph_code) ⇒ Object

convert an integer glyph code into an Adobe glyph name.
#int_to_utf8_string(glyph_code) ⇒ Object
#to_utf8(str) ⇒ Object

convert the specified string to utf8.

Constructor Details

#initialize(enc) ⇒ `Encoding`

Returns a new instance of Encoding.

# File 'lib/pdf/reader/encoding.rb', line 40

# Build an encoding from a PDF encoding specification.
#
# enc - one of:
#       * a Hash, whose :Encoding (or, failing that, :BaseEncoding) entry
#         names the encoding and whose optional :Differences entry is
#         handed to #differences=;
#       * any truthy object that responds to #to_sym, which names the
#         encoding directly;
#       * anything else (including nil), which selects :StandardEncoding.
#
# NOTE(review): a Hash with neither :Encoding nor :BaseEncoding leaves
# @enc_name as nil; get_unpack and get_mapping_file are then called with
# nil - confirm they handle that.
def initialize(enc)
  @mapping = default_mapping # character code => Unicode codepoint
  @string_cache = {}         # character code => UTF-8 string

  from_hash = enc.kind_of?(Hash)

  @enc_name =
    if from_hash
      enc[:Encoding] || enc[:BaseEncoding]
    elsif enc && enc.respond_to?(:to_sym)
      enc.to_sym
    else
      :StandardEncoding
    end

  @unpack = get_unpack(@enc_name)
  @map_file = get_mapping_file(@enc_name)
  load_mapping(@map_file) if @map_file

  # Per-font overrides are applied last, on top of whatever mapping was
  # loaded for the named encoding.
  overrides = from_hash ? enc[:Differences] : nil
  self.differences = overrides if overrides
end

Instance Attribute Details

#unpack ⇒ `Object` (readonly)

Returns the value of attribute unpack.



38
39
40

# File 'lib/pdf/reader/encoding.rb', line 38

# Reader for @unpack, which the constructor assigns from
# get_unpack(@enc_name). #to_utf8 passes this value to String#unpack as the
# format argument when splitting a raw string into character codes.
def unpack
  @unpack
end

Instance Method Details

#differences ⇒ `Object`

# File 'lib/pdf/reader/encoding.rb', line 88

# The table of overrides recorded by #differences=, keyed by character code
# with the glyph name as the value. An empty Hash is created and remembered
# on first access when no table has been assigned yet.
#
# Besides the spec tests, #int_to_name consults this table.
def differences
  return @differences if @differences

  @differences = {}
end

#differences=(diff) ⇒ `Object`

Set the differences table for this encoding. The argument should be an array in the following format:

[25, :A, 26, :B]

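The array alternates between a decimal byte number and a glyph name to map to that byte.

To save space, the following array is also valid and equivalent to the previous one, because a glyph name that is not preceded by a number is assigned to the byte after the previous name's: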

[25, :A, :B]

# File 'lib/pdf/reader/encoding.rb', line 71

# Set the differences table for this encoding.
#
# diff - an Array mixing character codes and glyph names, for example
#        [25, :A, 26, :B]. A Numeric entry sets the current code; each
#        glyph name that follows is assigned to the current code, which
#        then advances by one, so [25, :A, :B] is equivalent.
#
# Every non-numeric entry consumes a code, including one whose name the
# glyph list cannot resolve. Such an entry is left out of the table and of
# @mapping, but the names after it still land on the codes the array
# intended.
#
# The argument is first checked with PDF::Reader::Error.validate_type.
#
# Returns the Hash of code => glyph name overrides that were applied.
def differences=(diff)
  PDF::Reader::Error.validate_type(diff, "diff", Array)

  @differences = {}
  byte = 0
  diff.each do |val|
    if val.kind_of?(Numeric)
      byte = val.to_i
      next
    end

    codepoint = glyphlist.name_to_unicode(val)
    if codepoint
      @differences[byte] = val
      @mapping[byte] = codepoint
    end

    # Advance even when the name is unknown, otherwise the rest of this run
    # would shift down by one code.
    byte += 1
  end
  @differences
end

#int_to_name(glyph_code) ⇒ `Object`

convert an integer glyph code into an Adobe glyph name.

int_to_name(65)
=> [:A]

# File 'lib/pdf/reader/encoding.rb', line 121

# Convert an integer glyph code into an Array of Adobe glyph names.
#
#   int_to_name(65)
#   => [:A]
#
# Lookup order:
#   1. the Identity-H and Identity-V encodings always yield [];
#   2. an entry in the differences table wins and is returned as a
#      one-element Array;
#   3. otherwise the codepoint in @mapping is handed to
#      glyphlist.unicode_to_name;
#   4. with no mapping at all the result is [].
def int_to_name(glyph_code)
  return [] if @enc_name == :"Identity-H" || @enc_name == :"Identity-V"

  override = differences[glyph_code]
  return [override] if override

  codepoint = @mapping[glyph_code]
  codepoint ? glyphlist.unicode_to_name(codepoint) : []
end

#int_to_utf8_string(glyph_code) ⇒ `Object`



112
113
114

# File 'lib/pdf/reader/encoding.rb', line 112

# Return the result of internal_int_to_utf8_string for glyph_code,
# remembering truthy results in @string_cache so later calls with the same
# code skip the conversion. A nil or false result is stored but not treated
# as a hit, so it is recomputed on the next call.
def int_to_utf8_string(glyph_code)
  cached = @string_cache[glyph_code]
  return cached if cached

  @string_cache[glyph_code] = internal_int_to_utf8_string(glyph_code)
end

#to_utf8(str) ⇒ `Object`

convert the specified string to utf8

- unpack raw bytes into codepoints
- replace any that have entries in the differences table with a glyph name
- convert codepoints from source encoding to Unicode codepoints
- convert any glyph names to Unicode codepoints
- replace characters that didn’t convert to Unicode nicely with something valid
- pack the final array of Unicode codepoints into a UTF-8 string
- mark the string as UTF-8 if we’re running on an M17N-aware VM

# File 'lib/pdf/reader/encoding.rb', line 104

# Convert the specified string to UTF-8.
#
# When utf8_conversion_impossible? is false the work is delegated to
# convert_to_utf8. Otherwise str is split with String#unpack using this
# encoding's unpack format, and little_boxes is called with the number of
# values that produced.
#
# NOTE(review): little_boxes presumably builds a run of UNKNOWN_CHAR
# placeholders of that length - confirm against its definition.
def to_utf8(str)
  return convert_to_utf8(str) unless utf8_conversion_impossible?

  glyph_count = str.unpack(unpack).size
  little_boxes(glyph_count)
end

Class: PDF::Reader::Encoding

Overview

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(enc) ⇒ Encoding

Instance Attribute Details

#unpack ⇒ Object (readonly)

Instance Method Details

#differences ⇒ Object

#differences=(diff) ⇒ Object

#int_to_name(glyph_code) ⇒ Object

#int_to_utf8_string(glyph_code) ⇒ Object