Module: REXML::Encoding

Included in:
Output, Source, XMLDecl
Defined in:
lib/rexml/encoding.rb,
lib/rexml/encodings/ICONV.rb,
lib/rexml/encodings/UNILE.rb,
lib/rexml/encodings/UTF-8.rb,
lib/rexml/encodings/EUC-JP.rb,
lib/rexml/encodings/CP-1252.rb,
lib/rexml/encodings/US-ASCII.rb,
lib/rexml/encodings/SHIFT-JIS.rb,
lib/rexml/encodings/ISO-8859-1.rb,
lib/rexml/encodings/ISO-8859-15.rb

Constant Summary collapse

UTF_8 =

Native, default format is UTF-8, so it is declared here rather than in an encodings/ definition.

'UTF-8'
UTF_16 =
'UTF-16'
UNILE =
'UNILE'
@@__REXML_encoding_methods =
%q~
# Convert from UTF-8
#
# Encodes the UTF-8 string +content+ as ISO-8859-15 (Latin-9) and returns
# the packed byte string (one byte per emitted value).
#
# * Code points up to 0xFF that mean the same thing in ISO-8859-15 are
#   emitted as the byte of the same value.
# * The eight characters ISO-8859-15 added (euro sign, S/s/Z/z with caron,
#   OE/oe ligatures, Y with diaeresis) are emitted as the byte slot they
#   occupy in ISO-8859-15.
# * The eight ISO-8859-1 characters displaced from those slots, and every
#   code point above 0xFF, are emitted as a numeric character reference
#   (&#nnnn;) because they have no single-byte representation.
def to_iso_8859_15 content
  array_enc = []
  content.unpack('U*').each do |num|
    case num
      # shortcut first bunch basic characters
    when 0..0xA3 then array_enc << num
      # characters removed compared to iso-8859-1: append the bytes of the
      # entity, not the entity String itself, otherwise the final
      # pack('C*') raises a TypeError
    when 0xA4, 0xA6, 0xA8, 0xB4, 0xB8, 0xBC, 0xBD, 0xBE
      array_enc.concat "&\##{num};".unpack('C*')
      # characters added compared to iso-8859-1
    when 0x20AC then array_enc << 0xA4 # 0xe2 0x82 0xac
    when 0x0160 then array_enc << 0xA6 # 0xc5 0xa0
    when 0x0161 then array_enc << 0xA8 # 0xc5 0xa1
    when 0x017D then array_enc << 0xB4 # 0xc5 0xbd
    when 0x017E then array_enc << 0xB8 # 0xc5 0xbe
    when 0x0152 then array_enc << 0xBC # 0xc5 0x92
    when 0x0153 then array_enc << 0xBD # 0xc5 0x93
    when 0x0178 then array_enc << 0xBE # 0xc5 0xb8
    else
      # all remaining basic characters can be used directly
      if num <= 0xFF
        array_enc << num
      else
        # Numeric entity (&#nnnn;); technique credited to Stefan Scholl
        array_enc.concat "&\##{num};".unpack('C*')
      end
    end
  end
  array_enc.pack('C*')
end

# Convert to UTF-8
#
# Decodes the ISO-8859-15 (Latin-9) byte string +str+ and returns it packed
# as UTF-8. Every byte maps to the code point of the same value, except the
# eight slots where ISO-8859-15 differs from ISO-8859-1; those are remapped
# to the characters ISO-8859-15 assigns to them.
def from_iso_8859_15(str)
  array_enc = []
  str.unpack('C*').each do |num|
    case num
      # characters that differ compared to iso-8859-1
    when 0xA4 then array_enc << 0x20AC
    when 0xA6 then array_enc << 0x0160
    when 0xA8 then array_enc << 0x0161
    when 0xB4 then array_enc << 0x017D
    when 0xB8 then array_enc << 0x017E
    when 0xBC then array_enc << 0x0152
    when 0xBD then array_enc << 0x0153
    when 0xBE then array_enc << 0x0178
    else
      # identical in both character sets: byte value == code point
      array_enc << num
    end
  end
  array_enc.pack('U*')
end
~

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#encodingObject

ID —> Encoding name



22
23
24
# File 'lib/rexml/encoding.rb', line 22

# Reader for the encoding name held in @encoding.
# Returns whatever value is currently stored there (nil when unset).
def encoding
  return @encoding
end

Class Method Details

.apply(obj, enc) ⇒ Object



8
9
10
# File 'lib/rexml/encoding.rb', line 8

# Fetches the entry stored under +enc+ in @encoding_methods and invokes it
# with +obj+ through its [] operator (for a block stored by register this
# calls the block). Returns the result of that invocation.
#
# NOTE(review): an unregistered +enc+ presumably yields nil from the lookup,
# in which case the [] call raises NoMethodError -- confirm against callers.
def self.apply(obj, enc)
  handler = @encoding_methods[enc]
  handler[obj]
end

.encoding_method(enc) ⇒ Object



11
12
13
# File 'lib/rexml/encoding.rb', line 11

# Returns the entry stored under +enc+ in @encoding_methods without
# invoking it.
def self.encoding_method(enc)
  handler = @encoding_methods[enc]
  return handler
end

.register(enc, &block) ⇒ Object



5
6
7
# File 'lib/rexml/encoding.rb', line 5

# Stores the given block in @encoding_methods under the key +enc+,
# replacing any entry already present for that key. The stored block is
# also the value of the method.
def self.register(enc, &block)
  handler = block
  @encoding_methods[enc] = handler
end

Instance Method Details

#check_encoding(str) ⇒ Object



57
58
59
60
61
62
63
64
# File 'lib/rexml/encoding.rb', line 57

# Guesses the encoding of the XML text in +str+ and returns its name.
#
# Detection order:
# 1. leading bytes FE FF (big-endian byte order mark)    -> UTF_16
# 2. leading bytes FF FE (little-endian byte order mark) -> UNILE
# 3. the encoding pseudo-attribute of an XML declaration -> its value, upcased
# 4. anything else                                       -> UTF_8
def check_encoding str
  # We have to recognize UTF-16, LSB UTF-16, and UTF-8
  # Compare raw byte values rather than matching a binary regexp, so the
  # byte order mark test does not depend on how the string is tagged.
  bom = str.unpack('C2')
  return UTF_16 if bom == [0xFE, 0xFF]
  return UNILE if bom == [0xFF, 0xFE]
  # Group 1 is the quote around the version value, group 2 the quote around
  # the encoding value, group 3 the encoding name itself. The "?" of "<?xml"
  # must be escaped, otherwise it only makes the "<" optional.
  if str =~ /^\s*<\?xml\s*version\s*=\s*(['"]).*?\1\s*encoding\s*=\s*(["'])(.*?)\2/um
    return $3.upcase
  end
  UTF_8
end

#decode_ascii(str) ⇒ Object

Convert to UTF-8



19
20
21
# File 'lib/rexml/encodings/US-ASCII.rb', line 19

# Re-packs +str+ as UTF-8: each byte of the input is read as an unsigned
# 8-bit value and written back out as the code point of the same value.
def decode_ascii(str)
  byte_values = str.unpack('C*')
  byte_values.pack('U*')
end

#decode_iconv(str) ⇒ Object



6
7
8
# File 'lib/rexml/encodings/ICONV.rb', line 6

# Hands +str+ to Iconv.conv, asking for a conversion to UTF_8 from the
# encoding named by @encoding, and returns Iconv's result.
def decode_iconv(str)
  source_encoding = @encoding
  Iconv.conv(UTF_8, source_encoding, str)
end

#decode_unile(str) ⇒ Object



18
19
20
21
22
23
24
25
# File 'lib/rexml/encodings/UNILE.rb', line 18

# Decodes +str+ as a sequence of little-endian 16-bit units and returns the
# same values packed as UTF-8 code points.
#
# Each 16-bit unit is converted on its own; units are not combined into
# surrogate pairs. A dangling final byte (odd-length input) is ignored
# instead of raising, which is how the 'v' unpack directive treats it.
def decode_unile(str)
  # 'v' = unsigned 16-bit, little-endian: low byte first, then high byte
  str.unpack('v*').pack('U*')
end

#decode_utf8(str) ⇒ Object



7
8
9
# File 'lib/rexml/encodings/UTF-8.rb', line 7

# Identity conversion: +str+ is handed back unchanged (the very same
# object), since no transformation is applied here.
def decode_utf8(str)
  return str
end

#encode_ascii(content) ⇒ Object

Convert from UTF-8



4
5
6
7
8
9
10
11
12
13
14
15
16
# File 'lib/rexml/encodings/US-ASCII.rb', line 4

# Turns the UTF-8 string +content+ into 7-bit output. Code points up to
# 0x7F are kept as single bytes; every larger code point is replaced by the
# bytes of its decimal numeric character reference (&#nnnn;).
# Returns the packed byte string.
def encode_ascii content
  out_bytes = []
  content.unpack('U*').each do |codepoint|
    if codepoint > 0x7F
      # Numeric entity (&#nnnn;); technique credited to Stefan Scholl
      out_bytes.concat "&\##{codepoint};".unpack('C*')
    else
      out_bytes.push codepoint
    end
  end
  out_bytes.pack('C*')
end

#encode_iconv(content) ⇒ Object



10
11
12
# File 'lib/rexml/encodings/ICONV.rb', line 10

# Hands +content+ to Iconv.conv, asking for a conversion from UTF_8 to the
# encoding named by @encoding, and returns Iconv's result.
def encode_iconv(content)
  target_encoding = @encoding
  Iconv.conv(target_encoding, UTF_8, content)
end

#encode_unile(content) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
# File 'lib/rexml/encodings/UNILE.rb', line 3

# Encodes the UTF-8 string +content+ as little-endian 16-bit units and
# returns the packed byte string (two bytes per input character, low byte
# first).
#
# Code points that do not fit in 16 bits are replaced by a question mark.
# The replacement is written as the integer 0x3F rather than the character
# literal, because that literal is an Integer only on Ruby 1.8; on later
# versions it is a one-character String and cannot be packed as a byte.
def encode_unile content
  units = content.unpack("U*").map do |num|
    num > 0xFFFF ? 0x3F : num
  end
  # 'v' = unsigned 16-bit, little-endian
  units.pack('v*')
end

#encode_utf8(content) ⇒ Object



3
4
5
# File 'lib/rexml/encodings/UTF-8.rb', line 3

# Identity conversion: +content+ is handed back unchanged (the very same
# object), since no transformation is applied here.
def encode_utf8 content
  return content
end