Class: WTF8Fixer

Inherits:
Object
  • Object
show all
Defined in:
lib/wtf8-fixer.rb

Class Method Summary collapse

Class Method Details

.fix(string) ⇒ Object

Repairs a string whose bytes mix ISO-8859-1 and UTF-8 encoded text, returning a string that is UTF-8 throughout.

Example:

>> WTF8Fixer.fix('café ' + 'café'.encode('iso-8859-1').force_encoding('utf-8'))
=> 'café café'

Arguments:

string: (String)


12
13
14
# File 'lib/wtf8-fixer.rb', line 12

# Entry point: hands the bytes of +string+ to fix_bytes! and returns its result.
#
# String#bytes builds a new Array on every call, so the destructive
# fix_bytes! works on that copy and +string+ itself is left untouched.
#
# @example
#   WTF8Fixer.fix('café ' + 'café'.encode('iso-8859-1').force_encoding('utf-8'))
#   # => 'café café'
#
# @param string [String] text that may mix ISO-8859-1 and UTF-8 bytes
# @return [Object] whatever fix_bytes! returns for those bytes
def self.fix(string)
  fix_bytes!(string.bytes)
end

.fix_bytes!(input) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/wtf8-fixer.rb', line 18

# Drains +input+ from the front, re-encoding it chunk by chunk, and returns
# the collected bytes as a String tagged as UTF-8.
#
# For each chunk, unicode_to_unicode! is tried first when num_bits reports a
# sequence length greater than 1 that still fits in the remaining bytes. If
# that attempt is skipped or yields nothing, the leading byte goes through
# iso_to_unicode! instead.
#
# @param input [Array<Integer>] byte values; emptied by this call
# @return [String] the packed output bytes, force-encoded as UTF-8
def self.fix_bytes!(input)
  output = []

  until input.empty?
    width = num_bits(input.first)

    chunk = []
    chunk = unicode_to_unicode!(input) if width > 1 && width <= input.length
    # Nothing accepted as a multi-byte sequence: convert the leading byte on its own.
    chunk = iso_to_unicode!(input) if chunk.empty?

    # iso_to_unicode! may hand back a bare Integer; Array() normalises both shapes.
    output.concat(Array(chunk))
  end

  output.pack('C*').force_encoding('utf-8')
end

.iso_to_unicode!(input) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
# File 'lib/wtf8-fixer.rb', line 36

# Removes the first byte of +input+ and re-encodes it as UTF-8, treating the
# byte value as a code point in the range 0x00..0xFF.
#
# @param input [Array<Integer>] byte values; its first element is shifted off
# @return [Integer] the byte itself when its high bit is clear
# @return [Array<Integer>] a two-byte UTF-8 sequence when the high bit is set
def self.iso_to_unicode!(input)
  byte = input.shift & 0xff
  return byte if byte < 0x80

  # Code points 0x80..0xFF take two bytes: 110000xx 10xxxxxx.
  lead = 0xc0 | (byte >> 6)
  trail = 0x80 | (byte & 0x3f)
  [lead, trail]
end

.num_bits(b) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/wtf8-fixer.rb', line 61

# Classifies a byte by its high-order bit pattern.
#
# Despite the name, the non-zero results count bytes rather than bits:
#   10xxxxxx => 1 (continuation byte)
#   110xxxxx => 2
#   1110xxxx => 3
#   11110xxx => 4
# Every other byte (high bit clear, or 11111xxx) yields 0.
#
# @param b [Integer] byte value to classify
# @return [Integer] 0, 1, 2, 3 or 4 as listed above
def self.num_bits(b)
  return 1 if (b & 0xC0) == 0x80
  return 2 if (b & 0xE0) == 0xC0
  return 3 if (b & 0xF0) == 0xE0
  return 4 if (b & 0xF8) == 0xF0

  0
end

.unicode_to_unicode!(input) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/wtf8-fixer.rb', line 48

# Consumes one UTF-8 sequence from the front of +input+, but only when that
# sequence is well-formed.
#
# The first byte is classified with num_bits:
# * 0 - the byte is shifted off and returned on its own;
# * 1 - a continuation byte cannot start a sequence, so nothing is consumed;
# * n - the first n bytes are consumed only if together they form exactly one
#   valid UTF-8 character.
#
# Checking with String#valid_encoding? rejects bad continuation bytes and
# also the cases a bare continuation-bit test lets through: a sequence cut
# short by the end of +input+, overlong forms (leads 0xC0/0xC1), encoded
# surrogates, and code points above U+10FFFF. Rejecting them lets the caller
# fall back to another interpretation instead of emitting invalid UTF-8.
#
# @param input [Array<Integer>] byte values; mutated only when bytes are accepted
# @return [Integer] the shifted byte when num_bits reports 0 for it
# @return [Array<Integer>] the accepted sequence, or [] when nothing was consumed
def self.unicode_to_unicode!(input)
  n = num_bits input[0]
  return input.shift if n == 0
  return [] if n == 1

  # Array#first returns fewer than n bytes when input is too short; the
  # resulting incomplete character then fails validation below.
  candidate = input.first(n)
  return [] unless candidate.pack('C*').force_encoding('utf-8').valid_encoding?

  input.shift(n)
end