Module: TibyBytes::StringMixin

Included in:
String
Defined in:
lib/wayback_machine_downloader/tidy_bytes.rb

Instance Method Summary collapse

Instance Method Details

#tidy_bytes(force = false) ⇒ Object

Attempt to replace invalid UTF-8 bytes with valid ones. This method naively assumes that if you have invalid UTF-8 bytes, they are either Windows CP-1252 or ISO-8859-1. In practice this isn’t a bad assumption, but it may not always work.

Passing true will forcibly tidy all bytes, assuming that the string’s encoding is CP-1252 or ISO-8859-1.



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/wayback_machine_downloader/tidy_bytes.rb', line 48

# Attempt to replace invalid UTF-8 bytes with valid ones, on the assumption
# that any invalid byte is really Windows CP-1252 / ISO-8859-1 text.
#
# Bytes are re-encoded through +tidy_byte+, which is defined elsewhere in
# this file. NOTE(review): it is presumed to return an Array of UTF-8 bytes
# (or nil for bytes that have no mapping), which is why results are passed
# through +flatten.compact+ -- confirm against its definition.
#
# @param force [Boolean] when true, skip validation and send every byte
#   through +tidy_byte+
# @return [String, nil] the tidied string; nil when the receiver is empty or
#   when the tidied bytes still cannot be decoded as UTF-8
def tidy_bytes(force = false)
  if force
    forced = unpack("C*").map { |b| tidy_byte(b) }
    return forced.flatten.compact.pack("C*").unpack("U*").pack("U*")
  end

  bytes = unpack("C*")
  conts_expected = 0
  last_lead = 0

  # Re-encode the unfinished multibyte sequence held in bytes[last_lead...stop]
  # and stop expecting continuations. Those slots only ever contain the lead
  # byte plus the continuation bytes accepted so far, so every entry is still
  # a raw Integer when it is handed to tidy_byte.
  clean_pending = lambda do |stop|
    (last_lead...stop).each { |k| bytes[k] = tidy_byte(bytes[k]) }
    conts_expected = 0
  end

  bytes.each_index do |i|
    byte          = bytes[i]
    is_cont       = byte > 127 && byte < 192
    # 0xC0 and 0xC1 could only start overlong encodings; they are never valid.
    is_unused     = byte == 192 || byte == 193
    # 0xF5..0xFF would encode code points beyond U+10FFFF.
    is_restricted = byte > 244
    # 0xC2..0xF4, including the 4-byte leads 0xF1..0xF4.
    is_lead       = byte > 193 && byte < 245

    # Expected a continuation but got anything else? The sequence in progress
    # is broken: clean it before dealing with the current byte.
    clean_pending.call(i) if conts_expected > 0 && !is_cont

    if is_unused || is_restricted
      # Impossible byte? Clean it.
      bytes[i] = tidy_byte(byte)
    elsif is_cont
      # Not expecting a continuation byte? Clean it. Otherwise expect one less.
      if conts_expected.zero?
        bytes[i] = tidy_byte(byte)
      else
        conts_expected -= 1
      end
    elsif is_lead
      if i == bytes.length - 1
        # Final byte is a lead byte with nothing after it? Clean it.
        bytes[i] = tidy_byte(byte)
      else
        # Valid lead byte: the number of continuations follows from its range.
        conts_expected =
          if byte < 224
            1
          elsif byte < 240
            2
          else
            3
          end
        last_lead = i
      end
    end
  end

  # The string ended in the middle of a multibyte sequence? Clean the fragment.
  clean_pending.call(bytes.length) if conts_expected > 0

  return nil if bytes.empty?

  begin
    bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
  rescue ArgumentError
    # unpack("U*") rejected the bytes (e.g. an overlong sequence).
    nil
  end
end

#tidy_bytes!(force = false) ⇒ Object

Tidy bytes in-place.



107
108
109
# File 'lib/wayback_machine_downloader/tidy_bytes.rb', line 107

# Tidy bytes in-place.
#
# +tidy_bytes+ returns nil when the receiver is empty or its bytes cannot be
# decoded; in that case the receiver is left untouched instead of handing nil
# to +replace+, which would raise TypeError.
#
# @param force [Boolean] passed straight through to +tidy_bytes+
# @return [String] self
def tidy_bytes!(force = false)
  tidied = tidy_bytes(force)
  tidied.nil? ? self : replace(tidied)
end