Module: Unibits

Defined in:
lib/unibits.rb,
lib/unibits/version.rb

Constant Summary

# Names of every encoding known to the running Ruby whose name matches one of
# the patterns below, sorted and frozen.
SUPPORTED_ENCODINGS =
  Encoding.name_list.select { |encoding_name|
    [
      /^UTF-8$/,
      /^UTF8-/,
      /^UTF-...E$/,
      /^ASCII-8BIT$/,
      /^US-ASCII$/,
      /^ISO-8859-/,
      /^Windows-125/,
      /^IBM/,
      /^CP85/,
      /^mac/,
      /^TIS-620$/,
      /^Windows-874$/,
      /^KOI8/,
      /^GB1988$/,
    ].any? { |pattern| pattern =~ encoding_name }
  }.sort.freeze
# Fixed colors (hex strings) keyed by character class. determine_char_color
# looks these up; characters that match no class get a random color instead.
# Frozen so the shared table cannot be modified by accident, like
# SUPPORTED_ENCODINGS.
COLORS =
{
  invalid: "#FF0000",
  control: "#0000FF",
  blank: "#33AADD",
  format: "#FF00FF",
  mark: "#228822",
  unassigned: "#FF5500",
  ignorable: "#FFAA00",
}.freeze
# Column count used by determine_terminal_cols when the terminal size cannot
# be determined.
DEFAULT_TERMINAL_WIDTH = 80

# Version string of this library (frozen so the constant cannot be mutated).
VERSION = "2.8.0".freeze

# NOTE(review): presumably the Unicode version the library's character data
# targets -- confirm against the gem's dependencies.
UNICODE_VERSION = "12.0.0".freeze

Class Method Summary

Class Method Details

.convert_to_encoding_or_raise(string, encoding, convert) ⇒ Object

Raises:

  • (ArgumentError)


48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/unibits.rb', line 48

# Validates the input and applies the requested encoding options.
#
# @param string [String, nil] the data to inspect
# @param encoding [String, Encoding, nil] when given, a copy of the string is
#   re-labelled with this encoding (the bytes are not transcoded)
# @param convert [String, Encoding, nil] when given, the string is transcoded
#   to this encoding afterwards
# @return [String] the resulting string, whose encoding name is listed in
#   SUPPORTED_ENCODINGS
# @raise [ArgumentError] if the string is nil or empty, or if the resulting
#   encoding is not supported
def self.convert_to_encoding_or_raise(string, encoding, convert)
  raise ArgumentError, "no data given to unibits" if !string || string.empty?

  # dup first so the caller's string keeps its original encoding label
  string = string.dup.force_encoding(encoding) if encoding
  string = string.encode(convert) if convert

  encoding_name = string.encoding.name
  case encoding_name
  when *SUPPORTED_ENCODINGS
    string
  when 'UTF-16', 'UTF-32'
    # these names carry no byte order, so point the user at the LE/BE variants
    raise ArgumentError, "unibits only supports #{encoding_name} with specified endianness, please use #{encoding_name}LE or #{encoding_name}BE"
  else
    raise ArgumentError, "unibits does not support strings of encoding #{string.encoding}"
  end
end

.determine_char_color(char_info) ⇒ Object



254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# File 'lib/unibits.rb', line 254

# Chooses the display color for a character.
#
# The checks run in a fixed order and the first match wins: invalid,
# unassigned (ignorable or not), blank, control, format, Unicode mark.
# Anything else gets a random color.
#
# @param char_info [#valid?, #assigned?, #unicode?, #ignorable?, #blank?,
#   #control?, #format?, #category] description of the character
# @return [String] one of the COLORS values, or the result of random_color
def self.determine_char_color(char_info)
  return COLORS[:invalid] unless char_info.valid?

  unless char_info.assigned?
    ignorable = char_info.unicode? && char_info.ignorable?
    return COLORS[ignorable ? :ignorable : :unassigned]
  end

  return COLORS[:blank]   if char_info.blank?
  return COLORS[:control] if char_info.control?
  return COLORS[:format]  if char_info.format?

  # general categories starting with "M" get the mark color
  return COLORS[:mark] if char_info.unicode? && char_info.category[0] == "M"

  random_color
end

.determine_terminal_cols ⇒ Object



248
249
250
251
252
# File 'lib/unibits.rb', line 248

# Determines how many columns are available for output.
#
# Asks STDIN for its window size and falls back to DEFAULT_TERMINAL_WIDTH
# when STDIN is not a terminal, or when no usable (positive) column count is
# reported. A width of 0 would otherwise make visualize start a new row for
# every single byte.
#
# @return [Integer] the number of columns to use
def self.determine_terminal_cols
  cols = STDIN.winsize[1]
  (cols && cols > 0) ? cols : DEFAULT_TERMINAL_WIDTH
rescue Errno::ENOTTY
  DEFAULT_TERMINAL_WIDTH
end

.double_check_utf32_validness!(char, char_info) ⇒ Object



337
338
339
340
341
342
343
344
345
# File 'lib/unibits.rb', line 337

# Re-checks a UTF-32 character by inspecting its raw bytes and marks
# char_info as invalid if the code unit is out of range or a surrogate.
# visualize calls this only for Ruby versions it considers affected by a
# UTF-32 validity bug.
#
# A code unit is rejected when it is larger than U+10FFFF or lies in the
# surrogate range U+D800..U+DFFF. The surrogate test requires the plane byte
# to be zero; otherwise valid supplementary code points such as U+1D800
# would be rejected as well.
#
# @param char [String] a single UTF-32LE or UTF-32BE character (4 bytes)
# @param char_info [Object] character description; its @is_valid instance
#   variable is set to false when the check fails
# @return [false, nil] false if the character was marked invalid, else nil
def self.double_check_utf32_validness!(char, char_info)
  byte_values = char.b.unpack("C*")
  le = char_info.encoding.name == 'UTF-32LE'

  top_byte   = byte_values[le ? 3 : 0] # bits 24..31
  plane_byte = byte_values[le ? 2 : 1] # bits 16..23
  high_byte  = byte_values[le ? 1 : 2] # bits 8..15

  too_large = plane_byte > 16 || top_byte > 0
  surrogate = plane_byte == 0 && high_byte >= 0xD8 && high_byte <= 0xDF

  char_info.instance_variable_set(:@is_valid, false) if too_large || surrogate
end

.highlight_bits(byte, char, char_info, current_color, byteindex) ⇒ Object



280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# File 'lib/unibits.rb', line 280

# Formats one byte as eight painted binary digits.
#
# The digits are split into a leading part, painted plainly, and a trailing
# part, painted underlined. Where the split falls depends on the encoding:
# for invalid characters the whole byte is plain; for US-ASCII the first bit
# is plain; for UTF-8 the length/continuation marker bits are plain; for
# UTF-16 surrogate pairs the 110110/110111 marker of the high-order bytes is
# plain; everywhere else the whole byte is underlined.
#
# @param byte [Integer] the byte value
# @param char [String] the character the byte belongs to
# @param char_info [#valid?, #encoding] description of the character
# @param current_color [Object] color passed through to Paint
# @param byteindex [Integer] position of the byte within the character
# @return [String] the painted binary representation
def self.highlight_bits(byte, char, char_info, current_color, byteindex)
  bits = byte.to_s(2).rjust(8, "0")

  structure_bits, payload_bits =
    if char_info.valid?
      case (name = char_info.encoding.name)
      when 'US-ASCII'
        [bits[0...1], bits[1...8]]
      when 'UTF-8', /^UTF8/
        if byteindex != 0
          # continuation byte: 10xxxxxx
          [bits[0...2], bits[2...8]]
        elsif (lead = /^(0|1{2,4}0)([01]+)$/.match(bits))
          [lead[1], lead[2]]
        else
          ["", bits]
        end
      when 'UTF-16LE', 'UTF-16BE'
        # byte positions that never carry a surrogate marker
        plain_positions = name == 'UTF-16LE' ? [0, 2] : [1, 3]
        if char.ord <= 0xFFFF || plain_positions.include?(byteindex)
          ["", bits]
        else
          marker = /^(11011[01])([01]+)$/.match(bits)
          marker ? [marker[1], marker[2]] : [nil, nil]
        end
      else
        ["", bits]
      end
    else
      [bits, ""]
    end

  painted = ""
  painted << Paint[ structure_bits, current_color ] if structure_bits && !structure_bits.empty?
  painted << Paint[ payload_bits, current_color, :underline ] if payload_bits && !payload_bits.empty?
  painted
end

.of(string, encoding: nil, convert: nil, stats: true, wide_ambiguous: false, width: nil) ⇒ Object



41
42
43
44
45
46
# File 'lib/unibits.rb', line 41

# Prints the visualization of a string to standard output, optionally
# preceded by a statistics line.
#
# @param string [String] the data to inspect
# @param encoding [String, Encoding, nil] forwarded to convert_to_encoding_or_raise
# @param convert [String, Encoding, nil] forwarded to convert_to_encoding_or_raise
# @param stats [Boolean] whether to print the statistics line first
# @param wide_ambiguous [Boolean] forwarded to stats and visualize
# @param width [Integer, nil] forwarded to visualize
# @return [nil]
# @raise [ArgumentError] if convert_to_encoding_or_raise rejects the input
def self.of(string, encoding: nil, convert: nil, stats: true, wide_ambiguous: false, width: nil)
  prepared = convert_to_encoding_or_raise(string, encoding, convert)

  # `stats` without arguments is the keyword flag; with arguments it is the method
  puts(stats(prepared, wide_ambiguous: wide_ambiguous)) if stats
  puts(visualize(prepared, wide_ambiguous: wide_ambiguous, width: width))
end

.random_color ⇒ Object



276
277
278
# File 'lib/unibits.rb', line 276

# Produces a random color as six lowercase hex digits without a leading "#".
# Each of the three channels is drawn from 60..149.
#
# @return [String] the color, e.g. "5a7c3e"
def self.random_color
  channels = Array.new(3) { rand(90) + 60 }
  format("%.2x%.2x%.2x", *channels)
end

.stats(string, wide_ambiguous: false) ⇒ Object



64
65
66
67
68
69
70
71
72
# File 'lib/unibits.rb', line 64

# Builds the one-line summary shown above the visualization: the encoding
# name (prefixed with "Invalid " for invalid strings) followed by
# (bytes/codepoints/glyphs/display width). Any figure whose computation
# raises a StandardError is shown as "?".
#
# @param string [String] the data to summarize
# @param wide_ambiguous [Boolean] pass 2 instead of 1 as the second argument
#   to Unicode::DisplayWidth.of
# @return [String] the summary, starting with a newline and two spaces
def self.stats(string, wide_ambiguous: false)
  placeholder = "?"
  well_formed = string.valid_encoding?

  byte_count = begin
    string.bytesize
  rescue StandardError
    placeholder
  end

  codepoint_count = begin
    string.size
  rescue StandardError
    placeholder
  end

  glyph_count = begin
    # \X matches one extended grapheme cluster
    grapheme_pattern = Regexp.compile('\X'.encode(string.encoding))
    string.scan(grapheme_pattern).size
  rescue StandardError
    placeholder
  end

  display_width = begin
    Unicode::DisplayWidth.of(string, wide_ambiguous ? 2 : 1)
  rescue StandardError
    placeholder
  end

  invalid_marker = well_formed ? '' : Paint["Invalid ", :bold, :red]
  encoding_label = Paint[string.encoding.name, :bold]
  figures = "#{byte_count}/#{codepoint_count}/#{glyph_count}/#{display_width}"

  "\n  #{invalid_marker}#{encoding_label} (#{figures})"
end

.visualize(string, wide_ambiguous: false, width: nil) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/unibits.rb', line 74

# Renders a row-based, byte-by-byte visualization of +string+.
#
# Every byte occupies a 10-column cell. For each visual line the result
# contains a row with the symbolified characters, a row with codepoint labels
# (only included for Unicode encodings), a row with hex values, a row with
# binary values and an indent-only separator row.
#
# Side effect: prints one empty line to standard output before rendering.
#
# @param string [String] the data to visualize
# @param wide_ambiguous [Boolean] pass 2 instead of 1 as the second argument
#   to Unicode::DisplayWidth.of when padding the character row
# @param width [Integer, nil] number of columns available; when nil the
#   result of determine_terminal_cols is used
# @return [String] all rows joined by newlines
def self.visualize(string, wide_ambiguous: false, width: nil)
  cols = width || determine_terminal_cols
  encoding_name = string.encoding.name

  type = Characteristics.type_from_encoding_name(encoding_name)

  # One array per kind of row; each element is one visual line, starting with
  # a two-space indent. A new element is appended whenever a line is full.
  cp_buffer  = ["  "]
  enc_buffer = ["  "]
  hex_buffer = ["  "]
  bin_buffer = ["  "]
  separator  = ["  "]
  # State carried across invalid UTF-8 "characters" so that a run of lead and
  # continuation bytes can be classified together. When it describes missing
  # continuation bytes it has the shape
  #   [:nec, continuation bytes still expected, total expected, hint]
  # where hint is nil, :maybe_surrogate, :surrogate, :leading_at_max or
  # :too_large.
  current_encoding_error = nil

  puts
  string.each_char{ |char|
    char_info = Characteristics.create_for_type(char, type)

    # NOTE(review): RUBY_VERSION is compared as a String, i.e.
    # lexicographically -- confirm this is acceptable for the Ruby versions
    # that need to be supported.
    if  RUBY_VERSION >= "2.4.1" ||
        RUBY_VERSION < "2.4.0" && RUBY_VERSION >= "2.3.4" ||
        RUBY_VERSION < "2.3.0" && RUBY_VERSION >= "2.2.7" ||
        char_info.encoding.name[0, 6] != "UTF-32" ||
        !char_info.valid?
      # bug is fixed or not relevant
    else
      # Ruby reported a UTF-32 character as valid; verify it by hand
      double_check_utf32_validness!(char, char_info)
    end

    current_color = determine_char_color(char_info)

    # a valid character ends any pending UTF-8 error run
    current_encoding_error = nil if char_info.valid?

    char.each_byte.with_index{ |byte, byteindex|
      # Start a new visual line once the current hex row (with color codes
      # stripped) is longer than cols - 12 bytes.
      if Paint.unpaint(hex_buffer[-1]).bytesize > cols - 12
        cp_buffer  << "  "
        enc_buffer << "  "
        hex_buffer << "  "
        bin_buffer << "  "
        separator  << "  "
      end

      # The codepoint label and the character itself are only written for the
      # first byte of a character; the remaining bytes get blank cells.
      if byteindex == 0
        if char_info.valid?
          codepoint = "U+%04X" % char.ord
        else
          # Invalid data: pick a short label describing the problem. The
          # labels are abbreviations; their readings given below are
          # presumed from context -- TODO confirm.
          case encoding_name
          when "US-ASCII"
            codepoint = "invalid"
          when "UTF-8", /^UTF8/
            # this tries to detect what is wrong with this utf-8 encoded string
            # sorry for this mess
            case char.unpack("B*")[0]
            when /^110.{5}$/
              # lead byte of a 2-byte sequence: 1 continuation byte expected
              current_encoding_error = [:nec, 1, 1]
              codepoint = "n.e.con."
            when /^1110(.{4})$/
              # lead byte of a 3-byte sequence: 2 continuation bytes expected;
              # 0xED (payload 1101) may start an encoded surrogate
              if $1 == "1101"
                current_encoding_error = [:nec, 2, 2, :maybe_surrogate]
              else
                current_encoding_error = [:nec, 2, 2]
              end
              codepoint = "n.e.con."
            when /^11110(.{3})$/
              # lead byte of a 4-byte sequence: 3 continuation bytes expected
              case $1
              when "100"
                current_encoding_error = [:nec, 3, 3, :leading_at_max]
              when "101", "110", "111"
                current_encoding_error = [:nec, 3, 3, :too_large]
              else
                current_encoding_error = [:nec, 3, 3]
              end
              codepoint = "n.e.con."
            when /^11111.{3}$/
              codepoint = "toolarge"
            when /^10(.{2}).{4}$/
              # continuation byte
              # uglyhack to fixup that it is not n.e.c, but something different
              if current_encoding_error && current_encoding_error[0] == :nec
                # refine the hint using the two bits after the 10 marker
                if current_encoding_error[3] == :leading_at_max
                  if $1 != "00"
                    current_encoding_error[3] = :too_large
                  else
                    current_encoding_error[3] = nil
                  end
                elsif current_encoding_error[3] == :maybe_surrogate
                  if $1[0] == "1"
                    current_encoding_error[3] = :surrogate
                  else
                    current_encoding_error[3] = nil
                  end
                end

                if current_encoding_error[1] > 1
                  # more continuation bytes are still expected
                  current_encoding_error[1] -= 1
                  codepoint = "n.e.con."
                else
                  # All expected continuation bytes are present, so the
                  # sequence is complete but was still rejected: relabel it.
                  case current_encoding_error[3]
                  when :too_large
                    actual_error = "toolarge"
                  when :surrogate
                    actual_error = "sur.gate"
                  else
                    actual_error = "overlong"
                  end
                  # Rewrite the "n.e.con." labels already written for the
                  # earlier bytes of this sequence, searching the current
                  # line first and then the previous one.
                  # NOTE(review): the fallback assumes the previous line
                  # contains the label (index would be nil otherwise) --
                  # confirm.
                  current_cp_buffer_index = -1
                  (current_encoding_error[2]).times{
                    if index = cp_buffer[current_cp_buffer_index].rindex("n.e.con.")
                      cp_buffer[current_cp_buffer_index][index..-1] = cp_buffer[current_cp_buffer_index][index..-1].sub("n.e.con.", actual_error)
                    else
                      current_cp_buffer_index -= 1
                      index = cp_buffer[current_cp_buffer_index].rindex("n.e.con.")
                      cp_buffer[current_cp_buffer_index][index..-1] = cp_buffer[current_cp_buffer_index][index..-1].sub("n.e.con.", actual_error)
                    end
                    current_encoding_error = [:overlong]
                    codepoint = actual_error
                  }
                end
              else
                # continuation byte without a pending lead byte
                current_encoding_error = [:unexp]
                codepoint = "unexp.c."
              end
            else
              # NOTE(review): the symbol is spelled :invallid; within this
              # method the first element is only ever compared against :nec.
              current_encoding_error = [:invallid]
              codepoint = "invalid"
            end
          when 'UTF-16LE', 'UTF-16BE'
            if char.bytesize.odd?
              codepoint = "incompl."
            elsif char.b[encoding_name == 'UTF-16LE' ? 1 : 0].unpack("B*")[0][0, 5] == "11011"
              # high-order byte starts with the surrogate marker bits
              codepoint = "hlf.srg."
            else
              codepoint = "invalid"
            end
          when 'UTF-32LE', 'UTF-32BE'
            if char.bytesize % 4 != 0
              codepoint = "incompl."
            elsif char.b.unpack("C*")[encoding_name == 'UTF-32LE' ? 2 : 1] > 16 ||
                  char.b.unpack("C*")[encoding_name == 'UTF-32LE' ? 3 : 0] > 0
              codepoint = "toolarge"
            else
              codepoint = "sur.gate"
            end
          end
        end

        cp_buffer[-1] << Paint[ codepoint.ljust(10), current_color, :bold ]

        symbolified_char = Symbolify.symbolify(char, char_info)

        # pad the character cell to 10 columns based on its display width
        if char_info.unicode?
          padding = 10 - Unicode::DisplayWidth.of(symbolified_char, wide_ambiguous ? 2 : 1)
        else
          padding = 10 - symbolified_char.size
        end

        enc_buffer[-1] << Paint[ symbolified_char, current_color ]
        enc_buffer[-1] << " " * padding if padding > 0
      else
        cp_buffer[-1]  << " " * 10
        enc_buffer[-1] << " " * 10
      end

      hex_buffer[-1] << Paint[ ("%02X" % byte).ljust(10, " "), current_color ]

      # 8 binary digits plus 2 spaces fill the 10-column cell
      bin_buffer[-1] << highlight_bits(byte, char, char_info, current_color, byteindex)
      bin_buffer[-1] << "  "
    }
  }

  # Interleave the rows line by line; the codepoint row is only included for
  # Unicode encodings.
  if type == :unicode
    enc_buffer.zip(cp_buffer, hex_buffer, bin_buffer, separator).flatten.join("\n")
  else
    enc_buffer.zip(hex_buffer, bin_buffer, separator).flatten.join("\n")
  end
end