Class: CharDet::UTF1632Prober

Inherits:
CharSetProber show all
Defined in:
lib/rchardet/utf1632prober.rb

Instance Attribute Summary

Attributes inherited from CharSetProber

#active

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters

Constructor Details

#initialize ⇒ UTF1632Prober

Returns a new instance of UTF1632Prober.



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/rchardet/utf1632prober.rb', line 34

# Builds a prober with all of its bookkeeping in the starting state.
#
# Every instance variable is assigned here first and then #reset is invoked,
# which assigns the same starting values again.
#
# @return [UTF1632Prober] a new instance of UTF1632Prober
def initialize
  super()
  # Count of bytes consumed so far; #feed advances it by one per byte.
  @position = 0
  # Tallies of zero / non-zero bytes, indexed by (position % 4) in #feed.
  @zeros_at_mod = Array.new(4, 0)
  @nonzeros_at_mod = Array.new(4, 0)
  @state = EDetecting
  # Scratch buffer for the group of four bytes currently being collected.
  @quad = Array.new(4, 0)
  # Per-encoding invalidity flags; presumably raised by the validate_*
  # helpers (not visible here) -- nothing is ruled out at the start.
  @invalid_utf16be, @invalid_utf16le = false, false
  @invalid_utf32be, @invalid_utf32le = false, false
  # Surrogate-pair tracking for each UTF-16 byte order starts cleared.
  @first_half_surrogate_pair_detected_16be = false
  @first_half_surrogate_pair_detected_16le = false
  reset
end

Instance Method Details

#feed(aBuf) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/rchardet/utf1632prober.rb', line 82

# Consumes a chunk of input one byte at a time and updates the statistics
# used to decide between UTF-16 and UTF-32.
#
# Bytes are collected into @quad in groups of four. Each time a group is
# complete, the whole group is checked as one UTF-32 unit and its two
# halves are checked as two separate UTF-16 units. Independently of that,
# every byte is tallied as zero or non-zero under its position modulo 4.
#
# @param aBuf [String] the next chunk of data (anything responding to
#   #each_byte works)
# @return [Object] whatever #get_state returns after the chunk is consumed
def feed(aBuf)
  aBuf.each_byte do |byte|
    slot = @position % 4
    @quad[slot] = byte
    if slot == 3
      validate_utf32_characters(@quad)
      # Use [start, length] slices so each call receives exactly one
      # two-byte unit. The previous inclusive range 0..2 handed over three
      # bytes (the first unit plus the leading byte of the second one).
      validate_utf16_characters(@quad[0, 2])
      validate_utf16_characters(@quad[2, 2])
    end
    if byte.zero?
      @zeros_at_mod[slot] += 1
    else
      @nonzeros_at_mod[slot] += 1
    end
    @position += 1
  end

  get_state
end

#get_charset_name ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/rchardet/utf1632prober.rb', line 65

# Names the encoding this prober currently considers most plausible.
#
# The is_likely_* checks are consulted in a fixed order -- UTF-32BE,
# UTF-32LE, UTF-16BE, UTF-16LE -- and the first one that answers truthy wins.
#
# @return [String] one of "UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE",
#   or "UTF-16" when none of the checks match
def get_charset_name
  return "UTF-32BE" if is_likely_utf32be
  return "UTF-32LE" if is_likely_utf32le
  return "UTF-16BE" if is_likely_utf16be
  return "UTF-16LE" if is_likely_utf16le

  # Nothing matched: still answer with a valid charset name.
  "UTF-16"
end

#get_confidence ⇒ Object



117
118
119
120
121
122
123
# File 'lib/rchardet/utf1632prober.rb', line 117

# Reports how confident the prober is that the input is UTF-16 or UTF-32.
#
# The answer is binary: a fixed 0.85 as soon as any one of the four
# is_likely_* checks is truthy, and 0.00 otherwise.
#
# @return [Float] 0.85 or 0.00
def get_confidence
  any_match = is_likely_utf16le || is_likely_utf16be || is_likely_utf32le || is_likely_utf32be
  any_match ? 0.85 : 0.00
end

#get_state ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/rchardet/utf1632prober.rb', line 102

# Returns the prober's detection state, updating it first when possible.
#
# ENotMe and EFoundIt are final: once @state holds either of them it is
# returned untouched. Otherwise the state becomes EFoundIt when
# #get_confidence exceeds 0.80, or ENotMe when more than 4 * 1024 bytes
# have been consumed without reaching that confidence.
#
# @return [Object] the (possibly updated) value of @state
def get_state
  # Already decided -- nothing left to re-evaluate.
  return @state if [ENotMe, EFoundIt].include?(@state)

  give_up_after = 4 * 1024
  if get_confidence > 0.80
    @state = EFoundIt
  elsif @position > give_up_after
    # Past 4kb of input with no UTF-16/32 conclusion: stop trying.
    @state = ENotMe
  end
  @state
end

#reset ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/rchardet/utf1632prober.rb', line 50

# Puts the prober back into its starting condition so it can be reused.
#
# Runs the parent class's reset, then zeroes the byte position and the
# zero / non-zero tallies, sets the state to EDetecting, clears every
# invalidity and surrogate-pair flag, and blanks the four-byte buffer.
#
# @return [Array<Integer>] the fresh @quad buffer (value of the last
#   assignment)
def reset
  super()
  @position = 0
  # Fresh arrays each time so the two tallies never share storage.
  @zeros_at_mod = Array.new(4, 0)
  @nonzeros_at_mod = Array.new(4, 0)
  @state = EDetecting
  # No encoding is ruled out at the start.
  @invalid_utf16be, @invalid_utf16le = false, false
  @invalid_utf32be, @invalid_utf32le = false, false
  # No pending first-half surrogate for either UTF-16 byte order.
  @first_half_surrogate_pair_detected_16be = false
  @first_half_surrogate_pair_detected_16le = false
  @quad = Array.new(4, 0)
end