Class: GIGO::CharDet::MultiByteCharSetProber
Instance Attribute Summary
#active
Instance Method Summary
collapse
#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state
Constructor Details
Returns a new instance of MultiByteCharSetProber.
34
35
36
37
38
39
|
# File 'lib/gigo/rchardet/mbcharsetprober.rb', line 34
# Sets up a prober with no coding state machine and no distribution
# analyzer attached, plus a two-byte carry-over buffer filled with NULs.
#
# NOTE(review): both collaborators start out nil here; presumably a
# subclass assigns them before #feed is called — confirm.
def initialize
  super
  @_mDistributionAnalyzer = nil
  @_mCodingSM = nil
  # #feed overwrites this buffer in place with []=, so it has to be an
  # explicitly mutable String rather than a bare literal (a literal is
  # frozen under `frozen_string_literal: true` and chilled on Ruby 3.4+).
  @_mLastChar = "\x00\x00".dup
end
|
Instance Method Details
#feed(aBuf) ⇒ Object
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
# File 'lib/gigo/rchardet/mbcharsetprober.rb', line 55
# Runs a chunk of input through the coding state machine and the
# distribution analyzer, then returns the resulting prober state.
#
# Each one-element slice of aBuf is handed to @_mCodingSM.next_state:
# * EError -> state becomes ENotMe and scanning stops
# * EItsMe -> state becomes EFoundIt and scanning stops
# * EStart -> a two-element slice ending at the current position is
#   passed to @_mDistributionAnalyzer.feed together with the length
#   reported by get_current_charlen. At position 0 the slice is built
#   from the element carried over in @_mLastChar from the previous call.
#
# An empty aBuf is accepted: nothing is scanned and the carried-over
# element is left untouched. (The trailing update used to receive nil
# from the slice and raise TypeError.)
#
# @param aBuf [String] chunk to examine. NOTE(review): positions follow
#   String#[] semantics, so this presumably expects a binary string
#   where one element is one byte — confirm against the caller.
# @return [Object] whatever get_state reports after processing
def feed(aBuf)
  aLen = aBuf.length
  aLen.times do |i|
    codingState = @_mCodingSM.next_state(aBuf[i, 1])
    if codingState == EError
      $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
      @_mState = ENotMe
      break
    elsif codingState == EItsMe
      @_mState = EFoundIt
      break
    elsif codingState == EStart
      charLen = @_mCodingSM.get_current_charlen
      if i.zero?
        # The pair straddles the chunk boundary: complete it with the
        # element remembered from the end of the previous chunk.
        @_mLastChar[1] = aBuf[0, 1]
        @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
      else
        @_mDistributionAnalyzer.feed(aBuf[i - 1, 2], charLen)
      end
    end
  end

  # Remember the final element for the next call; skipped for an empty
  # chunk because there is nothing to remember.
  @_mLastChar[0] = aBuf[aLen - 1, 1] unless aLen.zero?

  # Shortcut: stop detecting once the analyzer has enough data and the
  # confidence exceeds SHORTCUT_THRESHOLD.
  if get_state() == EDetecting &&
     @_mDistributionAnalyzer.got_enough_data &&
     get_confidence() > SHORTCUT_THRESHOLD
    @_mState = EFoundIt
  end
  get_state()
end
|
#get_charset_name ⇒ Object
52
53
|
# File 'lib/gigo/rchardet/mbcharsetprober.rb', line 52
# Name of the charset this prober recognises.
#
# This implementation has no name to report and returns nil.
# NOTE(review): presumably concrete probers override this — confirm.
# #feed interpolates the value into its debug message.
#
# @return [nil]
def get_charset_name
  nil
end
|
#get_confidence ⇒ Object
86
87
88
|
# File 'lib/gigo/rchardet/mbcharsetprober.rb', line 86
# Confidence for this prober, taken directly from the distribution
# analyzer.
#
# @return [Object] whatever @_mDistributionAnalyzer.get_confidence returns
def get_confidence
  @_mDistributionAnalyzer.get_confidence
end
|
#reset ⇒ Object
41
42
43
44
45
46
47
48
49
50
|
# File 'lib/gigo/rchardet/mbcharsetprober.rb', line 41
# Returns the prober to its initial condition: runs the inherited reset,
# resets the coding state machine and the distribution analyzer when
# they are present, and clears the two-byte carry-over buffer.
def reset
  super
  @_mCodingSM.reset if @_mCodingSM
  @_mDistributionAnalyzer.reset if @_mDistributionAnalyzer
  # #feed overwrites this buffer in place with []=, so it has to be an
  # explicitly mutable String rather than a bare literal (a literal is
  # frozen under `frozen_string_literal: true` and chilled on Ruby 3.4+).
  @_mLastChar = "\x00\x00".dup
end
|