Class: CharDet::UTF8Prober

Inherits:
CharSetProber show all
Defined in:
lib/rchardet/utf8prober.rb

Instance Attribute Summary

Attributes inherited from CharSetProber

#active

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state

Constructor Details

#initialize ⇒ UTF8Prober

Returns a new instance of UTF8Prober.



33
34
35
36
37
# File 'lib/rchardet/utf8prober.rb', line 33

# Sets up a new prober: runs the parent constructor, builds the coding
# state machine from UTF8SMModel, and then resets the prober.
def initialize
  super()
  @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
  # The state machine has to exist before this call, because reset
  # calls reset on it.
  reset
end

Instance Method Details

#feed(aBuf) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/rchardet/utf8prober.rb', line 49

# Feeds a chunk of input through the coding state machine, one element of
# aBuf.split('') at a time, and updates the prober state.
#
# * An EError result sets the state to ENotMe and stops consuming input.
# * An EItsMe result sets the state to EFoundIt and stops consuming input.
# * An EStart result increments the multi-byte character counter when the
#   state machine reports a current character length of at least 2.
#
# After the loop, a prober whose state is still EDetecting is switched to
# EFoundIt if get_confidence exceeds SHORTCUT_THRESHOLD.
#
# @param aBuf the input to examine; it must respond to split('')
# @return the value of get_state after processing
def feed(aBuf)
  aBuf.split('').each do |char|
    state = @_mCodingSM.next_state(char)

    if state == EError
      @_mState = ENotMe
      break
    end

    if state == EItsMe
      @_mState = EFoundIt
      break
    end

    # Any result other than EStart needs no bookkeeping here.
    next unless state == EStart

    @_mNumOfMBChar += 1 if @_mCodingSM.get_current_charlen >= 2
  end

  # Confidence is only consulted while the prober is still undecided.
  @_mState = EFoundIt if get_state == EDetecting && get_confidence > SHORTCUT_THRESHOLD

  get_state
end

#get_charset_name ⇒ Object



45
46
47
# File 'lib/rchardet/utf8prober.rb', line 45

# Returns the charset name reported by this prober.
#
# @return [String] always "utf-8"
def get_charset_name
  "utf-8"
end

#get_confidence ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
# File 'lib/rchardet/utf8prober.rb', line 74

# Computes the confidence value from the multi-byte character counter.
#
# When the counter is not below 6 the fixed value 0.99 is returned.
# Otherwise 0.99 is multiplied by ONE_CHAR_PROB once per counted
# character and the result is subtracted from 1.0.
#
# @return [Float] the confidence value
def get_confidence
  return 0.99 unless @_mNumOfMBChar < 6

  remaining = 0.99
  # Multiply step by step (rather than using **) to keep the exact same
  # floating-point rounding sequence.
  @_mNumOfMBChar.times { remaining *= ONE_CHAR_PROB }
  1.0 - remaining
end

#reset ⇒ Object



39
40
41
42
43
# File 'lib/rchardet/utf8prober.rb', line 39

# Returns the prober to its initial condition: runs the parent reset,
# resets the coding state machine, and zeroes the multi-byte character
# counter.
def reset
  super()
  @_mCodingSM.reset
  # Kept as the last expression so the method's return value is unchanged.
  @_mNumOfMBChar = 0
end