Class: CharDet::MultiByteCharSetProber

Inherits:
CharSetProber show all
Defined in:
lib/rchardet/mbcharsetprober.rb

Instance Attribute Summary

Attributes inherited from CharSetProber

#active

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state

Constructor Details

#initialize ⇒ MultiByteCharSetProber

Returns a new instance of MultiByteCharSetProber.



33
34
35
36
37
38
# File 'lib/rchardet/mbcharsetprober.rb', line 33

# Builds a prober with no distribution analyzer or coding state machine
# attached yet; both instance variables start out as nil.
#
# @lastChar is a two-character carry-over buffer that #feed mutates in place
# (via String#[]=). It is therefore allocated as a mutable copy of the
# literal, so it stays writable even when string literals are frozen
# (frozen_string_literal mode) or "chilled" (Ruby 3.4+).
def initialize
  super
  @distributionAnalyzer = nil
  @codingSM = nil
  @lastChar = "\x00\x00".dup
end

Instance Method Details

#feed(aBuf) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/rchardet/mbcharsetprober.rb', line 54

# Runs a chunk of input through the coding state machine, one element at a
# time, and forwards every completed character to the distribution analyzer.
#
# For each position the state machine's verdict is handled as follows:
# * EError - this prober is ruled out (@state = ENotMe) and scanning stops.
# * EItsMe - the charset is accepted (@state = EFoundIt) and scanning stops.
# * EStart - a character has just been completed; the two-element window
#   ending at the current position is handed to the analyzer together with
#   the character length reported by the state machine.
#
# After scanning, while still in EDetecting, the prober short-circuits to
# EFoundIt once the analyzer reports enough data and the confidence exceeds
# SHORTCUT_THRESHOLD.
#
# @param aBuf [String] the next chunk of input (presumably raw bytes in a
#   binary-encoded string - confirm against the caller). An empty chunk is
#   accepted and leaves the carry-over buffer untouched.
# @return [Object] the prober state as reported by #get_state.
def feed(aBuf)
  aLen = aBuf.length
  aLen.times do |i|
    codingState = @codingSM.next_state(aBuf[i, 1])
    if codingState == EError
      $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
      @state = ENotMe
      break
    elsif codingState == EItsMe
      @state = EFoundIt
      break
    elsif codingState == EStart
      charLen = @codingSM.get_current_charlen
      if i == 0
        # The window would start before this chunk: pair the element carried
        # over from the previous call with the first element of this one.
        @lastChar[1] = aBuf[0, 1]
        @distributionAnalyzer.feed(@lastChar, charLen)
      else
        @distributionAnalyzer.feed(aBuf[i - 1, 2], charLen)
      end
    end
  end

  # Remember the final element for the next call. Skipped for an empty chunk:
  # aBuf[-1, 1] would be nil there and String#[]= raises TypeError on nil.
  @lastChar[0] = aBuf[aLen - 1, 1] if aLen > 0

  if get_state == EDetecting &&
     @distributionAnalyzer.got_enough_data &&
     get_confidence > SHORTCUT_THRESHOLD
    @state = EFoundIt
  end
  get_state
end

#get_charset_name ⇒ Object



51
52
# File 'lib/rchardet/mbcharsetprober.rb', line 51

# Placeholder with no implementation in this class: it always returns nil.
# Presumably concrete probers override it to report their charset name -
# confirm against the subclasses.
def get_charset_name
  nil
end

#get_confidence ⇒ Object



85
86
87
# File 'lib/rchardet/mbcharsetprober.rb', line 85

# Reports this prober's confidence by delegating to the distribution
# analyzer held in @distributionAnalyzer.
#
# @return [Object] whatever the analyzer's #get_confidence returns.
def get_confidence
  @distributionAnalyzer.get_confidence
end

#reset ⇒ Object



40
41
42
43
44
45
46
47
48
49
# File 'lib/rchardet/mbcharsetprober.rb', line 40

# Returns the prober to its initial condition: runs the superclass reset,
# resets the coding state machine and the distribution analyzer when they
# are set, and starts a fresh two-character carry-over buffer.
#
# The buffer is allocated as a mutable copy of the literal because #feed
# writes into it in place; a bare literal would be frozen under
# frozen_string_literal mode and "chilled" on Ruby 3.4+.
def reset
  super
  @codingSM.reset if @codingSM
  @distributionAnalyzer.reset if @distributionAnalyzer
  @lastChar = "\x00\x00".dup
end