Class: CharDet::SingleByteCharSetProber

Inherits:
CharSetProber show all
Defined in:
lib/rchardet/sbcharsetprober.rb

Overview

NEGATIVE_CAT = 0

Instance Attribute Summary

Attributes inherited from CharSetProber

#active

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state

Constructor Details

#initialize(model, reversed = false, nameProber = nil) ⇒ SingleByteCharSetProber

Returns a new instance of SingleByteCharSetProber.



41
42
43
44
45
46
47
# File 'lib/rchardet/sbcharsetprober.rb', line 41

# Creates a prober bound to one language model.
#
# @param model the language model; other methods of this class index it
#   with string keys such as 'charsetName' and 'charToOrderMap'
# @param reversed [Boolean] when true, every character pair is swapped
#   before it is looked up in the model
# @param nameProber optional auxiliary prober that is asked for the
#   charset name instead of the model
def initialize(model, reversed = false, nameProber = nil)
  # Explicit empty parentheses: the parent constructor is called with no arguments.
  super()
  @nameProber = nameProber
  @reversed = reversed
  @model = model
  # Start from zeroed statistics.
  reset
end

Instance Method Details

#feed(aBuf) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/rchardet/sbcharsetprober.rb', line 66

# Feeds a chunk of bytes to the prober and updates its running statistics.
#
# Unless the model's 'keepEnglishLetter' entry is truthy, the buffer is first
# passed through filter_without_english_letters. Every remaining byte is then
# mapped to an "order" through the model's 'charToOrderMap', and the
# character and sequence counters are updated. Once more than
# SB_ENOUGH_REL_THRESHOLD sequences have been seen while the prober is still
# in the EDetecting state, the confidence is compared against the shortcut
# thresholds and the state may switch to EFoundIt or ENotMe.
#
# @param aBuf bytes to analyse; must respond to +length+ and +each_byte+
#   (presumably a String -- confirm against callers)
# @return the value of get_state after the chunk has been processed
def feed(aBuf)
  aBuf = filter_without_english_letters(aBuf) unless @model['keepEnglishLetter']
  return get_state if aBuf.length == 0

  # These tables do not change while the chunk is processed, so fetch them once.
  order_map = @model['charToOrderMap']
  precedence = @model['precedenceMatrix']

  aBuf.each_byte do |byte|
    # each_byte already yields the Integer byte value, which is the map index;
    # no intermediate one-character String is needed.
    order = order_map[byte]
    @totalChar += 1 if order < SYMBOL_CAT_ORDER

    if order < SAMPLE_SIZE
      @freqChar += 1
      if @lastOrder < SAMPLE_SIZE
        @totalSeqs += 1
        # A reversed model is looked up with the pair of letters swapped.
        pair_index = if @reversed
                       (order * SAMPLE_SIZE) + @lastOrder
                     else
                       (@lastOrder * SAMPLE_SIZE) + order
                     end
        @seqCounters[precedence[pair_index]] += 1
      end
    end

    @lastOrder = order
  end

  # Only take a shortcut decision once enough sequences have been observed.
  if get_state == EDetecting && @totalSeqs > SB_ENOUGH_REL_THRESHOLD
    cf = get_confidence
    if cf > POSITIVE_SHORTCUT_THRESHOLD
      $stderr << "#{@model['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
      @state = EFoundIt
    elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
      $stderr << "#{@model['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
      @state = ENotMe
    end
  end

  get_state
end

#get_charset_name ⇒ Object



58
59
60
61
62
63
64
# File 'lib/rchardet/sbcharsetprober.rb', line 58

# Reports the charset name associated with this prober.
#
# @return the name given by the auxiliary name prober when one was supplied,
#   otherwise the model's 'charsetName' entry
def get_charset_name
  return @model['charsetName'] unless @nameProber

  @nameProber.get_charset_name
end

#get_confidence ⇒ Object



110
111
112
113
114
115
116
117
118
119
120
# File 'lib/rchardet/sbcharsetprober.rb', line 110

# Computes the confidence from the statistics gathered so far.
#
# With no sequences counted yet the result is the fixed value 0.01.
# Otherwise the share of POSITIVE_CAT sequences is divided by the model's
# 'mTypicalPositiveRatio', scaled by @freqChar / @totalChar, and any result
# of 1.0 or more is reported as 0.99.
#
# @return [Float] the confidence value
def get_confidence
  return 0.01 unless @totalSeqs > 0

  positive_share = (1.0 * @seqCounters[POSITIVE_CAT]) / @totalSeqs
  confidence = positive_share / @model['mTypicalPositiveRatio']
  confidence = confidence * @freqChar / @totalChar
  confidence >= 1.0 ? 0.99 : confidence
end

#reset ⇒ Object



49
50
51
52
53
54
55
56
# File 'lib/rchardet/sbcharsetprober.rb', line 49

# Returns the prober to its initial state: runs the parent reset, then
# clears every counter kept by this class.
def reset
  super()
  # Sequence, character and in-sample character counts all start at zero.
  @totalSeqs = @totalChar = @freqChar = 0
  # One counter per sequence category.
  @seqCounters = Array.new(NUMBER_OF_SEQ_CAT, 0)
  # Order of the previously seen character; 255 is presumably outside the
  # sampled range so the first character forms no sequence -- confirm
  # against SAMPLE_SIZE.
  @lastOrder = 255
end