Method: UniversalDetector::SingleByteCharSetProber#feed

Defined in:
lib/SingleByteCharSetProber.rb

#feed(aBuf) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/SingleByteCharSetProber.rb', line 68

# Feeds a chunk of data to the prober, updates its character and sequence
# statistics, and re-evaluates the detection state.
#
# For every byte of the (possibly filtered) buffer the method looks up the
# byte's "order" in @_mModel['charToOrderMap'] and updates @_mTotalChar,
# @_mFreqChar, @_mTotalSeqs and @_mSeqCounters. Once more than
# SB_ENOUGH_REL_THRESHOLD sequences were seen while the state is :Detecting,
# the confidence decides whether the state becomes :FoundIt or :NotMe.
#
# @param aBuf [String, #each] data to analyse. A String is walked byte by
#   byte; any other object is walked with #each and is expected to yield
#   integer byte values (assumption: charToOrderMap is indexed by byte value).
# @return [Object] whatever get_state returns after processing the buffer.
def feed(aBuf)
    unless @_mModel['keepEnglishLetter']
        aBuf = filter_without_english_letters(aBuf)
    end

    # 0 is truthy in Ruby, so emptiness has to be tested explicitly; an empty
    # buffer must neither touch the statistics nor re-evaluate the state.
    return get_state() if aBuf.empty?

    char_to_order = @_mModel['charToOrderMap']
    precedence = @_mModel['precedenceMatrix']

    # String#[] returns a one-character String on Ruby >= 1.9, which cannot be
    # used as an index into the order map. each_byte yields Integers on every
    # Ruby version; non-String buffers fall back to #each.
    bytes = aBuf.respond_to?(:each_byte) ? aBuf.each_byte : aBuf.each

    bytes.each do |byte|
        order = char_to_order[byte]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER

        if order < SAMPLE_SIZE
            @_mFreqChar += 1
            if @_mLastOrder < SAMPLE_SIZE
                @_mTotalSeqs += 1
                if @_mReversed
                    # reverse the order of the letters in the lookup
                    @_mSeqCounters[precedence[(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
                else
                    @_mSeqCounters[precedence[(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
                end
            end
        end
        @_mLastOrder = order
    end

    # Shortcut: with enough sequences seen, a very high or very low confidence
    # settles the outcome immediately.
    if get_state() == :Detecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
        cf = get_confidence()
        if cf > POSITIVE_SHORTCUT_THRESHOLD
            if DEBUG
                p('%s confidence = %s, we have a winner\n' % [@_mModel['charsetName'], cf])
            end
            @_mState = :FoundIt
        elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
            if DEBUG
                p('%s confidence = %s, below negative shortcut threshhold %s\n' % [@_mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD])
            end
            @_mState = :NotMe
        end
    end

    get_state()
end