Class: UniversalDetector::HebrewProber

Inherits:
CharSetProber show all
Defined in:
lib/HebrewProber.rb

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_confidence

Constructor Details

#initialize ⇒ HebrewProber

Returns a new instance of HebrewProber.



154
155
156
157
158
159
# File 'lib/HebrewProber.rb', line 154

def initialize
    # Builds a prober with no model probers attached yet; they are supplied
    # later through set_model_probers. The final-letter scores and the
    # two-character history are initialised by reset.
    super
    @_mLogicalProber = @_mVisualProber = nil
    reset
end

Instance Method Details

#feed(aBuf) ⇒ Object



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# File 'lib/HebrewProber.rb', line 195

def feed(aBuf)
    # Accumulates final-letter evidence for the logical-vs-visual Hebrew
    # decision.
    #
    # Every character of the (filtered) buffer is examined together with the
    # two characters seen before it, looking for one of these patterns:
    # 1) A word longer than 1 letter, ending with a final letter. The text is
    #    laid out "naturally" since the final letter really appears at the
    #    end. +1 for the logical score.
    # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
    #    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi should not end
    #    with the Non-Final form of that letter (see is_non_final for the
    #    exceptions). The text is probably laid out backwards. +1 for the
    #    visual score.
    # 3) A word longer than 1 letter, starting with a final letter. Final
    #    letters should not appear at the beginning of a word, so the text is
    #    probably laid out backwards. +1 for the visual score.
    #
    # The two scores are accumulated over the whole text and compared in
    # get_charset_name. Final letters in the middle of a word are ignored
    # because they are not evidence for either layout.
    #
    # aBuf:: the next chunk of input. The two-character history is kept in
    #        instance variables, so words may span successive calls.
    #
    # Returns :NotMe once get_state reports :NotMe, otherwise :Detecting.

    # Both model probers say it's not them. No reason to continue.
    return :NotMe if get_state == :NotMe

    # Inherited filter; per the original note it replaces 7-bit characters
    # with spaces so that word-boundary detection works - TODO confirm
    # against CharSetProber.
    filtered = filter_high_bit_only(aBuf)

    # A String is not Enumerable: String#each was removed in Ruby 1.9 (and
    # iterated *lines* before that), so `for cur in string` never walked the
    # characters. Walk the characters explicitly; any other enumerable of
    # one-character values is still accepted unchanged.
    symbols = filtered.respond_to?(:each_char) ? filtered.each_char : filtered

    symbols.each do |cur|
        if cur == ' '
            # We stand on a space - a word just ended.
            if @_mBeforePrev != ' '
                # The next-to-last char was not a space, so @_mPrev is not a
                # 1 letter word.
                if is_final(@_mPrev)
                    # case (1) [-2:not space][-1:final letter][cur:space]
                    @_mFinalCharLogicalScore += 1
                elsif is_non_final(@_mPrev)
                    # case (2) [-2:not space][-1:Non-Final letter][cur:space]
                    @_mFinalCharVisualScore += 1
                end
            end
        elsif @_mBeforePrev == ' ' && is_final(@_mPrev)
            # case (3) [-2:space][-1:final letter][cur:not space]
            @_mFinalCharVisualScore += 1
        end
        @_mBeforePrev = @_mPrev
        @_mPrev = cur
    end

    # Forever detecting, till the end or until both model probers report
    # :NotMe (handled at the top).
    :Detecting
end

#get_charset_name ⇒ Object



255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# File 'lib/HebrewProber.rb', line 255

def get_charset_name
    # Decides between the logical and the visual Hebrew charset name.
    #
    # Three stages, each consulted only when the previous one is inconclusive:
    # 1. the final-letter score gap, when it reaches MIN_FINAL_CHAR_DISTANCE;
    # 2. the confidence gap of the two model probers, when it exceeds
    #    MIN_MODEL_DISTANCE;
    # 3. the bare sign of the final-letter gap, defaulting to logical.
    #
    # Returns LOGICAL_HEBREW_NAME or VISUAL_HEBREW_NAME.
    letter_gap = @_mFinalCharLogicalScore - @_mFinalCharVisualScore
    return LOGICAL_HEBREW_NAME if letter_gap >= MIN_FINAL_CHAR_DISTANCE
    return VISUAL_HEBREW_NAME if letter_gap <= -MIN_FINAL_CHAR_DISTANCE

    # The letter evidence is not dominant enough; ask the model probers.
    confidence_gap = @_mLogicalProber.get_confidence - @_mVisualProber.get_confidence
    return LOGICAL_HEBREW_NAME if confidence_gap > MIN_MODEL_DISTANCE
    return VISUAL_HEBREW_NAME if confidence_gap < -MIN_MODEL_DISTANCE

    # Still undecided: any negative letter gap means visual; a positive gap
    # or a complete tie falls back to logical.
    letter_gap < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end

#get_state ⇒ Object



284
285
286
287
288
289
290
# File 'lib/HebrewProber.rb', line 284

def get_state
    # Reports :NotMe only when both model probers have reported :NotMe;
    # while at least one of them is still active the answer is :Detecting.
    # The visual prober is only queried if the logical one has given up.
    both_gave_up = [@_mLogicalProber, @_mVisualProber].all? do |prober|
        prober.get_state == :NotMe
    end
    both_gave_up ? :NotMe : :Detecting
end

#is_final(c) ⇒ Object



177
178
179
# File 'lib/HebrewProber.rb', line 177

def is_final(c)
    # True when c equals one of the five FINAL_* letter constants
    # (Kaf, Mem, Nun, Pe, Tsadi); false otherwise.
    final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
    final_forms.member?(c)
end

#is_non_final(c) ⇒ Object



181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/HebrewProber.rb', line 181

def is_non_final(c)
    # True when c equals the regular (non-final) form of Kaf, Mem, Nun or Pe.
    #
    # Regular Tsadi is deliberately left out: in words such as 'lechotet'
    # (to chat) it is followed by an apostrophe, and once that apostrophe
    # has been turned into a space by the input filtering the Tsadi looks
    # like a word ending even though it is not one in the original text.
    #
    # Pe and Kaf occasionally misbehave in a similar way - 'Pop', 'Winamp'
    # and 'Mubarak' legitimately end with the regular form - but such words
    # are rare enough that keeping both letters helps more than it hurts.
    regular_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
    regular_forms.member?(c)
end

#reset ⇒ Object



161
162
163
164
165
166
167
168
169
170
# File 'lib/HebrewProber.rb', line 161

def reset
    # Returns the prober to its initial scoring state.
    #
    # Both final-letter counters start again from zero.
    @_mFinalCharLogicalScore = @_mFinalCharVisualScore = 0

    # The two most recently seen characters are primed with spaces, which
    # simulates a word delimiter in front of the very first byte of data.
    @_mPrev = ' '
    @_mBeforePrev = ' '

    # The model probers are left untouched here: they are owned by the
    # group prober.
end

#set_model_probers(logicalProber, visualProber) ⇒ Object



172
173
174
175
# File 'lib/HebrewProber.rb', line 172

def set_model_probers(logicalProber, visualProber)
    # Stores the two model probers that get_state and get_charset_name
    # query for their state and confidence.
    #
    # logicalProber:: prober for the logical Hebrew model
    # visualProber::  prober for the visual Hebrew model
    #
    # Returns visualProber (the value of the last assignment, as before).
    @_mLogicalProber, @_mVisualProber = logicalProber, visualProber
    visualProber
end