Class: UniversalDetector::HebrewProber

Inherits:

CharSetProber

Object
CharSetProber
UniversalDetector::HebrewProber

show all

Defined in: lib/HebrewProber.rb

Instance Method Summary collapse

#feed(aBuf) ⇒ Object
#get_charset_name ⇒ Object
#get_state ⇒ Object
#initialize ⇒ HebrewProber constructor

A new instance of HebrewProber.
#is_final(c) ⇒ Object
#is_non_final(c) ⇒ Object
#reset ⇒ Object
#set_model_probers(logicalProber, visualProber) ⇒ Object

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_confidence

Constructor Details

#initialize ⇒ `HebrewProber`

Returns a new instance of HebrewProber.

# File 'lib/HebrewProber.rb', line 154

def initialize
    # Run the parent prober's constructor first, then start with no model
    # probers attached; they are supplied later via set_model_probers.
    super()
    @_mLogicalProber = @_mVisualProber = nil
    # Zero the scores and seed the two-character look-behind window.
    reset
end

Instance Method Details

#feed(aBuf) ⇒ `Object`

# File 'lib/HebrewProber.rb', line 195

def feed(aBuf)
    # Final letter analysis for the logical-vs-visual decision.
    #
    # Scans the buffer one character at a time, looking for evidence that the
    # text is either logical Hebrew or visual Hebrew. Three cases are scored:
    # 1) A word longer than 1 letter, ending with a final letter. The final
    #    letter really appears at the end, so the text is laid out
    #    "naturally". +1 for the logical score.
    # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
    #    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi should not end
    #    with the Non-Final form of that letter (exceptions are discussed in
    #    is_non_final). This suggests the text is laid out backwards.
    #    +1 for the visual score.
    # 3) A word longer than 1 letter, starting with a final letter. Final
    #    letters should not begin a word, so this also suggests backwards
    #    layout. +1 for the visual score.
    #
    # Both scores accumulate across calls and are compared in
    # get_charset_name. Final letters in the middle of a word are ignored
    # because they indicate neither layout.
    #
    # aBuf    - the next chunk of input; it is passed through
    #           filter_high_bit_only before scanning (per the original notes,
    #           7-bit characters become spaces so word boundaries are
    #           detected properly).
    # Returns :NotMe when get_state reports :NotMe, otherwise :Detecting.

    # Both model probers say it's not them. No reason to continue.
    return :NotMe if get_state == :NotMe

    aBuf = filter_high_bit_only(aBuf)

    # The scan below is per character. `for cur in aBuf` relied on #each,
    # which String does not provide on Ruby >= 1.9 (and which walks lines,
    # not characters, on 1.8). Use each_char when the buffer offers it and
    # fall back to plain #each otherwise, so a buffer that is already a
    # sequence of characters behaves exactly as before.
    # NOTE(review): the return type of filter_high_bit_only is not visible
    # here - confirm it is a String or a sequence of 1-character strings.
    chars = aBuf.respond_to?(:each_char) ? aBuf.each_char : aBuf

    chars.each do |cur|
        if cur == ' '
            # We stand on a space - a word just ended.
            if @_mBeforePrev != ' '
                # Next-to-last char was not a space, so @_mPrev is not a
                # 1 letter word.
                if is_final(@_mPrev)
                    # case (1) [-2:not space][-1:final letter][cur:space]
                    @_mFinalCharLogicalScore += 1
                elsif is_non_final(@_mPrev)
                    # case (2) [-2:not space][-1:Non-Final letter][cur:space]
                    @_mFinalCharVisualScore += 1
                end
            end
        elsif @_mBeforePrev == ' ' && is_final(@_mPrev)
            # case (3) [-2:space][-1:final letter][cur:not space]
            @_mFinalCharVisualScore += 1
        end

        # Slide the two-character look-behind window forward.
        @_mBeforePrev = @_mPrev
        @_mPrev = cur
    end

    # Forever detecting, till the end or until both model probers return
    # :NotMe (handled at the top of this method).
    :Detecting
end

#get_charset_name ⇒ `Object`

# File 'lib/HebrewProber.rb', line 255

def get_charset_name
    # Decide between the logical and the visual Hebrew charset name.
    #
    # Step 1: trust the final-letter score gap when it reaches
    # MIN_FINAL_CHAR_DISTANCE in either direction.
    final_gap = @_mFinalCharLogicalScore - @_mFinalCharVisualScore
    return LOGICAL_HEBREW_NAME if final_gap >= MIN_FINAL_CHAR_DISTANCE
    return VISUAL_HEBREW_NAME if final_gap <= -MIN_FINAL_CHAR_DISTANCE

    # Step 2: the letter evidence is not dominant enough, so compare the
    # confidences reported by the two model probers instead.
    model_gap = @_mLogicalProber.get_confidence - @_mVisualProber.get_confidence
    return LOGICAL_HEBREW_NAME if model_gap > MIN_MODEL_DISTANCE
    return VISUAL_HEBREW_NAME if model_gap < -MIN_MODEL_DISTANCE

    # Step 3: still undecided - fall back to the sign of the final-letter
    # gap. A positive gap or a complete tie defaults to Logical.
    final_gap < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end

#get_state ⇒ `Object`

# File 'lib/HebrewProber.rb', line 284

def get_state
    # Report :NotMe only once BOTH model probers have given up; while either
    # one is still in play this prober keeps :Detecting. The logical prober
    # is queried first and the visual one only if the logical says :NotMe.
    model_probers = [@_mLogicalProber, @_mVisualProber]
    both_gave_up = model_probers.all? { |prober| prober.get_state == :NotMe }
    both_gave_up ? :NotMe : :Detecting
end

#is_final(c) ⇒ `Object`



177
178
179

# File 'lib/HebrewProber.rb', line 177

def is_final(c)
    # True when c equals one of the five final-form letter constants
    # (Kaf, Mem, Nun, Pe, Tsadi); false otherwise.
    final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
    final_forms.any? { |letter| letter == c }
end

#is_non_final(c) ⇒ `Object`

# File 'lib/HebrewProber.rb', line 181

def is_non_final(c)
    # True when c equals the normal (non-final) form of Kaf, Mem, Nun or Pe.
    #
    # Normal Tsadi is deliberately left out: in words such as 'lechotet'
    # (to chat) the tsadi is followed by an apostrophe, and once that
    # apostrophe has been turned into a space by the buffer filtering the
    # Non-Final tsadi looks like it ends a word although it does not in the
    # original text.
    # Pe and Kaf occasionally have a related weakness - words like 'Pop',
    # 'Winamp' and 'Mubarak' legitimately end with a Non-Final Pe or Kaf -
    # but such words are rare enough that keeping these letters helps more
    # than it hurts.
    non_final_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
    non_final_forms.any? { |letter| letter == c }
end

#reset ⇒ `Object`

# File 'lib/HebrewProber.rb', line 161

def reset
    # Start both final-letter tallies from zero.
    @_mFinalCharLogicalScore = @_mFinalCharVisualScore = 0
    # The look-behind window holds the last two characters seen in the
    # previous buffer. Seeding both slots with a space simulates a word
    # delimiter in front of the very first byte of data.
    @_mPrev = ' '
    @_mBeforePrev = ' '
    # The model probers are not touched here: they are owned by the group
    # prober.
end

#set_model_probers(logicalProber, visualProber) ⇒ `Object`

# File 'lib/HebrewProber.rb', line 172

def set_model_probers(logicalProber, visualProber)
    # Attach the two model probers whose states and confidences are consulted
    # by get_state and get_charset_name.
    @_mLogicalProber, @_mVisualProber = logicalProber, visualProber
    # Evaluate to the visual prober, matching the value of the last plain
    # assignment this method has always ended with.
    visualProber
end

Class: UniversalDetector::HebrewProber

Instance Method Summary collapse

Methods inherited from CharSetProber

Constructor Details

#initialize ⇒ HebrewProber

Instance Method Details

#feed(aBuf) ⇒ Object

#get_charset_name ⇒ Object

#get_state ⇒ Object

#is_final(c) ⇒ Object

#is_non_final(c) ⇒ Object

#reset ⇒ Object

#set_model_probers(logicalProber, visualProber) ⇒ Object

#initialize ⇒ `HebrewProber`

#feed(aBuf) ⇒ `Object`

#get_charset_name ⇒ `Object`

#get_state ⇒ `Object`

#is_final(c) ⇒ `Object`

#is_non_final(c) ⇒ `Object`

#reset ⇒ `Object`

#set_model_probers(logicalProber, visualProber) ⇒ `Object`