Class: CharDet::HebrewProber

Inherits:

CharSetProber

Object
CharSetProber
CharDet::HebrewProber

show all

Defined in: lib/rchardet/hebrewprober.rb

Instance Attribute Summary

Attributes inherited from CharSetProber

#active

Instance Method Summary collapse

#feed(aBuf) ⇒ Object
#get_charset_name ⇒ Object
#get_state ⇒ Object
#initialize ⇒ HebrewProber constructor

A new instance of HebrewProber.
#is_final(c) ⇒ Object
#is_non_final(c) ⇒ Object
#reset ⇒ Object
#set_model_probers(logicalProber, visualProber) ⇒ Object

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_confidence

Constructor Details

#initialize ⇒ `HebrewProber`

Returns a new instance of HebrewProber.

# File 'lib/rchardet/hebrewprober.rb', line 151

def initialize
  # Run the parent constructor explicitly with no arguments.
  super()
  # No model probers are attached yet; set_model_probers assigns them later.
  @logicalProber = @visualProber = nil
  # Put the final-letter scores and the two-character history in their
  # starting state.
  reset
end

Instance Method Details

#feed(aBuf) ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 192

def feed(aBuf)
  # Final letter analysis for the logical-vs-visual decision.
  # Looks for evidence that the received buffer is either logical Hebrew or
  # visual Hebrew. The following cases are checked:
  # 1) A word longer than 1 letter, ending with a final letter. This is an
  #    indication that the text is laid out "naturally" since the final letter
  #    really appears at the end. +1 for logical score.
  # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
  #    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi should not end with
  #    the Non-Final form of that letter (exceptions are discussed in
  #    is_non_final). This is an indication that the text is laid out
  #    backwards. +1 for visual score.
  # 3) A word longer than 1 letter, starting with a final letter. Final letters
  #    should not appear at the beginning of a word. This is an indication that
  #    the text is laid out backwards. +1 for visual score.
  #
  # The visual and logical scores are accumulated across calls and are compared
  # against each other in get_charset_name. Final letters in the middle of a
  # word are not checked, since that case indicates neither layout.
  #
  # aBuf:    the next chunk of text to analyse.
  # Returns: ENotMe when get_state reports ENotMe, otherwise EDetecting.

  # Both model probers say it's not them. No reason to continue.
  return ENotMe if get_state == ENotMe

  # All 7-bit characters are filtered out (replaced with spaces) so that the
  # word boundary detection below works properly.
  aBuf = filter_high_bit_only(aBuf)

  # Walk the buffer one character at a time. Splitting on ' ' would throw away
  # the spaces that mark word boundaries and hand back whole words, so the
  # space-driven cases (1) and (2) could never trigger.
  # NOTE(review): assumes aBuf is a binary/single-byte string so that each
  # character is exactly one byte - confirm against the caller.
  aBuf.each_char do |cur|
    if cur == ' '
      # We stand on a space - a word just ended.
      if @beforePrev != ' '
        # The next-to-last char was not a space, so @prev is not a 1 letter word.
        if is_final(@prev)
          # case (1) [-2:not space][-1:final letter][cur:space]
          @finalCharLogicalScore += 1
        elsif is_non_final(@prev)
          # case (2) [-2:not space][-1:Non-Final letter][cur:space]
          @finalCharVisualScore += 1
        end
      end
    elsif @beforePrev == ' ' && is_final(@prev)
      # case (3) [-2:space][-1:final letter][cur:not space]
      @finalCharVisualScore += 1
    end

    # Slide the two-character history window; it is kept in instance state so
    # it carries over to the next buffer.
    @beforePrev = @prev
    @prev = cur
  end

  # Forever detecting, till the end or until both model probers return ENotMe
  # (handled at the top of this method).
  EDetecting
end

#get_charset_name ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 252

def get_charset_name
  # Decide between the logical and the visual Hebrew charset name.
  #
  # Step 1: trust the final-letter evidence when the gap between the two
  # accumulated scores reaches MIN_FINAL_CHAR_DISTANCE in either direction.
  score_gap = @finalCharLogicalScore - @finalCharVisualScore
  return LOGICAL_HEBREW_NAME if score_gap >= MIN_FINAL_CHAR_DISTANCE
  return VISUAL_HEBREW_NAME if score_gap <= -MIN_FINAL_CHAR_DISTANCE

  # Step 2: the final-letter evidence is inconclusive, so compare the
  # confidences reported by the two model probers instead.
  confidence_gap = @logicalProber.get_confidence - @visualProber.get_confidence
  return LOGICAL_HEBREW_NAME if confidence_gap > MIN_MODEL_DISTANCE
  return VISUAL_HEBREW_NAME if confidence_gap < -MIN_MODEL_DISTANCE

  # Step 3: the models are too close as well. Fall back to the sign of the
  # final-letter gap; a tie or a positive gap defaults to logical.
  score_gap < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end

#get_state ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 281

def get_state
  # This prober keeps detecting while at least one model prober is still in
  # the running. It reports ENotMe only once the logical and the visual
  # prober have both reported ENotMe (the visual prober is only consulted
  # when the logical one has already given up).
  both_gave_up = [@logicalProber, @visualProber].all? do |prober|
    prober.get_state == ENotMe
  end
  both_gave_up ? ENotMe : EDetecting
end

#is_final(c) ⇒ `Object`



174
175
176

# File 'lib/rchardet/hebrewprober.rb', line 174

def is_final(c)
  # True when c equals one of the five final-form letter constants
  # (Kaf, Mem, Nun, Pe, Tsadi); false otherwise.
  final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
  final_forms.any? { |letter| letter == c }
end

#is_non_final(c) ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 178

def is_non_final(c)
  # True when c equals the regular (non-final) form of Kaf, Mem, Nun or Pe.
  #
  # The regular Tsadi is left out on purpose: words such as 'lechotet'
  # (to chat) carry an apostrophe right after the tsadi, and that apostrophe
  # is turned into a space by the English-letter filtering, so a Non-Final
  # tsadi would look like a word ending although it is not one in the
  # original text.
  #
  # Pe and Kaf have a milder form of the same weakness - 'Pop', 'Winamp' and
  # 'Mubarak', for example, legitimately end with a Non-Final Pe or Kaf - but
  # such words are rare enough that keeping these letters helps more than it
  # hurts.
  regular_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
  regular_forms.any? { |letter| letter == c }
end

#reset ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 158

def reset
  # Clear both accumulated final-letter scores.
  @finalCharLogicalScore, @finalCharVisualScore = 0, 0
  # @prev and @beforePrev hold the last two characters seen in the previous
  # buffer. Both start out as a space, which simulates a word delimiter at
  # the beginning of the data.
  # The model probers are not touched here; they are owned by the group prober.
  @prev = ' '
  @beforePrev = ' '
end

#set_model_probers(logicalProber, visualProber) ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 169

def set_model_probers(logicalProber, visualProber)
  # Remember the two model probers that get_state and get_charset_name
  # consult. The visual prober is handed back as the result.
  @logicalProber, @visualProber = logicalProber, visualProber
  visualProber
end

Class: CharDet::HebrewProber

Instance Attribute Summary

Attributes inherited from CharSetProber

Instance Method Summary collapse

Methods inherited from CharSetProber

Constructor Details

#initialize ⇒ HebrewProber

Instance Method Details

#feed(aBuf) ⇒ Object

#get_charset_name ⇒ Object

#get_state ⇒ Object

#is_final(c) ⇒ Object

#is_non_final(c) ⇒ Object

#reset ⇒ Object

#set_model_probers(logicalProber, visualProber) ⇒ Object

#initialize ⇒ `HebrewProber`

#feed(aBuf) ⇒ `Object`

#get_charset_name ⇒ `Object`

#get_state ⇒ `Object`

#is_final(c) ⇒ `Object`

#is_non_final(c) ⇒ `Object`

#reset ⇒ `Object`

#set_model_probers(logicalProber, visualProber) ⇒ `Object`