Class: CharDet::HebrewProber

Inherits:
CharSetProber show all
Defined in:
lib/rchardet/hebrewprober.rb

Instance Attribute Summary

Attributes inherited from CharSetProber

#active

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_confidence

Constructor Details

#initialize ⇒ HebrewProber

Returns a new instance of HebrewProber.



151
152
153
154
155
156
# File 'lib/rchardet/hebrewprober.rb', line 151

def initialize
  super()
  # Both model probers start out unset; they are supplied later through
  # set_model_probers.
  @logicalProber = @visualProber = nil
  # Put the final-letter counters and the two-character history into their
  # starting state.
  reset
end

Instance Method Details

#feed(aBuf) ⇒ Object



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/rchardet/hebrewprober.rb', line 192

def feed(aBuf)
  # Final letter analysis for the logical-visual decision.
  # Scans the buffer one character at a time, looking for evidence that the
  # text is either logical Hebrew or visual Hebrew. A sliding window of the
  # two previously seen characters (@beforePrev, @prev) plus the current one
  # is kept in instance state, so words spanning two feed calls still count.
  #
  # The following cases are checked:
  # 1) A word longer than 1 letter, ending with a final letter. This is an
  #    indication that the text is laid out "naturally" since the final letter
  #    really appears at the end. +1 for logical score.
  # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
  #    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi should not end with
  #    the Non-Final form of that letter. Exceptions to this rule are described
  #    in is_non_final. This is an indication that the text is laid out
  #    backwards. +1 for visual score.
  # 3) A word longer than 1 letter, starting with a final letter. Final letters
  #    should not appear at the beginning of a word. This is an indication that
  #    the text is laid out backwards. +1 for visual score.
  #
  # The visual score and logical score are accumulated throughout the text and
  # are finally compared with each other in get_charset_name.
  # Final letters in the middle of words are not checked, since that case is
  # not an indication for either logical or visual text.
  #
  # aBuf    - String holding the next chunk of input.
  # Returns ENotMe when get_state reports ENotMe, otherwise EDetecting.

  # Both model probers say it's not them. No reason to continue.
  return ENotMe if get_state == ENotMe

  # All 7-bit characters are filtered out (replaced with spaces) so that the
  # word boundary detection below works properly. [MAP]
  filtered = filter_high_bit_only(aBuf)

  # Walk the buffer character by character. Splitting on ' ' would yield whole
  # words and discard the spaces, so the word-boundary cases below could never
  # be observed.
  # NOTE(review): assumes the buffer is in a single-byte (or binary) encoding
  # so that each_char yields one byte per step -- confirm against the caller.
  filtered.each_char do |cur|
    if cur == ' '
      # We stand on a space - a word just ended.
      if @beforePrev != ' '
        # The next-to-last char was not a space, so @prev is not a 1 letter word.
        if is_final(@prev)
          # case (1) [-2:not space][-1:final letter][cur:space]
          @finalCharLogicalScore += 1
        elsif is_non_final(@prev)
          # case (2) [-2:not space][-1:Non-Final letter][cur:space]
          @finalCharVisualScore += 1
        end
      end
    elsif @beforePrev == ' ' && is_final(@prev)
      # Not standing on a space.
      # case (3) [-2:space][-1:final letter][cur:not space]
      @finalCharVisualScore += 1
    end

    # Slide the two-character history window forward.
    @beforePrev = @prev
    @prev = cur
  end

  # Forever detecting, till the end or until both model probers return ENotMe
  # (handled at the top of this method).
  EDetecting
end

#get_charset_name ⇒ Object



252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# File 'lib/rchardet/hebrewprober.rb', line 252

def get_charset_name
  # Decide between the logical and the visual Hebrew charset name.
  #
  # Step 1: trust the accumulated final-letter scores when their difference
  # reaches MIN_FINAL_CHAR_DISTANCE in either direction.
  score_gap = @finalCharLogicalScore - @finalCharVisualScore
  return LOGICAL_HEBREW_NAME if score_gap >= MIN_FINAL_CHAR_DISTANCE
  return VISUAL_HEBREW_NAME if score_gap <= -MIN_FINAL_CHAR_DISTANCE

  # Step 2: the letter scores are too close, so compare the confidences of
  # the two model probers and require more than MIN_MODEL_DISTANCE between them.
  confidence_gap = @logicalProber.get_confidence - @visualProber.get_confidence
  return LOGICAL_HEBREW_NAME if confidence_gap > MIN_MODEL_DISTANCE
  return VISUAL_HEBREW_NAME if confidence_gap < -MIN_MODEL_DISTANCE

  # Step 3: still undecided. Any visual lead in the letter scores, however
  # small, picks visual; a logical lead or a complete tie defaults to logical.
  score_gap < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end

#get_state ⇒ Object



281
282
283
284
285
286
287
# File 'lib/rchardet/hebrewprober.rb', line 281

def get_state
  # This prober stays in the detecting state while at least one of the two
  # model probers has not reported ENotMe. The logical prober is asked first;
  # the visual prober is only consulted when the logical one has given up.
  both_gave_up = [@logicalProber, @visualProber].all? do |prober|
    prober.get_state == ENotMe
  end
  both_gave_up ? ENotMe : EDetecting
end

#is_final(c) ⇒ Object



174
175
176
# File 'lib/rchardet/hebrewprober.rb', line 174

def is_final(c)
  # True when c equals one of the five final-form letter constants
  # (Kaf, Mem, Nun, Pe, Tsadi); false for anything else.
  final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
  final_forms.any? { |letter| letter == c }
end

#is_non_final(c) ⇒ Object



178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/rchardet/hebrewprober.rb', line 178

def is_non_final(c)
  # True when c equals one of the normal (non-final) forms of Kaf, Mem, Nun
  # or Pe; false for anything else.
  #
  # The normal Tsadi is deliberately left out: words such as 'lechotet'
  # (to chat) carry an apostrophe after the tsadi, and
  # FilterWithoutEnglishLetters turns that apostrophe into a space. The
  # Non-Final tsadi then looks as if it ends a word although it does not in
  # the original text.
  #
  # Pe and Kaf occasionally show a similar weakness: words like 'Pop',
  # 'Winamp' and 'Mubarak' legitimately end with a Non-Final Pe or Kaf. Such
  # words are rare, so the benefit of keeping these letters outweighs the
  # damage.
  non_final_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
  non_final_forms.any? { |letter| letter == c }
end

#reset ⇒ Object



158
159
160
161
162
163
164
165
166
167
# File 'lib/rchardet/hebrewprober.rb', line 158

def reset
  # Start both final-letter evidence counters from zero.
  @finalCharLogicalScore = @finalCharVisualScore = 0
  # @prev and @beforePrev remember the last two characters seen in the
  # previous buffer. Seeding them with spaces simulates a word delimiter in
  # front of the very first byte of data.
  # The model probers are not touched here; they are owned by the group prober.
  @prev = ' '
  @beforePrev = ' '
end

#set_model_probers(logicalProber, visualProber) ⇒ Object



169
170
171
172
# File 'lib/rchardet/hebrewprober.rb', line 169

def set_model_probers(logicalProber, visualProber)
  # Remember the two model probers whose state and confidence are consulted
  # by get_state and get_charset_name.
  @logicalProber, @visualProber = logicalProber, visualProber
  # Hand back the visual prober, as a plain trailing assignment would.
  visualProber
end