Class: GIGO::CharDet::JapaneseContextAnalysis

Inherits:
Object
  • Object
show all
Defined in:
lib/gigo/rchardet/jpcntx.rb

Direct Known Subclasses

EUCJPContextAnalysis, SJISContextAnalysis

Instance Method Summary collapse

Constructor Details

#initialize ⇒ JapaneseContextAnalysis

Returns a new instance of JapaneseContextAnalysis.



124
125
126
# File 'lib/gigo/rchardet/jpcntx.rb', line 124

# Creates a new analyser. All state set-up is delegated to #reset, so a
# freshly constructed object and a reset one start from the same state.
def initialize
  reset
end

Instance Method Details

#feed(aBuf, aLen) ⇒ Object



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/gigo/rchardet/jpcntx.rb', line 136

# Feeds one buffer of bytes into the analyser and updates the
# character-pair statistics.
#
# aBuf - the buffer to analyse; it is sliced two units at a time and each
#        slice is handed to #get_order.
# aLen - how many leading units of aBuf are valid.
#
# Returns nil. Does nothing once @_mDone has been set.
def feed(aBuf, aLen)
  return if @_mDone

  # The buffer we get is byte oriented, and a character may span more than
  # one buffer. If the last one or two bytes of the previous buffer did not
  # complete a character, we recorded how many bytes are still needed to
  # complete it and skip those bytes here. We could record those bytes and
  # analyse the character once it is complete, but a single character makes
  # little difference, so skipping it simplifies the logic and improves
  # performance.
  pending_skip = @_mNeedToSkipCharNum
  if pending_skip > aLen
    # This buffer is shorter than what is still owed: swallow it entirely
    # and carry the remainder over to the next call.
    @_mNeedToSkipCharNum = pending_skip - aLen
    return
  end
  # The pending skip is consumed by this buffer. Clearing it here keeps a
  # stale count from dropping the leading bytes of every later buffer.
  @_mNeedToSkipCharNum = 0

  i = pending_skip
  while i < aLen
    order, charLen = get_order(aBuf[i...i + 2])
    i += charLen
    if i > aLen
      # The character runs past the end of this buffer: remember how many
      # bytes of it belong to the next buffer and break the pair chain.
      @_mNeedToSkipCharNum = i - aLen
      @_mLastCharOrder = -1
    else
      if order != -1 && @_mLastCharOrder != -1
        @_mTotalRel += 1
        if @_mTotalRel > MAX_REL_THRESHOLD
          @_mDone = true
          break
        end
        # NOTE(review): jp2CharContext is a lower-case name, so Ruby resolves
        # it as a method or local, not a constant - confirm it is defined as
        # such elsewhere in the file.
        @_mRelSample[jp2CharContext[@_mLastCharOrder][order]] += 1
      end
      @_mLastCharOrder = order
    end
  end
end

#get_confidence ⇒ Object



170
171
172
173
174
175
176
177
# File 'lib/gigo/rchardet/jpcntx.rb', line 170

# Computes a confidence value from the statistics gathered so far.
#
# Returns DONT_KNOW while no more than MINIMUM_DATA_THRESHOLD sequences have
# been counted. Otherwise returns, as a Float, the fraction of counted
# sequences that did not fall into category 0.
def get_confidence
  # This is just one way to calculate confidence. It works well for me.
  return DONT_KNOW unless @_mTotalRel > MINIMUM_DATA_THRESHOLD

  # Both operands are Integers; convert first so the ratio is not truncated
  # to 0 or 1 by integer division.
  (@_mTotalRel - @_mRelSample[0]).to_f / @_mTotalRel
end

#get_order(aStr) ⇒ Object



179
180
181
# File 'lib/gigo/rchardet/jpcntx.rb', line 179

# Base-class placeholder: ignores aStr and always answers with the pair
# [-1, 1]. #feed unpacks that pair as (order, charLen).
def get_order(aStr)
  [-1, 1]
end

#got_enough_data ⇒ Object



166
167
168
# File 'lib/gigo/rchardet/jpcntx.rb', line 166

# Reports whether the number of counted sequences has gone past
# ENOUGH_REL_THRESHOLD. Returns true or false.
def got_enough_data
  @_mTotalRel > ENOUGH_REL_THRESHOLD
end

#reset ⇒ Object



128
129
130
131
132
133
134
# File 'lib/gigo/rchardet/jpcntx.rb', line 128

# Puts the analyser back into its starting state.
def reset
  # Total number of sequences received so far.
  @_mTotalRel = 0
  # One integer counter per category; each counts the sequences seen in
  # that category.
  @_mRelSample = Array.new(NUM_OF_CATEGORY, 0)
  # If the last byte of the current buffer is not the last byte of a
  # character, this is how many bytes to skip in the next buffer.
  @_mNeedToSkipCharNum = 0
  # Order of the previous character (-1 when there is none).
  @_mLastCharOrder = -1
  # Once this flag is true, detection is done and a conclusion has been made.
  @_mDone = false
end