Class: LanguageDetector::Profile

Inherits:
Object
  • Object
show all
Defined in:
lib/language_detector.rb

Constant Summary collapse

# Characters that tokenization treats as token separators: ASCII whitespace,
# ASCII punctuation and the digits 0-9. #initialize copies every entry into a
# lookup hash that #is_puctuation? consults.
# NOTE(review): `?x` character literals evaluate to Integers on Ruby 1.8 but to
# one-character Strings on Ruby 1.9+, so the element type of this array depends
# on the interpreter running the file.
PUNCTUATIONS =
[?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
# Maximum number of ranked n-grams stored by #init_with_file / #init_with_string,
# and the distance penalty #compute_distance adds for each n-gram that this
# profile does not contain.
LIMIT =
2000

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(name) ⇒ Profile

Returns a new instance of Profile.



140
141
142
143
144
145
# File 'lib/language_detector.rb', line 140

# Creates an empty profile.
#
# @param name [Object] label for this profile; stored as-is and exposed by #name
def initialize(name)
  @name = name
  # Lookup table for #is_puctuation?: every entry of PUNCTUATIONS maps to 1.
  @puctuations = Hash[PUNCTUATIONS.map { |mark| [mark, 1] }]
  # n-gram => rank; filled in later by #init_with_file or #init_with_string.
  @ngrams = {}
end

Instance Attribute Details

#name ⇒ Object (readonly)

Returns the value of attribute name.



138
139
140
# File 'lib/language_detector.rb', line 138

# Reader for @name, the value that was passed to #initialize.
#
# @return [Object] the profile's name
def name
  @name
end

#ngrams ⇒ Object (readonly)

Returns the value of attribute ngrams.



138
139
140
# File 'lib/language_detector.rb', line 138

# Reader for @ngrams. The hash starts empty in #initialize; #init_with_file and
# #init_with_string fill it with n-gram keys mapped to their 1-based frequency
# rank (1 = most frequent).
#
# @return [Hash] n-gram => rank
def ngrams
  @ngrams
end

Instance Method Details

#_init_with_string(str, ngram_count) ⇒ Object



179
180
181
182
183
184
185
186
187
# File 'lib/language_detector.rb', line 179

# Tokenizes +str+ and adds the 2-, 3-, 4- and 5-gram counts of every token to
# +ngram_count+ (which is mutated in place through #count_ngram).
#
# @param str [String] text to analyse
# @param ngram_count [Hash] running n-gram => occurrence count tally
# @return [Array<String>] the tokens produced by #tokenize
def _init_with_string str, ngram_count
  tokenize(str).each do |word|
    [2, 3, 4, 5].each { |size| count_ngram word, size, ngram_count }
  end
end

#compute_distance(other_profile) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/language_detector.rb', line 125

# Measures how far +other_profile+ is from this profile.
#
# For every n-gram in +other_profile+ the absolute difference between its rank
# there and its rank here is added; an n-gram that this profile does not
# contain adds the fixed penalty LIMIT instead.
#
# @param other_profile [#ngrams] object whose #ngrams returns an n-gram => rank hash
# @return [Integer] accumulated distance (0 when +other_profile+ has no n-grams)
def compute_distance other_profile
  other_profile.ngrams.inject(0) do |total, (ngram, other_rank)|
    own_rank = @ngrams[ngram]
    penalty = own_rank ? (other_rank - own_rank).abs : LanguageDetector::Profile::LIMIT
    total + penalty
  end
end

#count_ngram(token, n, counts) ⇒ Object



208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/language_detector.rb', line 208

# Adds the occurrences of every +n+-gram of +token+ to +counts+.
#
# When +n+ is greater than 1 and the token has at least +n+ characters, the
# token is first padded with one leading underscore and n-1 trailing
# underscores so that word boundaries take part in the n-grams. Shorter tokens
# are left unpadded and therefore contribute nothing.
#
# @param token [String] a single token as produced by #tokenize
# @param n [Integer] n-gram size
# @param counts [Hash] n-gram => occurrence count; mutated in place
# @return [Hash] the same +counts+ object
def count_ngram token, n, counts
  # String#jlength only exists when Ruby 1.8's jcode library is loaded; on
  # Ruby 1.9+ it is gone and String#length already counts characters.
  char_count = token.respond_to?(:jlength) ? token.jlength : token.length
  token = "_#{token}#{'_' * (n - 1)}" if n > 1 && char_count >= n

  # An empty range (token shorter than n) simply yields no n-grams.
  (0..token.length - n).each do |start|
    ngram = token[start, n]
    counts[ngram] = (counts[ngram] || 0) + 1
  end

  counts
end

#init_with_file(filename) ⇒ Object



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/language_detector.rb', line 147

# Trains this profile from a file in the training_data directory that sits next
# to this source file.
#
# Every line is fed through #_init_with_string to tally n-gram occurrences; the
# n-grams are then ordered by descending count and the first LIMIT of them are
# stored in @ngrams with their 1-based rank. The resolved path is announced on
# standard output before reading.
#
# @param filename [String] file name relative to the training_data directory
# @return [Array, nil] incidental result of the ranking loop (nil when the
#   LIMIT cut-off was reached)
def init_with_file filename
  ngram_count = {}

  path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
  puts "training with " + path
  # File.foreach closes the file once iteration ends (or raises); the previous
  # File.open(path).each_line left the handle open.
  File.foreach(path) { |line| _init_with_string line, ngram_count }

  ranked = ngram_count.sort { |left, right| right[1] <=> left[1] }
  ranked.each_with_index do |(ngram, _count), index|
    rank = index + 1
    @ngrams[ngram] = rank
    break if rank >= LIMIT
  end
end

#init_with_string(str) ⇒ Object



165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/language_detector.rb', line 165

# Trains this profile from an in-memory string.
#
# The string is fed through #_init_with_string to tally n-gram occurrences; the
# n-grams are then ordered by descending count and the first LIMIT of them are
# stored in @ngrams with their 1-based rank.
#
# @param str [String] training text
# @return [Array, nil] incidental result of the ranking loop (nil when the
#   LIMIT cut-off was reached)
def init_with_string str
  tally = {}
  _init_with_string str, tally

  by_frequency = tally.sort { |left, right| right[1] <=> left[1] }
  by_frequency.each_with_index do |(ngram, _count), position|
    rank = position + 1
    @ngrams[ngram] = rank
    break if rank >= LIMIT
  end
end

#is_puctuation?(b) ⇒ Boolean

Returns:

  • (Boolean)


204
205
206
# File 'lib/language_detector.rb', line 204

# Looks +b+ up in the @puctuations table that #initialize builds from
# PUNCTUATIONS.
#
# @param b [Object] candidate key; #tokenize passes the values yielded by String#each_byte
# @return [Integer, nil] 1 (truthy) when +b+ is a key of the table, nil otherwise
def is_puctuation? b
  @puctuations[b]
end

#tokenize(str) ⇒ Object



189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/language_detector.rb', line 189

# Splits +str+ into tokens at every character that #is_puctuation? recognises.
# Separator characters are dropped and empty tokens are never emitted.
#
# The string is walked character by character rather than byte by byte, so that
# multibyte characters reach the tokens intact. On Ruby 1.9+ appending an
# Integer byte to a String appends a code point, which corrupted non-ASCII text.
#
# @param str [String] text to split
# @return [Array<String>] tokens in order of appearance
def tokenize str
  tokens = []
  current = ''
  str.each_char do |char|
    if _punctuation_char?(char)
      tokens << current unless current.empty?
      current = ''
    else
      current << char
    end
  end
  tokens << current unless current.empty?
  tokens
end

# True when +char+ is a separator according to #is_puctuation?, whether the
# punctuation table is keyed by one-character Strings (Ruby 1.9+ `?x` literals)
# or by Integer byte values (Ruby 1.8 `?x` literals).
def _punctuation_char? char
  return true if is_puctuation?(char)

  # Only a single-byte character can match an Integer-keyed entry.
  bytes = char.unpack('C*')
  bytes.length == 1 && is_puctuation?(bytes.first) ? true : false
end