Class: LanguageDetector::Profile

Inherits:
Object
  • Object
show all
Defined in:
lib/language_detector.rb

Constant Summary collapse

LIMIT =
1500
PUNCTUATION_REGEX =
/[\W^_\d]+/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ Profile

Returns a new instance of Profile.



150
151
152
153
154
155
156
157
158
# File 'lib/language_detector.rb', line 150

# Builds a profile and optionally trains it immediately.
#
# Only the first positional argument is read; it is treated as an options
# Hash with these keys:
#   :name - label stored in @name (defaults to "")
#   :text - when present, passed to init_with_string
#   :file - when present, passed to init_with_file
#
# @param args [Array] at most one options Hash is consulted
def initialize(*args)
  # With no arguments args.first is nil; fall back to an empty Hash so
  # Profile.new yields an empty, unnamed profile instead of raising
  # NoMethodError on nil[:name].
  options = args.first || {}

  @name = options[:name] || ""
  @ngrams = {}

  init_with_string(options[:text]) if options[:text]
  init_with_file(options[:file]) if options[:file]
end

Instance Attribute Details

#name ⇒ Object

Returns the value of attribute name.



148
149
150
# File 'lib/language_detector.rb', line 148

# Reader for the profile's label.
#
# @return [Object] whatever is currently stored in @name
def name; @name; end

#ngrams ⇒ Object

Returns the value of attribute ngrams.



148
149
150
# File 'lib/language_detector.rb', line 148

# Reader for the n-gram table.
#
# @return [Object] whatever is currently stored in @ngrams
def ngrams; @ngrams; end

Instance Method Details

#compute_distance(other_profile) ⇒ Object



160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/language_detector.rb', line 160

# Computes a rank-based distance between this profile and +other_profile+.
#
# For every n-gram in other_profile.ngrams:
#   * if this profile also has it, the absolute difference between the
#     two stored values is added;
#   * otherwise the fixed penalty LIMIT is added.
#
# N-grams that exist only in this profile do not contribute.
#
# @param other_profile [#ngrams] object whose #ngrams yields key/value pairs
# @return [Integer] accumulated distance (0 when other_profile has no n-grams)
def compute_distance(other_profile)
  other_profile.ngrams.reduce(0) do |distance, (ngram, other_rank)|
    # Single lookup; nil means the n-gram is unknown to this profile.
    own_rank = @ngrams[ngram]
    distance + (own_rank ? (other_rank - own_rank).abs : LIMIT)
  end
end

#count_ngram(token, n, counts) ⇒ Object



207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/language_detector.rb', line 207

# Adds every n-gram of length +n+ found in +token+ to +counts+.
#
# When n > 1 and the token has at least n characters, the token is first
# padded with one leading "_" and n-1 trailing "_" so that word boundaries
# become part of the n-grams. Tokens shorter than n contribute nothing.
#
# @param token [String] a single token
# @param n [Integer] n-gram length
# @param counts [Hash{String=>Integer}] tally, updated in place
# @return [Hash{String=>Integer}] the same +counts+ object
def count_ngram(token, n, counts)
  token = "_#{token}#{'_' * (n - 1)}" if n > 1 && token.length >= n

  # Slide a window of width n over the (possibly padded) token. The range
  # is empty when the token is shorter than n.
  0.upto(token.length - n) do |start|
    gram = token[start, n]
    # fetch keeps the count correct whether or not the hash has a default.
    counts[gram] = counts.fetch(gram, 0) + 1
  end

  counts
end

#generate_ngrams(str, ngram_count) ⇒ Object



197
198
199
200
201
202
# File 'lib/language_detector.rb', line 197

# Tokenizes +str+ and tallies the 2-, 3-, 4- and 5-grams of every token
# into +ngram_count+ via count_ngram.
#
# @param str [String] raw text
# @param ngram_count [Hash] tally, updated in place
# @return [Array] the token list produced by tokenize
def generate_ngrams(str, ngram_count)
  tokenize(str).each do |word|
    (2..5).each { |size| count_ngram(word, size, ngram_count) }
  end
end

#init_with_file(filename) ⇒ Object



174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/language_detector.rb', line 174

# Trains the profile from a file under the training_data/ directory that
# sits next to this source file.
#
# Every line is fed through generate_ngrams; the resulting n-grams are
# sorted by descending count and the LIMIT most frequent ones are stored
# in @ngrams with their 1-based rank. Prints "training with <path>" to
# stdout once the file has been read.
#
# @param filename [String] file name relative to training_data/
def init_with_file(filename)
  ngram_count = Hash.new(0)

  path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
  # Block form closes the handle once reading finishes or raises.
  File.open(path) do |file|
    file.each_line { |line| generate_ngrams(line, ngram_count) }
  end
  puts "training with " + path

  # Keep exactly LIMIT entries so no stored rank exceeds LIMIT, the penalty
  # compute_distance charges for a missing n-gram.
  ranked = ngram_count.sort { |a, b| b[1] <=> a[1] }.first(LIMIT)
  ranked.each_with_index { |(ngram, _count), i| @ngrams[ngram] = i + 1 }
end

#init_with_string(str) ⇒ Object



187
188
189
190
191
192
193
194
195
# File 'lib/language_detector.rb', line 187

# Trains the profile from an in-memory string.
#
# The n-grams produced by generate_ngrams are sorted by descending count
# and the LIMIT most frequent ones are stored in @ngrams with their
# 1-based rank.
#
# @param str [String] text to build the profile from
def init_with_string(str)
  ngram_count = {}
  generate_ngrams(str, ngram_count)

  # Keep exactly LIMIT entries so no stored rank exceeds LIMIT, the penalty
  # compute_distance charges for a missing n-gram.
  ranked = ngram_count.sort { |a, b| b[1] <=> a[1] }.first(LIMIT)
  ranked.each_with_index { |(ngram, _count), i| @ngrams[ngram] = i + 1 }
end

#is_punctuation?(char) ⇒ Boolean

Returns:

  • (Boolean)


205
# File 'lib/language_detector.rb', line 205

# Tests +char+ against PUNCTUATION_REGEX.
#
# @param char [String] text to test
# @return [Integer, nil] index of the first match, or nil when nothing
#   matches (truthy/falsy rather than strictly true/false)
def is_punctuation?(char)
  char =~ PUNCTUATION_REGEX
end

#tokenize(str) ⇒ Object



204
# File 'lib/language_detector.rb', line 204

# Splits +str+ into tokens, using runs matched by PUNCTUATION_REGEX as
# separators.
#
# NOTE(review): in Ruby 1.9+ \W is ASCII-based, so non-ASCII letters are
# treated as separators too; confirm this is intended.
#
# @param str [String] raw text
# @return [Array<String>] tokens; a leading empty string appears when
#   +str+ starts with a separator
def tokenize(str)
  str.split(PUNCTUATION_REGEX)
end