Class: WhatLanguage

Inherits:
Object
  • Object
show all
Defined in:
lib/whatlanguage.rb

Constant Summary collapse

# Library version string.
VERSION =
'1.0.3'
# Hash function handed to BloominSimple (as a block): takes the SHA1 digest of
# the downcased, stripped item and unpacks it with "VV", i.e. two 32-bit
# little-endian unsigned integers.
HASHER =
lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") }
# Size argument passed to BloominSimple.new when a filter is built from a
# dictionary file.
BITFIELD_WIDTH =
2_000_000
# Class-wide store of loaded filters, keyed by language symbol; populated by
# #initialize and read by #process_text.
@@data =
{}

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ WhatLanguage

Returns a new instance of WhatLanguage.



13
14
15
16
17
18
# File 'lib/whatlanguage.rb', line 13

# Loads the language filters found in the "lang" directory (one level above
# this file's directory) into the class-wide @@data store.
#
# Each directory entry whose name matches /\.lang/ is keyed by the first run of
# word characters in its file name, converted to a Symbol.
#
# @param options [Hash] accepted but not read by this method
def initialize(options = {})
  languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
  Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
    # ||= only evaluates the right-hand side when no entry is stored yet, so a
    # language that is already present is not read from disk again.
    @@data[lang[/\w+/].to_sym] ||= begin
      # File.binread opens, reads and closes the file in one call. The earlier
      # File.new(...).read never closed the handle, leaking one descriptor per
      # language file until garbage collection.
      dump = File.binread(File.join(languages_folder, lang))
      BloominSimple.from_dump(dump, &HASHER)
    end
  end
end

Class Method Details

.filter_from_dictionary(filename) ⇒ Object



48
49
50
51
52
# File 'lib/whatlanguage.rb', line 48

# Builds a new BloominSimple filter (BITFIELD_WIDTH wide, hashing with HASHER)
# and adds every line of the given file to it.
#
# Lines are passed to the filter as read, including their trailing newline.
#
# @param filename [String] path of the dictionary file to read
# @return [BloominSimple] the populated filter
def self.filter_from_dictionary(filename)
  bf = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
  # File.foreach closes the file once iteration finishes; the earlier
  # File.open(filename).each left the handle open.
  File.foreach(filename) { |word| bf.add(word) }
  bf
end

Instance Method Details

#language(text) ⇒ Object



44
45
46
# File 'lib/whatlanguage.rb', line 44

# Picks the language with the highest count reported by #process_text.
#
# @param text [String] the text to classify
# @return [Object, nil] the key of the highest-scoring entry, or nil when
#   there are no results or any StandardError is raised along the way
def language(text)
  tallies = process_text(text)
  winner = tallies.max { |(_, left), (_, right)| left <=> right }
  # winner is nil for an empty result set; the resulting NoMethodError is
  # absorbed by the rescue below, yielding nil.
  winner.first
rescue StandardError
  nil
end

#process_text(text) ⇒ Object

Very inefficient method for now, but it still beats the non-Bloom alternatives. Change to a better bit-comparison technique later.



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/whatlanguage.rb', line 22

# Counts, per language in @@data, how many whitespace-separated words of the
# text the language's filter reports via includes?.
#
# Words are downcased before lookup. Scanning may stop before the end of the
# text once one language is clearly ahead (see the check inside the loop).
#
# @param text [String] the text to scan
# @return [Hash] language key => match count (missing keys default to 0)
def process_text(text)
  tallies = Hash.new(0)

  text.split.map(&:downcase).each_with_index do |word, index|
    @@data.each do |lang, filter|
      tallies[lang] += 1 if filter.includes?(word)
    end

    # Only every fourth word, and only once at least two languages have
    # scored, is the early-exit check performed.
    words_seen = index + 1
    next unless (words_seen % 4).zero? && tallies.size > 1

    # The two highest counts; only the numbers matter for the decision.
    runner_up, leader = tallies.values.sort.last(2)

    # Thresholds are heuristic and may need tuning: the leader needs more than
    # 4 hits and must either more than double the runner-up or lead it by
    # more than 25.
    next unless leader > 4
    break if leader > runner_up * 2 || leader - runner_up > 25

    # A hard cap on the number of words scanned is intentionally not applied:
    #break if it > 100
  end

  tallies
end