Class: WhatLanguage

Inherits:
Object
  • Object
show all
Defined in:
lib/whatlanguage.rb,
lib/whatlanguage/version.rb

Constant Summary collapse

# Hash function handed to the filters: downcases and strips the item, takes
# its SHA1 digest, and unpacks the first two 32-bit little-endian unsigned
# integers from it ("VV").
HASHER =
lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") }
# Width passed to BloominSimple.new when a filter is built from a dictionary.
BITFIELD_WIDTH =
2_000_000
# Library version string.
VERSION =
'1.0.5'
# Class-level cache of loaded filters, keyed by language symbol; being a class
# variable, it is shared by every WhatLanguage instance.
@@data =
{}

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*selection) ⇒ WhatLanguage

Returns a new instance of WhatLanguage.



12
13
14
15
16
17
18
# File 'lib/whatlanguage.rb', line 12

# Builds a detector and makes sure the shared language filters are loaded.
#
# Every entry of the ../lang directory (relative to this file) whose name
# matches /\.lang/ is read in binary mode and turned into a filter with
# BloominSimple.from_dump. Filters are cached in @@data under the leading
# word of the file name, so a dump is only read when its key is not cached yet.
#
# @param selection [Array<Symbol>] languages to restrict detection to; when
#   none are given the selection defaults to [:all].
# @return [WhatLanguage] a new instance of WhatLanguage
def initialize(*selection)
  @selection = selection.empty? ? [:all] : selection
  languages_folder = File.join(File.dirname(__FILE__), "..", "lang")

  # NOTE(review): the pattern is unanchored, so names such as "x.lang.bak"
  # would also match - confirm whether that is intended.
  Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
    key = lang[/\w+/].to_sym
    next if @@data[key]

    # Block-form open closes the handle as soon as the dump has been read,
    # rather than leaving one open descriptor per language file for the GC.
    dump = File.open(File.join(languages_folder, lang), 'rb') { |file| file.read }
    @@data[key] = BloominSimple.from_dump(dump, &HASHER)
  end
end

Class Method Details

.filter_from_dictionary(filename) ⇒ Object



55
56
57
58
59
# File 'lib/whatlanguage.rb', line 55

# Builds a BloominSimple filter from a dictionary file.
#
# A new filter of BITFIELD_WIDTH using HASHER is created and every line of the
# file is passed to its #add exactly as read (trailing newline included).
#
# @param filename [String] path of the dictionary file; each line is added as
#   one entry
# @return [BloominSimple] the populated filter
def self.filter_from_dictionary(filename)
  bf = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
  # File.foreach closes the file when iteration ends, so no handle is leaked.
  File.foreach(filename) { |word| bf.add(word) }
  bf
end

Instance Method Details

#language(text) ⇒ Object



51
52
53
# File 'lib/whatlanguage.rb', line 51

# Picks the language with the highest score for +text+.
#
# The scores come from process_text; the key of the highest-scoring entry is
# returned. Any StandardError raised along the way (including the one caused
# by there being no scores at all) results in nil.
#
# @param text [String] text to identify
# @return [Object, nil] key of the best-scoring language, or nil
def language(text)
  scores = process_text(text)
  winner = scores.max { |left, right| left.last <=> right.last }
  winner.first
rescue StandardError
  nil
end

#process_text(text) ⇒ Object

Very inefficient method for now, but it still beats the non-Bloom alternatives. Change to a better bit-comparison technique later.



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/whatlanguage.rb', line 22

# Counts, per language, how many words of +text+ are found in that language's
# filter.
#
# The text is downcased and split on whitespace. Each word is checked against
# the filter of every candidate language: all loaded languages when the
# selection includes :all, otherwise only the loaded languages that are also
# in the selection.
#
# Every fourth word, once at least two languages have scored, the two highest
# counts are compared and scanning stops early when the leader has more than
# 4 hits and is either more than double the runner-up or more than 25 ahead.
#
# @param text [String] text to analyse
# @return [Hash] language key => number of matching words; languages that
#   never matched have no entry (the hash's default value is 0)
def process_text(text)
  results = Hash.new(0)

  # The candidate set does not depend on the word, so work it out once.
  languages = @selection.include?(:all) ? @@data.keys : (@@data.keys & @selection)

  text.downcase.split.each_with_index do |word, index|
    languages.each do |lang|
      results[lang] += 1 if @@data[lang].includes?(word)
    end

    # Only consider an early exit on every fourth word, and only when there
    # is a runner-up to compare the leader against.
    next unless (index + 1) % 4 == 0 && results.size > 1

    leader, runner_up = results.values.sort.last(2).reverse

    # These thresholds are heuristic and may need some tweaking one day.
    break if leader > 4 && ((leader > runner_up * 2) || (leader - runner_up > 25))
  end

  results
end