Class: WhatLanguage
- Inherits:
-
Object
- Object
- WhatLanguage
- Defined in:
- lib/whatlanguage.rb,
lib/whatlanguage/version.rb
Constant Summary collapse
- HASHER =
lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") }
- BITFIELD_WIDTH =
2_000_000
- VERSION =
'1.0.5'
- @@data =
{}
Class Method Summary collapse
Instance Method Summary collapse
-
#initialize(*selection) ⇒ WhatLanguage
constructor
A new instance of WhatLanguage.
- #language(text) ⇒ Object
-
#process_text(text) ⇒ Object
Very inefficient method for now..
Constructor Details
#initialize(*selection) ⇒ WhatLanguage
Returns a new instance of WhatLanguage.
12 13 14 15 16 17 18 |
# File 'lib/whatlanguage.rb', line 12

# Creates a detector and makes sure the per-language filters are loaded.
#
# Every entry of the sibling "lang" directory whose name ends in ".lang" is
# read in binary mode, rebuilt with BloominSimple.from_dump (using HASHER)
# and cached in the class-level @@data hash. The key is the first run of
# word characters in the file name, as a symbol. Languages already present
# in @@data are not read again.
#
# selection - zero or more values stored in @selection; when none are
#             given, [:all] is stored instead.
def initialize(*selection)
  @selection = selection.empty? ? [:all] : selection

  languages_folder = File.join(File.dirname(__FILE__), "..", "lang")

  # Anchored so that only names ENDING in ".lang" are treated as dumps.
  Dir.entries(languages_folder).grep(/\.lang\z/).each do |lang|
    @@data[lang[/\w+/].to_sym] ||= begin
      # Block form of File.open closes the handle as soon as the dump is read.
      dump = File.open(File.join(languages_folder, lang), 'rb') { |io| io.read }
      BloominSimple.from_dump(dump, &HASHER)
    end
  end
end
Class Method Details
.filter_from_dictionary(filename) ⇒ Object
55 56 57 58 59 |
# File 'lib/whatlanguage.rb', line 55

# Builds a BloominSimple filter from the lines of a dictionary file.
#
# A new filter is created with BITFIELD_WIDTH and HASHER, and every line of
# the file is handed to its #add exactly as read (line terminator included).
#
# filename - path of the file to read.
#
# Returns the populated filter.
def self.filter_from_dictionary(filename)
  bf = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
  # File.foreach opens the file, yields each line and closes it afterwards,
  # so no file handle is left open once the filter has been built.
  File.foreach(filename) { |word| bf.add(word) }
  bf
end
Instance Method Details
#language(text) ⇒ Object
51 52 53 |
# File 'lib/whatlanguage.rb', line 51

# Picks the language with the highest score reported by process_text.
#
# text - the text to classify.
#
# Returns the key of the highest-scoring entry, or nil when +text+ is nil
# or process_text reports no scores at all. Errors raised while scoring are
# no longer swallowed by a blanket `rescue nil`; they reach the caller.
def language(text)
  return nil if text.nil?

  best = process_text(text).max_by { |_lang, hits| hits }
  # max_by yields nil for an empty result set, i.e. no language scored.
  best && best.first
end
#process_text(text) ⇒ Object
Very inefficient method for now, but it still beats the non-Bloom alternatives. Change to a better bit-comparison technique later.
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# File 'lib/whatlanguage.rb', line 22

# Scores +text+ against the loaded language filters.
#
# The text is downcased and split on whitespace. For each word, every
# candidate language whose filter in @@data answers true to #includes? gets
# one point. Candidates are all keys of @@data when @selection contains
# :all, otherwise the keys of @@data that also appear in @selection.
#
# After every fourth word, once at least two languages have scored, the
# scan stops early if the leader has more than 4 points and either more
# than twice the runner-up's points or a lead of more than 25 points.
#
# text - the String to analyse.
#
# Returns a Hash (default value 0) mapping each language that scored at
# least once to its number of points.
def process_text(text)
  results = Hash.new(0)

  # The candidate list does not depend on the current word, so it is
  # computed once instead of on every iteration.
  languages = @selection.include?(:all) ? @@data.keys : @@data.keys & @selection

  text.downcase.split.each_with_index do |word, index|
    languages.each do |lang|
      results[lang] += 1 if @@data[lang].includes?(word)
    end

    # Every now and then check to see if we have a really convincing
    # result.. if so, exit early.
    next unless (index + 1) % 4 == 0 && results.size > 1

    # Ascending sort: the last two values are the second-best and best scores.
    runner_up, leader = results.values.sort.last(2)

    # Next line may need some tweaking one day..
    break if leader > 4 && (leader > runner_up * 2 || leader - runner_up > 25)
  end

  results
end