Class: Langusta::Detector

Inherits:
Object
  • Object
show all
Defined in:
lib/langusta/detector.rb

Constant Summary collapse

ALPHA_DEFAULT =
0.5
ALPHA_WIDTH =
0.05
ITERATION_LIMIT =
1000
PROB_THRESHOLD =
0.1
CONV_THRESHOLD =
0.99999
BASE_FREQ =
10000
UNKNOWN_LANG =
"unknown"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(factory) ⇒ Detector

Returns a new instance of Detector.



13
14
15
16
17
18
19
20
21
22
23
# File 'lib/langusta/detector.rb', line 13

# Sets up a new detector from the given factory.
#
# @param factory an object responding to +word_lang_prob_map+ and +lang_list+
def initialize(factory)
  # Values obtained from the factory.
  @word_lang_prob_map = factory.word_lang_prob_map
  @lang_list = factory.lang_list

  # Text buffer starts out as an empty UCS2String; nothing computed yet.
  @text = UCS2String.new('')
  @langprob = @prior_map = nil

  # Default settings.
  @alpha = ALPHA_DEFAULT
  @n_trial = 7
  @max_text_length = 10_000
  @verbose = false
end

Instance Attribute Details

#alpha ⇒ Object

Returns the value of attribute alpha.



3
4
5
# File 'lib/langusta/detector.rb', line 3

# Reader for the @alpha instance variable.
#
# @return [Object] the current value of @alpha; the constructor sets it to
#   ALPHA_DEFAULT
def alpha
  @alpha
end

#max_text_length ⇒ Object

Returns the value of attribute max_text_length.



3
4
5
# File 'lib/langusta/detector.rb', line 3

# Reader for the @max_text_length instance variable.
#
# @return [Object] the current value of @max_text_length; the constructor
#   sets it to 10000
def max_text_length
  @max_text_length
end

#verbose ⇒ Object

Returns the value of attribute verbose.



3
4
5
# File 'lib/langusta/detector.rb', line 3

# Reader for the @verbose instance variable.
#
# @return [Object] the current value of @verbose; the constructor sets it
#   to false
def verbose
  @verbose
end

Instance Method Details

#append(text) ⇒ Object

Append more text to be recognized.

Parameters:

  • text (UCS2String)

    the text to be recognized.

Raises:

  • (TypeError)


27
28
29
30
31
32
33
34
35
# File 'lib/langusta/detector.rb', line 27

# Cleans up the given text and stores the result in @text.
#
# URL matches, mail matches and (after per-character normalization) space
# matches are each substituted with the two-byte string "\x00\x20".
#
# NOTE(review): @text is assigned, not extended, so a second call replaces
# whatever an earlier call stored -- confirm this is intended.
# NOTE(review): the gsub! calls presumably modify the caller's object in
# place, and if UCS2String#gsub! mirrors String#gsub! (nil when nothing was
# substituted) @text could end up nil -- verify against UCS2String.
#
# @param text [UCS2String] the text to be recognized
# @raise [TypeError] if text is not a UCS2String
def append(text)
  unless text.is_a?(UCS2String)
    raise TypeError, "Expected: UCS2String, got: #{text.class}"
  end

  text.gsub!(RegexHelper::URL_REGEX, "\x00\x20")
  text.gsub!(RegexHelper::MAIL_REGEX, "\x00\x20")

  # Run every character through NGram.normalize before collapsing spaces.
  normalized = text.map { |c| NGram.normalize(c) }
  @text = normalized.gsub!(RegexHelper::SPACE_REGEX, "\x00\x20")
end

#detect ⇒ String

Detect the language.

Returns:

  • (String)

    (usually) two-letter code describing the language.



39
40
41
42
# File 'lib/langusta/detector.rb', line 39

# Picks the language of the first entry returned by get_probabilities.
#
# @return [String] the +lang+ of the first entry, or UNKNOWN_LANG when
#   get_probabilities returns nothing
def detect
  candidates = get_probabilities
  return UNKNOWN_LANG unless candidates.length > 0

  candidates.first.lang
end