Class: Langusta::Detector

Inherits:
Object
  • Object
show all
Defined in:
lib/langusta/detector.rb

Constant Summary collapse

ALPHA_DEFAULT =
0.5
ALPHA_WIDTH =
0.05
ITERATION_LIMIT =
1000
PROB_THRESHOLD =
0.1
CONV_THRESHOLD =
0.99999
BASE_FREQ =
10000
UNKNOWN_LANG =
"unknown"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(factory) ⇒ Detector

Returns a new instance of Detector.



13
14
15
16
17
18
19
20
21
22
23
# File 'lib/langusta/detector.rb', line 13

# Builds a detector from the data exposed by +factory+.
#
# @param factory [#word_lang_prob_map, #lang_list] object supplying the
#   word/language probability map and the language list; both are stored
#   as returned, without copying.
def initialize(factory)
  # Data taken from the factory.
  @word_lang_prob_map = factory.word_lang_prob_map
  @lang_list = factory.lang_list

  # Per-detection state: nothing to analyse yet, no probabilities computed.
  @text = []
  @langprob = nil

  # Tunable settings and their defaults.
  @alpha = ALPHA_DEFAULT
  @n_trial = 7
  @max_text_length = 10_000
  @prior_map = nil
  @verbose = false
end

Instance Attribute Details

#alpha ⇒ Object

Returns the value of attribute alpha.



3
4
5
# File 'lib/langusta/detector.rb', line 3

# Reader for the +alpha+ attribute.
#
# NOTE(review): presumably the smoothing parameter used while computing
# language probabilities — confirm against the code that reads @alpha.
#
# @return [Object] the current value of @alpha; the constructor sets it to
#   ALPHA_DEFAULT (0.5).
def alpha
  @alpha
end

#max_text_length ⇒ Object

Returns the value of attribute max_text_length.



3
4
5
# File 'lib/langusta/detector.rb', line 3

# Reader for the +max_text_length+ attribute.
#
# NOTE(review): #append, as shown in this file, does not consult this value;
# check where (or whether) the limit is enforced.
#
# @return [Object] the current value of @max_text_length; the constructor
#   sets it to 10000.
def max_text_length
  @max_text_length
end

#verbose ⇒ Object

Returns the value of attribute verbose.



3
4
5
# File 'lib/langusta/detector.rb', line 3

# Reader for the +verbose+ attribute.
#
# @return [Object] the current value of @verbose; the constructor sets it
#   to false.
def verbose
  @verbose
end

Instance Method Details

#append(text) ⇒ Object

Append more text to be recognized.

Parameters:

  • text (UCS2String)

    text to be recognized



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/langusta/detector.rb', line 27

# Cleans up +text+ and stores the result as the text to be recognized.
#
# The input is passed through Codepoints.gsub! three times (URL pattern,
# mail pattern, then — after every element has been run through
# NGram.normalize — the space pattern), each time with "\x00\x20" as the
# replacement.
#
# NOTE(review): the result is assigned to @text, replacing whatever was
# stored before rather than being concatenated to it — confirm this is
# intended given the method name.
#
# @param text [Array] text to be recognized; checked via Guard.klass
#   (presumably raising when it is not an Array — confirm).
# @return [Object] the value stored in @text.
def append(text)
  Guard.klass(text, Array, __method__)

  without_urls = Codepoints.gsub!(text, RegexHelper::URL_REGEX, "\x00\x20")
  without_mail = Codepoints.gsub!(without_urls, RegexHelper::MAIL_REGEX, "\x00\x20")
  normalized = without_mail.map { |codepoint| NGram.normalize(codepoint) }

  @text = Codepoints.gsub!(normalized, RegexHelper::SPACE_REGEX, "\x00\x20")
end

#detect ⇒ String

Detect the language.

Returns:

  • (String)

    (usually) a two-letter code describing the language, or "unknown" (UNKNOWN_LANG) when no language probabilities are available.



41
42
43
44
# File 'lib/langusta/detector.rb', line 41

# Detects the language of the stored text.
#
# Takes the first entry returned by get_probabilities (presumably the most
# probable candidate — confirm the ordering there) and returns its +lang+.
#
# @return [String] the +lang+ of the first candidate, or UNKNOWN_LANG when
#   get_probabilities returns nothing.
def detect
  candidates = get_probabilities
  return UNKNOWN_LANG unless candidates.length > 0

  candidates.first.lang
end