Class: Langusta::LangProfile

Inherits:
Object
  • Object
show all
Defined in:
lib/langusta/lang_profile.rb

Constant Summary collapse

MINIMUM_FREQ =
2
LESS_FREQ_RATIO =
100_000

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(name, freq = {}, n_words = Array.new(NGram::N_GRAM, 0)) ⇒ LangProfile

Returns a new instance of LangProfile.



26
27
28
29
# File 'lib/langusta/lang_profile.rb', line 26

# Builds a language profile from a name and, optionally, pre-populated
# frequency data.
#
# @param name [String] profile name; handed to Guard.klass together with
#   String (presumably a type check -- confirm against Guard)
# @param freq [Hash] n-gram frequency table; defaults to an empty hash
# @param n_words [Array] per-length n-gram counters; defaults to an array of
#   NGram::N_GRAM zeros
def initialize(name, freq={}, n_words = Array.new(NGram::N_GRAM, 0))
  Guard.klass(name, String, __method__)

  @name = name
  @freq = freq
  @n_words = n_words
end

Instance Attribute Details

#freq ⇒ Object (readonly)

Returns the value of attribute freq.



7
8
9
# File 'lib/langusta/lang_profile.rb', line 7

# Read-only accessor for the frequency table stored in @freq by the
# constructor (the table that #add increments per n-gram).
#
# @return [Object] the current value of @freq
def freq
  @freq
end

#n_words ⇒ Object (readonly)

Returns the value of attribute n_words.



7
8
9
# File 'lib/langusta/lang_profile.rb', line 7

# Read-only accessor for the per-length counters stored in @n_words; #add
# increments the slot at index (n-gram length - 1).
#
# @return [Object] the current value of @n_words
def n_words
  @n_words
end

#name ⇒ Object (readonly)

Returns the value of attribute name.



7
8
9
# File 'lib/langusta/lang_profile.rb', line 7

# Read-only accessor for the profile name stored in @name by the constructor.
#
# @return [Object] the current value of @name
def name
  @name
end

Class Method Details

.load_from_file(filename) ⇒ LangProfile

Constructs a language profile from a file. Converts all NGrams from UTF-8 to Unicode codepoints.

Parameters:

  • filename (String)

    file name of the language profile.

Returns:



12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/langusta/lang_profile.rb', line 12

# Constructs a language profile from a JSON profile file. Every n-gram key of
# the 'freq' table is passed through Langusta.utf82cp (documented as
# converting UTF-8 to Unicode codepoints) before being stored.
#
# @param filename [String] file name of the language profile
# @return [LangProfile] profile built from the 'name', 'freq' and 'n_words'
#   entries of the parsed document
# @raise [CorruptProfileError] when 'freq', 'name' or 'n_words' is missing
def self.load_from_file(filename)
  # Block form guarantees the file handle is closed, even if parsing raises.
  json = File.open(filename) { |file| Yajl::Parser.parse(file) }

  # Fail with the same error class as the other missing-field checks instead
  # of a NoMethodError on nil.
  raw_freq = json['freq'] || (raise CorruptProfileError, "Missing frequency table")
  freq = raw_freq.each_with_object({}) do |(ngram, count), table|
    table[Langusta.utf82cp(ngram)] = count
  end

  name = json['name'] || (raise CorruptProfileError, "Missing profile name")
  n_words = json['n_words'] || (raise CorruptProfileError, "Missing number of words value")

  new(name, freq, n_words)
end

Instance Method Details

#add(gram) ⇒ Object

Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.

Parameters:

  • gram (Array<Fixnum>)


33
34
35
36
37
38
39
40
41
42
# File 'lib/langusta/lang_profile.rb', line 33

# Records one occurrence of an n-gram in this profile. Calling it repeatedly
# with the same n-gram keeps incrementing that n-gram's count.
#
# @param gram [Array<Fixnum>, nil] n-gram to count; nil is ignored, and so is
#   any n-gram whose size is outside 1..NGram::N_GRAM
# @return [Integer, nil] the updated count for +gram+, or nil when ignored
def add(gram)
  return if gram.nil?
  Guard.klass(gram, Array, __method__)

  size = gram.size
  return unless size.between?(1, NGram::N_GRAM)

  # Counter slot is indexed by n-gram length minus one.
  @n_words[size - 1] += 1
  @freq[gram] = (@freq[gram] || 0) + 1
end

#omit_less_freq ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/langusta/lang_profile.rb', line 44

# Prunes the frequency table in two passes, keeping @n_words in step with
# every removal.
#
# Pass 1 drops each n-gram whose count is at or below the cutoff, which is
# @n_words[0] / LESS_FREQ_RATIO but never less than MINIMUM_FREQ. While
# scanning, it totals the counts of surviving n-grams that match
# RegexHelper::ROMAN_REGEX.
#
# Pass 2 runs only when that total is below a third of @n_words[0] as it
# stands after pass 1: it drops every n-gram matching
# RegexHelper::INCL_ROMAN_REGEX.
def omit_less_freq
  cutoff = [@n_words[0] / LESS_FREQ_RATIO, MINIMUM_FREQ].max
  roman_total = 0

  # Iterate over a snapshot of the keys because entries are deleted on the way.
  Set.new(@freq.keys).each do |ngram|
    occurrences = @freq[ngram]

    if occurrences > cutoff
      # temp workaround
      roman_total += occurrences if RegexHelper::ROMAN_REGEX.match(Langusta.cp2utf8(ngram))
      next
    end

    @n_words[ngram.size - 1] -= occurrences
    @freq.delete(ngram)
  end

  return unless roman_total < @n_words[0] / 3

  Set.new(@freq.keys).each do |ngram|
    # temp workaround
    next unless RegexHelper::INCL_ROMAN_REGEX.match(Langusta.cp2utf8(ngram))

    @n_words[ngram.size - 1] -= @freq[ngram]
    @freq.delete(ngram)
  end
end