Module: Mongolian

Included in:
String
Defined in:
lib/mongolian.rb,
lib/mongolian/latin.rb,
lib/mongolian/version.rb,
lib/mongolian/stemmify.rb,
lib/mongolian/tokenizer.rb,
lib/mongolian/spellchecker.rb

Defined Under Namespace

Classes: Error

Constant Summary collapse

VERSION =
"0.1.1"

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.double_constant_check(str) ⇒ Object

蒙古语里没有复辅音是指在书面语单音节中没有复辅音 P89



62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/mongolian/spellchecker.rb', line 62

# Checks each syllable of +str+ for consonant clusters around its vowels.
#
# For every syllable yielded by +syllable+, all vowels are collected and
# joined into one string. The syllable passes when fewer than two characters
# come before the first occurrence of that string and fewer than two come
# after it.
#
# NOTE(review): +syllable+ is not defined in the visible part of this
# file — confirm where it comes from and what it returns.
#
# @param str [String] the word to inspect
# @return [Integer] 1 when every syllable passes, 0 as soon as one fails
def self.double_constant_check(str)
  syllable(str).each do |syl|
    vowels = syl.scan(/[ᠠᠡᠢᠣᠤᠥᠦ]/).join
    onset = syl.index(vowels)
    # onset = characters before the vowels; the remainder counts those after.
    return 0 unless onset < 2 && syl.size - onset - vowels.size < 2
  end
  1
end

Instance Method Details

#ae?(str) ⇒ Boolean

_ 用于分写 ᠠ(a)/ᠡ(e) 跟前面的辅音(h/g + a; n, l, m, s, sh, j, y, r , W + a/e)

Returns:

  • (Boolean)


81
82
83
84
85
86
87
# File 'lib/mongolian/spellchecker.rb', line 81

# Tells whether +str+ contains the separator character directly followed by
# the vowel ᠠ or ᠡ.
#
# NOTE(review): the first character inside the pattern is invisible in most
# editors — presumably U+180E MONGOLIAN VOWEL SEPARATOR; confirm.
#
# @param str [String] text to test
# @return [Boolean] true when the pattern occurs anywhere in +str+
def ae?(str)
  (str =~ /᠎[ᠠᠡ]/) ? true : false
end

#is_mongolian? ⇒ Boolean

判断一个词或者字符是否包含蒙古文字符,如果包含蒙古文返回 true,否则返回 nil。

Returns:

  • (Boolean)


5
6
7
# File 'lib/mongolian/tokenizer.rb', line 5

# Reports whether the receiver contains at least one character matched by
# the +\p{Mongolian}+ script property.
#
# @return [true, nil] true on a match; nil (not false) otherwise
def is_mongolian?
  true if self =~ /\p{Mongolian}/
end

#mon_l2m ⇒ Object

对拉丁转写蒙古文恢复为蒙古文



102
103
104
105
106
# File 'lib/mongolian/latin.rb', line 102

# Placeholder for restoring Mongolian script from its Latin transliteration.
# No conversion is performed yet: the receiver's text comes back unchanged,
# as a copy.
#
# @return [String] copy of the receiver
def mon_l2m
  # TODO: convert the Latin transliteration to the corresponding Mongolian script.
  dup.to_str
end

#mon_m2l ⇒ Object

转换传统蒙古文为拉丁转写



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/mongolian/latin.rb', line 9

# Character-by-character transliteration table used by #mon_m2l.
# Keys are single characters; values are their Latin transliteration
# (an empty string removes the character).
MON_M2L_TABLE = {
  # Basic letters
  "ᠠ" => "a",
  "ᠡ" => "e",
  "ᠢ" => "i",
  "ᠣ" => "o",
  "ᠤ" => "u",
  "ᠥ" => "ö",
  "ᠦ" => "ü",
  "ᠧ" => "ë",
  "ᠨ" => "n",
  "ᠩ" => "ng",
  "ᠪ" => "b",
  "ᠫ" => "p",
  "ᠬ" => "x",
  "ᠭ" => "g",
  "ᠮ" => "m",
  "ᠯ" => "l",
  "ᠰ" => "s",
  "ᠱ" => "š",
  "ᠲ" => "t",
  "ᠳ" => "d",
  "ᠴ" => "č",
  "ᠵ" => "ǰ",
  "ᠶ" => "y",
  "ᠷ" => "r",
  "ᠸ" => "w",
  "ᠹ" => "f",
  "ᠺ" => "k",
  "ᠻ" => "ḳ",
  "ᠼ" => "c",
  "ᠽ" => "z",
  "ᠾ" => "h",
  "ᠿ" => "ž",
  "ᡀ" => "lh",
  "ᡁ" => "ẑ",
  "ᡂ" => "ĉ",
  # Punctuation and marks
  "᠀" => "&",
  "᠁" => "…",
  "᠂" => ",",
  "᠃" => ".",
  "᠇" => ":",
  "᠈" => "#",
  "᠊" => "‐",
  # Invisible format characters
  "᠋" => "", # fvs1 (free variation selector one) — removed
  "᠌" => "", # fvs2 — removed
  "᠍" => "", # fvs3 — removed
  "᠎" => "_", # presumably U+180E MONGOLIAN VOWEL SEPARATOR — confirm
  # Digits
  "᠐" => "'0",
  "᠑" => "'1",
  "᠒" => "'2",
  "᠓" => "'3",
  "᠔" => "'4",
  "᠕" => "'5",
  "᠖" => "'6",
  "᠗" => "'7",
  "᠘" => "'8",
  "᠙" => "'9",
  # Additional letters and signs
  "ᡛ" => "ń",
  "ᢀ" => "ṃ",
  "ᢁ" => "ḥ",
  "ᢂ" => "â",
  "ᢃ" => "ŏ",
  "ᢄ" => "ô",
  "ᢅ" => "ˑ",
  "ᢆ" => "ːˑ",
  "ᢇ" => "ā",
  "ᢈ" => "ī",
  "ᢉ" => "ḵ",
  "ᢊ" => "ṉ",
  "ᢋ" => "ƈ",
  "ᢌ" => "ť",
  "ᢍ" => "ţ",
  "ᢎ" => "ḏ",
  "ᢏ" => "ņ",
  "ᢐ" => "ṯ",
  "ᢑ" => "ḓ",
  "ᢒ" => "p̄",
  "ᢓ" => "ṕh",
  "ᢔ" => "ş",
  "ᢕ" => "ẖ",
  "ᢖ" => "ẕ",
  "ᢗ" => "ʒ̄",
  "ᢦ" => "ŭ",
  "ᢧ" => "ÿ",
  "ᢩ" => "̲",
  # Invisible joiners / spacing; code points below are assumptions to confirm
  "‌" => "^", # presumably U+200C ZERO WIDTH NON-JOINER
  "‍" => "*", # presumably U+200D ZERO WIDTH JOINER
  " " => "-"  # presumably U+202F NARROW NO-BREAK SPACE
}.freeze

# Matches any single character that has an entry in MON_M2L_TABLE.
MON_M2L_PATTERN = Regexp.union(MON_M2L_TABLE.keys).freeze

# Converts traditional Mongolian script to its Latin transliteration.
#
# Every character listed in MON_M2L_TABLE is replaced by its table value in a
# single pass over the text; all other characters are copied through
# unchanged. The receiver itself is not modified.
#
# @return [String] a new, transliterated string
def mon_m2l
  # One gsub with a hash replacement instead of one gsub per table entry:
  # each match is looked up in the table and substituted by its value.
  dup.to_str.gsub(MON_M2L_PATTERN, MON_M2L_TABLE)
end

#mon_stemmify ⇒ Object

对做好分词的蒙古文词进行词干提取。



3
4
5
6
7
8
9
10
11
12
13
# File 'lib/mongolian/stemmify.rb', line 3

# Extracts the stem of an already-tokenized Mongolian word.
#
# Words shorter than four characters are returned as an unchanged copy.
# Otherwise, if the word contains the separator character, only the text
# before its first occurrence is returned.
#
# NOTE(review): the character inside the pattern is invisible in most
# editors — presumably U+180E MONGOLIAN VOWEL SEPARATOR; confirm.
#
# @return [String] the stem, or a copy of the word when nothing is stripped
def mon_stemmify
  word = dup.to_str
  return word if word.length < 4

  # Whatever follows the Mongolian space is always a suffix, so drop it.
  hit = word.match(/᠎/)
  hit ? hit.pre_match : word
end

#mon_syllable_classify ⇒ Object

对单词划分音节:每个元音前最多保留一个辅音,在该辅音之前即可划分音节。返回值是音节数组。



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/mongolian/spellchecker.rb', line 36

# Splits the receiver into syllables.
#
# Characters are accumulated one by one. When a vowel arrives right after a
# non-vowel and the accumulated text holds more than one character, a
# boundary is placed before that last character: everything ahead of it is
# emitted as a syllable, and the character starts the next syllable together
# with the vowel. Consecutive vowels stay in the same syllable.
#
# @return [Array<String>] the syllables in order; empty for an empty string
def mon_syllable_classify
  vowel = /[ᠠᠡᠢᠣᠤᠥᠦ]/
  syllables = []
  current = ""
  dup.to_str.each_char do |ch|
    # A boundary needs: a vowel now, a non-vowel just before it, and
    # something left over to emit once that non-vowel is carried forward.
    boundary = ch =~ vowel && !(current[-1] =~ vowel) && current.size > 1
    if boundary
      syllables << current[0..-2]
      current = current[-1] + ch
    else
      current += ch
    end
  end
  syllables << current unless current.empty?
  syllables
end

#mon_tokenize ⇒ Object

对于混合有多个文种的一段文字,删除其中的非蒙古文字符和蒙古文标点符号后,返回切分得到的词数组。



12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/mongolian/tokenizer.rb', line 12

# Tokenizes mixed-script text into Mongolian words.
#
# Steps:
# 1. every listed punctuation mark is replaced by a space;
# 2. every character that is neither Mongolian script (+\p{Mongolian}+, the
#    same property #is_mongolian? tests) nor whitespace is deleted;
# 3. the remainder is split on whitespace.
#
# Whitespace has to survive step 2: it is not Mongolian script, so deleting
# it would glue all words together and the split could never yield more than
# one token.
#
# @return [Array<String>] the Mongolian words in order; empty when none remain
def mon_tokenize
  text = dup.to_str
  # Strip punctuation by turning each mark into a separator.
  text = text.gsub(/[!᠄?·᠃᠂⁈⁉᠁—;《》]/, " ")
  # Delete all non-Mongolian characters, but keep whitespace as separators.
  text.gsub(/[^\p{Mongolian}\s]/, "").split
end

#mon_vowel_harmony ⇒ Object

测试单词是否符合元音和谐律,并判断词性(参考 P87)。返回值 0:错误,不符合元音和谐律(外来词、双词根词或者拼写错误);返回值 1:正确,且为阳性词;返回值 10:阳性词,但有错误,第一音节出现第五元音,却在后面出现了第四元音;返回值 2:正确,阴性词;返回值 20:阴性词,但有错误,第一音节出现第七元音,却在后面出现了第六元音。



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/mongolian/spellchecker.rb', line 10

# Checks the receiver against vowel harmony and classifies it.
#
# Vowel order used in the comments below: ᠠ ᠡ ᠢ ᠣ ᠤ ᠥ ᠦ (1st..7th).
#
# @return [Integer]
#   0  — the word mixes vowels from [ᠠᠣᠤ] with vowels from [ᠡᠥᠦ];
#   1  — only [ᠠᠣᠤ]-class vowels (masculine), no violation found;
#   10 — masculine, but the first syllable contains ᠤ and ᠣ occurs later;
#   2  — no [ᠠᠣᠤ]-class vowel (feminine branch), no violation found;
#   20 — feminine branch, but the first syllable contains ᠦ and ᠥ occurs later.
def mon_vowel_harmony
  word = dup.to_str
  masculine = word =~ /[ᠠᠣᠤ]/
  return 0 if masculine && word =~ /[ᠡᠥᠦ]/

  # First syllable; nil when the word yields no syllables at all.
  head = word.mon_syllable_classify[0]
  if masculine
    # Error if the first syllable holds the 5th vowel and the 4th follows later.
    clash = head =~ /ᠤ/ && word[head.size..-1] =~ /ᠣ/
    clash ? 10 : 1
  else
    # Error if the first syllable holds the 7th vowel and the 6th follows later.
    clash = head =~ /ᠦ/ && word[head.size..-1] =~ /ᠥ/
    clash ? 20 : 2
  end
end