Module: ZhongwenTools

Defined in:
lib/zhongwen_tools/numbers.rb,
lib/zhongwen_tools.rb,
lib/zhongwen_tools/regex.rb,
lib/zhongwen_tools/string.rb,
lib/zhongwen_tools/integer.rb,
lib/zhongwen_tools/version.rb,
lib/zhongwen_tools/conversion.rb,
lib/zhongwen_tools/string/caps.rb,
lib/zhongwen_tools/regex/ruby18.rb,
lib/zhongwen_tools/romanization.rb,
lib/zhongwen_tools/string/ruby18.rb,
lib/zhongwen_tools/string/fullwidth.rb,
lib/zhongwen_tools/conversion/string.rb,
lib/zhongwen_tools/romanization/detect.rb,
lib/zhongwen_tools/romanization/string.rb,
lib/zhongwen_tools/romanization/pyn_to_py.rb,
lib/zhongwen_tools/romanization/conversion_table.rb

Overview

NOTE: This table works for pyn -> pinyin conversion, but it introduces
mistakes when converting pinyin to pyn. In practice, pinyin can't be
converted to pyn correctly unless it's properly formatted.

Defined Under Namespace

Modules: Conversion, Integer, Numbers, Regex, Romanization, String

Constant Summary collapse

# Version string of the ZhongwenTools library.
VERSION =
'0.11.1'
# Maps capital letters to their lowercase forms: first the pinyin vowels that
# carry tone marks, then the fullwidth Latin alphabet.
#
# NOTE(review): the 26 fullwidth pairs were blank ("" => "") in this listing
# and have been reconstructed as Ａ-Ｚ => ａ-ｚ; confirm against
# lib/zhongwen_tools/string/caps.rb.
UNICODE_CAPS =
{
  'Ā' => 'ā',
  'Á' => 'á',
  'Ǎ' => 'ǎ',
  'À' => 'à',
  'Ē' => 'ē',
  'É' => 'é',
  'Ě' => 'ě',
  'È' => 'è',
  'Ī' => 'ī',
  'Í' => 'í',
  'Ǐ' => 'ǐ',
  'Ì' => 'ì',
  'Ō' => 'ō',
  'Ó' => 'ó',
  'Ǒ' => 'ǒ',
  'Ò' => 'ò',
  'Ǖ' => 'ǖ', # using combining diacritical marks
  'Ǘ' => 'ǘ', # using combining diacritical marks
  'Ǚ' => 'ǚ', # using combining diacritical marks
  'Ǜ' => 'ǜ', # using combining diacritical marks
  'Ū' => 'ū',
  'Ú' => 'ú',
  'Ǔ' => 'ǔ',
  'Ù' => 'ù',
  'Ａ' => 'ａ',
  'Ｂ' => 'ｂ',
  'Ｃ' => 'ｃ',
  'Ｄ' => 'ｄ',
  'Ｅ' => 'ｅ',
  'Ｆ' => 'ｆ',
  'Ｇ' => 'ｇ',
  'Ｈ' => 'ｈ',
  'Ｉ' => 'ｉ',
  'Ｊ' => 'ｊ',
  'Ｋ' => 'ｋ',
  'Ｌ' => 'ｌ',
  'Ｍ' => 'ｍ',
  'Ｎ' => 'ｎ',
  'Ｏ' => 'ｏ',
  'Ｐ' => 'ｐ',
  'Ｑ' => 'ｑ',
  'Ｒ' => 'ｒ',
  'Ｓ' => 'ｓ',
  'Ｔ' => 'ｔ',
  'Ｕ' => 'ｕ',
  'Ｖ' => 'ｖ',
  'Ｗ' => 'ｗ',
  'Ｘ' => 'ｘ',
  'Ｙ' => 'ｙ',
  'Ｚ' => 'ｚ'
}
# Maps each supported fullwidth character (digits, Latin letters and a set of
# punctuation marks) to its half-width ASCII counterpart. Every key is the
# value's code point shifted by 0xFEE0 into the Unicode fullwidth forms range.
FW_HW =
{
  '０' => '0',
  '１' => '1',
  '２' => '2',
  '３' => '3',
  '４' => '4',
  '５' => '5',
  '６' => '6',
  '７' => '7',
  '８' => '8',
  '９' => '9',
  'Ａ' => 'A',
  'Ｂ' => 'B',
  'Ｃ' => 'C',
  'Ｄ' => 'D',
  'Ｅ' => 'E',
  'Ｆ' => 'F',
  'Ｇ' => 'G',
  'Ｈ' => 'H',
  'Ｉ' => 'I',
  'Ｊ' => 'J',
  'Ｋ' => 'K',
  'Ｌ' => 'L',
  'Ｍ' => 'M',
  'Ｎ' => 'N',
  'Ｏ' => 'O',
  'Ｐ' => 'P',
  'Ｑ' => 'Q',
  'Ｒ' => 'R',
  'Ｓ' => 'S',
  'Ｔ' => 'T',
  'Ｕ' => 'U',
  'Ｖ' => 'V',
  'Ｗ' => 'W',
  'Ｘ' => 'X',
  'Ｙ' => 'Y',
  'Ｚ' => 'Z',
  'ａ' => 'a',
  'ｂ' => 'b',
  'ｃ' => 'c',
  'ｄ' => 'd',
  'ｅ' => 'e',
  'ｆ' => 'f',
  'ｇ' => 'g',
  'ｈ' => 'h',
  'ｉ' => 'i',
  'ｊ' => 'j',
  'ｋ' => 'k',
  'ｌ' => 'l',
  'ｍ' => 'm',
  'ｎ' => 'n',
  'ｏ' => 'o',
  'ｐ' => 'p',
  'ｑ' => 'q',
  'ｒ' => 'r',
  'ｓ' => 's',
  'ｔ' => 't',
  'ｕ' => 'u',
  'ｖ' => 'v',
  'ｗ' => 'w',
  'ｘ' => 'x',
  'ｙ' => 'y',
  'ｚ' => 'z',
  '％' => '%',
  '．' => '.',
  '：' => ':',
  '＃' => '#',
  '＄' => '$',
  '＆' => '&',
  '＋' => '+',
  '－' => '-',
  '／' => '/',
  '＼' => '\\',
  '＝' => '=',
  '；' => ';',
  '＜' => '<',
  '＞' => '>'
}

Instance Method Summary collapse

Instance Method Details

#convert_regex(regex) ⇒ Object



41
42
43
44
45
# File 'lib/zhongwen_tools/string/ruby18.rb', line 41

# Rebuilds a regex whose string form contains `\uXXXX` escapes, substituting
# each escape with whatever String#from_codepoint returns for `uXXXX`.
#
# Only escapes made of four characters in 0-9 / A-Z are picked up by the scan.
# NOTE(review): from_codepoint is defined elsewhere; presumably it returns the
# literal character for the code point — confirm.
#
# @param regex [Regexp] the pattern to rebuild
# @return [Regexp] a new regex compiled from the substituted string form
def convert_regex(regex)
  source = regex.to_s
  converted = source.scan(/u[0-9A-Z]{4}/).inject(source) do |pattern, codepoint|
    # One substitution per scanned escape, so repeated escapes are replaced in turn.
    pattern.sub('\\' + codepoint, codepoint.from_codepoint)
  end
  /#{converted}/
end

#has_zh?(str = nil) ⇒ Boolean



47
48
49
50
51
52
53
54
55
56
# File 'lib/zhongwen_tools/string/ruby18.rb', line 47

# Reports whether the text contains at least one match for the :zh or :punc
# pattern of UNICODE_REGEX, provided fullwidth? does not flag the text.
#
# @param str [String, nil] text to inspect; falls back to self when nil
# @return [Boolean] false when fullwidth?(text) is truthy, otherwise whether
#   either pattern matches somewhere in the text
def has_zh?(str = nil)
  text = str || self
  zh_pattern = convert_regex(UNICODE_REGEX[:zh])
  punc_pattern = convert_regex(UNICODE_REGEX[:punc])

  return false if fullwidth?(text)

  # True as soon as one of the two patterns finds a match.
  !(text[zh_pattern].nil? && text[punc_pattern].nil?)
end

#has_zh_punctuation?(str = nil) ⇒ Boolean



69
70
71
72
73
74
75
76
77
# File 'lib/zhongwen_tools/string/ruby18.rb', line 69

# Reports whether the text contains a match for the :punc pattern of
# UNICODE_REGEX.
#
# Only the :punc pattern is converted; the :zh pattern is never consulted
# here, so it is not built.
#
# @param str [String, nil] text to inspect; falls back to self when nil
# @return [Boolean] true when the punctuation pattern matches somewhere
def has_zh_punctuation?(str = nil)
  str ||= self

  !str[convert_regex(UNICODE_REGEX[:punc])].nil?
end

#strip_zh_punctuation(str = nil) ⇒ Object



79
80
81
82
83
# File 'lib/zhongwen_tools/string/ruby18.rb', line 79

# Returns a copy of the text with every match of the :punc pattern of
# UNICODE_REGEX removed.
#
# @param str [String, nil] text to clean; falls back to self when nil
# @return [String] the text without the matched punctuation
def strip_zh_punctuation(str = nil)
  punctuation = convert_regex(UNICODE_REGEX[:punc])
  (str || self).gsub(punctuation, '')
end

#to_halfwidth(str = nil) ⇒ Object



85
86
87
88
89
90
91
92
93
94
95
# File 'lib/zhongwen_tools/string/ruby18.rb', line 85

# Converts the fullwidth digits, Latin letters and punctuation listed in
# FW_HW to their half-width equivalents; all other characters pass through
# unchanged.
#
# @param str [String, nil] text to convert; falls back to self when nil
# @return [String] a new string with the fullwidth characters replaced
def to_halfwidth(str = nil)
  str ||= self

  # The class must list the FULLWIDTH forms (the keys of FW_HW). Matching
  # their ASCII look-alikes would find no replacement in the table.
  str.gsub(/[０-９Ａ-Ｚａ-ｚ％．：＃＄＆＋－／＼＝；＜＞]/u) do |char|
    FW_HW[char] || char
  end
end

#to_utf8(encoding = nil, encodings = nil) ⇒ Object

TODO: replace deprecated constant UNICODE_REGEX.



27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/zhongwen_tools/string/ruby18.rb', line 27

# Converts self to UTF-8 by trying candidate source encodings in order and
# returning the first conversion Iconv accepts.
#
# @param encoding [String, Array<String>, nil] encoding(s) to try before the
#   candidates in +encodings+
# @param encodings [Array<String>, nil] candidate encodings; defaults to
#   utf-8, GB18030, BIG5, GBK, GB2312
# @return [String] the text as converted by Iconv
# @raise [RuntimeError] 'Unable to Convert' once every candidate has failed
def to_utf8(encoding = nil, encodings = nil)
  # FIXME: should substitute out known bad actors like space
  encodings = ['utf-8', 'GB18030', 'BIG5', 'GBK', 'GB2312'] if encodings.nil?
  # Wrap before concatenating: a bare String cannot be added to an Array.
  encodings = [encoding].flatten + encodings unless encoding.nil?
  raise 'Unable to Convert' if encodings.size == 0

  begin
    Iconv.conv('utf-8', encodings[0], self)
  rescue
    # Best effort: any failure with this candidate moves on to the next one.
    to_utf8(nil, encodings[1..-1])
  end
end

#zh?(str = nil) ⇒ Boolean



58
59
60
61
62
63
64
65
66
67
# File 'lib/zhongwen_tools/string/ruby18.rb', line 58

# Reports whether the text consists solely of runs matching the :zh pattern,
# runs matching the :punc pattern (both from UNICODE_REGEX) and whitespace,
# provided the text's own fullwidth? check does not flag it.
#
# @param str [String, nil] text to inspect; falls back to self when nil
# @return [Boolean] false when text.fullwidth? is truthy, otherwise whether
#   the matched runs joined together reproduce the whole text
def zh?(str = nil)
  text = str || self
  zh_pattern = convert_regex(UNICODE_REGEX[:zh])
  punc_pattern = convert_regex(UNICODE_REGEX[:punc])

  return false if text.fullwidth?

  # Any character outside the three alternatives is dropped by the scan, so
  # the joined pieces equal the text only when nothing else is present.
  pieces = text.scan(/(#{zh_pattern}+|#{punc_pattern}+|\s+)/)
  pieces.join == text
end