Module: ZhongwenTools

Defined in:: lib/zhongwen_tools/numbers.rb,
lib/zhongwen_tools.rb,
lib/zhongwen_tools/regex.rb,
lib/zhongwen_tools/string.rb,
lib/zhongwen_tools/integer.rb,
lib/zhongwen_tools/version.rb,
lib/zhongwen_tools/conversion.rb,
lib/zhongwen_tools/string/caps.rb,
lib/zhongwen_tools/regex/ruby18.rb,
lib/zhongwen_tools/romanization.rb,
lib/zhongwen_tools/string/ruby18.rb,
lib/zhongwen_tools/string/fullwidth.rb,
lib/zhongwen_tools/conversion/string.rb,
lib/zhongwen_tools/romanization/detect.rb,
lib/zhongwen_tools/romanization/string.rb,
lib/zhongwen_tools/romanization/pyn_to_py.rb,
lib/zhongwen_tools/romanization/conversion_table.rb

Overview

NOTE: This table works for pyn -> pinyin conversion, but it introduces
mistakes when converting pinyin to pyn. In practice, pinyin can't
be converted to pyn properly unless it's properly formatted.

Defined Under Namespace

Modules: Conversion, Integer, Numbers, Regex, Romanization, String

Constant Summary collapse

# Version string of ZhongwenTools.
VERSION =

'0.11.1'

# Lookup table from capital letters to their lowercase forms. It covers
# the tone-marked Latin vowels (A, E, I, O, Ü and U, each with four
# diacritic variants) followed by the fullwidth Latin capitals Ａ-Ｚ.
UNICODE_CAPS =

{
  'Ā' => 'ā',
  'Á' => 'á',
  'Ǎ' => 'ǎ',
  'À' => 'à',
  'Ē' => 'ē',
  'É' => 'é',
  'Ě' => 'ě',
  'È' => 'è',
  'Ī' => 'ī',
  'Í' => 'í',
  'Ǐ' => 'ǐ',
  'Ì' => 'ì',
  'Ō' => 'ō',
  'Ó' => 'ó',
  'Ǒ' => 'ǒ',
  'Ò' => 'ò',
  'Ǖ' => 'ǖ', # using combining diacritical marks
  'Ǘ' => 'ǘ', # using combining diacritical marks
  'Ǚ' => 'ǚ', # using combining diacritical marks
  'Ǜ' => 'ǜ', # using combining diacritical marks
  'Ū' => 'ū',
  'Ú' => 'ú',
  'Ǔ' => 'ǔ',
  'Ù' => 'ù',
  "Ａ" => "ａ",
  "Ｂ" => "ｂ",
  "Ｃ" => "ｃ",
  "Ｄ" => "ｄ",
  "Ｅ" => "ｅ",
  "Ｆ" => "ｆ",
  "Ｇ" => "ｇ",
  "Ｈ" => "ｈ",
  "Ｉ" => "ｉ",
  "Ｊ" => "ｊ",
  "Ｋ" => "ｋ",
  "Ｌ" => "ｌ",
  "Ｍ" => "ｍ",
  "Ｎ" => "ｎ",
  "Ｏ" => "ｏ",
  "Ｐ" => "ｐ",
  "Ｑ" => "ｑ",
  "Ｒ" => "ｒ",
  "Ｓ" => "ｓ",
  "Ｔ" => "ｔ",
  "Ｕ" => "ｕ",
  "Ｖ" => "ｖ",
  "Ｗ" => "ｗ",
  "Ｘ" => "ｘ",
  "Ｙ" => "ｙ",
  "Ｚ" => "ｚ"
}

# Lookup table from fullwidth characters to their halfwidth (ASCII)
# equivalents: the digits ０-９, the Latin letters Ａ-Ｚ and ａ-ｚ, and a
# small set of punctuation marks. #to_halfwidth reads its replacements
# from this table.
FW_HW =

{
  "０" => "0",
  "１" => "1",
  "２" => "2",
  "３" => "3",
  "４" => "4",
  "５" => "5",
  "６" => "6",
  "７" => "7",
  "８" => "8",
  "９" => "9",
  "Ａ" => "A",
  "Ｂ" => "B",
  "Ｃ" => "C",
  "Ｄ" => "D",
  "Ｅ" => "E",
  "Ｆ" => "F",
  "Ｇ" => "G",
  "Ｈ" => "H",
  "Ｉ" => "I",
  "Ｊ" => "J",
  "Ｋ" => "K",
  "Ｌ" => "L",
  "Ｍ" => "M",
  "Ｎ" => "N",
  "Ｏ" => "O",
  "Ｐ" => "P",
  "Ｑ" => "Q",
  "Ｒ" => "R",
  "Ｓ" => "S",
  "Ｔ" => "T",
  "Ｕ" => "U",
  "Ｖ" => "V",
  "Ｗ" => "W",
  "Ｘ" => "X",
  "Ｙ" => "Y",
  "Ｚ" => "Z",
  "ａ" => "a",
  "ｂ" => "b",
  "ｃ" => "c",
  "ｄ" => "d",
  "ｅ" => "e",
  "ｆ" => "f",
  "ｇ" => "g",
  "ｈ" => "h",
  "ｉ" => "i",
  "ｊ" => "j",
  "ｋ" => "k",
  "ｌ" => "l",
  "ｍ" => "m",
  "ｎ" => "n",
  "ｏ" => "o",
  "ｐ" => "p",
  "ｑ" => "q",
  "ｒ" => "r",
  "ｓ" => "s",
  "ｔ" => "t",
  "ｕ" => "u",
  "ｖ" => "v",
  "ｗ" => "w",
  "ｘ" => "x",
  "ｙ" => "y",
  "ｚ" => "z",
  "％" => '%',
  "．" => '.',
  '：' => ':',
  "＃" => '#',
  "＄" => "$",
  "＆" => "&",
  "＋" => "+",
  "－" => "-",
  "／" => "/",
  "＼" => '\\',
  '＝' => '=',
  "；" => ";",
  "＜" => "<",
  "＞" => ">"
}

Instance Method Summary collapse

#convert_regex(regex) ⇒ Object
#has_zh?(str = nil) ⇒ Boolean
#has_zh_punctuation?(str = nil) ⇒ Boolean
#strip_zh_punctuation(str = nil) ⇒ Object
#to_halfwidth(str = nil) ⇒ Object
#to_utf8(encoding = nil, encodings = nil) ⇒ Object

TODO: replace deprecated constant UNICODE_REGEX.
#zh?(str = nil) ⇒ Boolean

Instance Method Details

#convert_regex(regex) ⇒ `Object`

# File 'lib/zhongwen_tools/string/ruby18.rb', line 41

# Rebuilds +regex+ with its escaped codepoints expanded.
#
# The string form of +regex+ is scanned for "uXXXX" tokens (a "u"
# followed by four digits or uppercase letters). For each token, the
# first remaining "\uXXXX" occurrence is replaced by whatever
# +from_codepoint+ returns for that token.
#
# regex - the pattern to convert; only its +to_s+ form is read.
#
# Returns a new Regexp compiled from the rewritten source.
def convert_regex(regex)
  source = regex.to_s

  expanded = source.scan(/u[0-9A-Z]{4}/).inject(source) do |pattern, codepoint|
    # One substitution per scanned token, so repeated tokens are
    # consumed one occurrence at a time.
    pattern.sub('\\' + codepoint, codepoint.from_codepoint)
  end

  /#{expanded}/
end

#has_zh?(str = nil) ⇒ `Boolean`

# File 'lib/zhongwen_tools/string/ruby18.rb', line 47

# Reports whether +str+ contains anything matched by the :zh or :punc
# patterns of UNICODE_REGEX (after they pass through #convert_regex).
# Text for which +fullwidth?+ answers true never qualifies.
#
# str - the text to inspect; defaults to the receiver.
#
# Returns true or false.
def has_zh?(str = nil)
  str ||= self

  zh_pattern = self.convert_regex(UNICODE_REGEX[:zh])
  punc_pattern = self.convert_regex(UNICODE_REGEX[:punc])

  return false if self.fullwidth?(str)

  # True as soon as either pattern finds a match somewhere in str.
  !(str[zh_pattern].nil? && str[punc_pattern].nil?)
end

#has_zh_punctuation?(str = nil) ⇒ `Boolean`

# File 'lib/zhongwen_tools/string/ruby18.rb', line 69

# Reports whether +str+ contains anything matched by the :punc pattern
# of UNICODE_REGEX (after it passes through #convert_regex).
#
# Only the punctuation pattern is converted; the :zh pattern is not
# needed for this check, so it is no longer built on every call.
#
# str - the text to inspect; defaults to the receiver.
#
# Returns true or false.
def has_zh_punctuation?(str = nil)
  str ||= self
  punctuation = self.convert_regex(UNICODE_REGEX[:punc])

  !str[punctuation].nil?
end

#strip_zh_punctuation(str = nil) ⇒ `Object`

# File 'lib/zhongwen_tools/string/ruby18.rb', line 79

# Returns a copy of +str+ with every match of the :punc pattern of
# UNICODE_REGEX (after it passes through #convert_regex) removed.
#
# str - the text to clean; defaults to the receiver.
def strip_zh_punctuation(str = nil)
  text = str || self
  punctuation = self.convert_regex(UNICODE_REGEX[:punc])

  text.gsub(punctuation, '')
end

#to_halfwidth(str = nil) ⇒ `Object`

# File 'lib/zhongwen_tools/string/ruby18.rb', line 85

# Converts the fullwidth digits, Latin letters and punctuation found in
# +str+ to the replacements listed in FW_HW.
#
# str - the text to convert; defaults to the receiver.
#
# Returns the converted string. When +str+ holds no convertible
# character, +str+ itself is returned.
def to_halfwidth(str = nil)
  str ||= self
  fullwidth_chars = str.scan(/([０-９Ａ-Ｚａ-ｚ％．：＃＄＆＋－／＼＝；＜＞])/u).flatten.uniq

  # Replace each distinct fullwidth character across the whole string.
  fullwidth_chars.inject(str) do |converted, char|
    converted.gsub(char, FW_HW[char])
  end
end

#to_utf8(encoding = nil, encodings = nil) ⇒ `Object`

TODO: replace deprecated constant UNICODE_REGEX.

# File 'lib/zhongwen_tools/string/ruby18.rb', line 27

# Converts the receiver to UTF-8 by trying a list of candidate source
# encodings in order and returning the first conversion Iconv accepts.
#
# encoding  - an encoding name (or an array of names) to try before
#             the others; optional.
# encodings - the fallback encoding names; defaults to
#             utf-8, GB18030, BIG5, GBK and GB2312.
#
# Returns whatever Iconv.conv returns for the first encoding that works.
# Raises RuntimeError ('Unable to Convert') when no candidate succeeds
# or the candidate list is empty.
def to_utf8(encoding = nil, encodings = nil)
  # FIXME: should substitute out known bad actors like space
  encodings = ['utf-8', 'GB18030', 'BIG5', 'GBK', 'GB2312'] if encodings.nil?
  # Wrap a single name in an array before prepending it; String + Array
  # raises TypeError. An array of names is still accepted.
  encodings = [encoding].flatten + encodings unless encoding.nil?

  encodings.each do |candidate|
    begin
      return Iconv.conv('utf-8', candidate, self)
    rescue
      # This candidate failed; fall through to the next one.
      next
    end
  end

  raise 'Unable to Convert'
end

#zh?(str = nil) ⇒ `Boolean`

# File 'lib/zhongwen_tools/string/ruby18.rb', line 58

# Reports whether +str+ consists entirely of runs matched by the :zh
# pattern, the :punc pattern (both from UNICODE_REGEX, after passing
# through #convert_regex) or whitespace. Text that answers true to
# +fullwidth?+ never qualifies.
#
# str - the text to inspect; defaults to the receiver.
#
# Returns true or false.
def zh?(str = nil)
  str ||= self

  zh_pattern = self.convert_regex(UNICODE_REGEX[:zh])
  punc_pattern = self.convert_regex(UNICODE_REGEX[:punc])

  return false if str.fullwidth?

  # If the matched runs glued back together equal the input, nothing
  # outside the three allowed classes was present.
  allowed_runs = str.scan(/(#{zh_pattern}+|#{punc_pattern}+|\s+)/)
  allowed_runs.join == str
end

Module: ZhongwenTools

Overview

Defined Under Namespace

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#convert_regex(regex) ⇒ Object

#has_zh?(str = nil) ⇒ Boolean

#has_zh_punctuation?(str = nil) ⇒ Boolean

#strip_zh_punctuation(str = nil) ⇒ Object

#to_halfwidth(str = nil) ⇒ Object

#to_utf8(encoding = nil, encodings = nil) ⇒ Object

#zh?(str = nil) ⇒ Boolean

#convert_regex(regex) ⇒ `Object`

#has_zh?(str = nil) ⇒ `Boolean`

#has_zh_punctuation?(str = nil) ⇒ `Boolean`

#strip_zh_punctuation(str = nil) ⇒ `Object`

#to_halfwidth(str = nil) ⇒ `Object`

#to_utf8(encoding = nil, encodings = nil) ⇒ `Object`

#zh?(str = nil) ⇒ `Boolean`