Module: ZhongwenTools
- Defined in:
- lib/zhongwen_tools/numbers.rb,
lib/zhongwen_tools.rb,
lib/zhongwen_tools/regex.rb,
lib/zhongwen_tools/string.rb,
lib/zhongwen_tools/integer.rb,
lib/zhongwen_tools/version.rb,
lib/zhongwen_tools/conversion.rb,
lib/zhongwen_tools/string/caps.rb,
lib/zhongwen_tools/regex/ruby18.rb,
lib/zhongwen_tools/romanization.rb,
lib/zhongwen_tools/string/ruby18.rb,
lib/zhongwen_tools/string/fullwidth.rb,
lib/zhongwen_tools/conversion/string.rb,
lib/zhongwen_tools/romanization/detect.rb,
lib/zhongwen_tools/romanization/string.rb,
lib/zhongwen_tools/romanization/pyn_to_py.rb,
lib/zhongwen_tools/romanization/conversion_table.rb
Overview
NOTE: This table works for pyn -> pinyin conversion, but it introduces
mistakes when converting pinyin to pyn. In practice, pinyin can't
be converted to pyn properly unless it's properly formatted.
Defined Under Namespace
Modules: Conversion, Integer, Numbers, Regex, Romanization, String
Constant Summary collapse
- VERSION =
'0.11.1'
- UNICODE_CAPS =
{ 'Ā' => 'ā', 'Á' => 'á', 'Ǎ' => 'ǎ', 'À' => 'à', 'Ē' => 'ē', 'É' => 'é', 'Ě' => 'ě', 'È' => 'è', 'Ī' => 'ī', 'Í' => 'í', 'Ǐ' => 'ǐ', 'Ì' => 'ì', 'Ō' => 'ō', 'Ó' => 'ó', 'Ǒ' => 'ǒ', 'Ò' => 'ò', 'Ǖ' => 'ǖ', # using combining diacritical marks 'Ǘ' => 'ǘ', # using combining diacritical marks 'Ǚ' => 'ǚ', # using combining diacritical marks 'Ǜ' => 'ǜ', # using combining diacritical marks 'Ū' => 'ū', 'Ú' => 'ú', 'Ǔ' => 'ǔ', 'Ù' => 'ù', "A" => "a", "B" => "b", "C" => "c", "D" => "d", "E" => "e", "F" => "f", "G" => "g", "H" => "h", "I" => "i", "J" => "j", "K" => "k", "L" => "l", "M" => "m", "N" => "n", "O" => "o", "P" => "p", "Q" => "q", "R" => "r", "S" => "s", "T" => "t", "U" => "u", "V" => "v", "W" => "w", "X" => "x", "Y" => "y", "Z" => "z" }
- FW_HW =
{ "０" => "0", "１" => "1", "２" => "2", "３" => "3", "４" => "4", "５" => "5", "６" => "6", "７" => "7", "８" => "8", "９" => "9", "Ａ" => "A", "Ｂ" => "B", "Ｃ" => "C", "Ｄ" => "D", "Ｅ" => "E", "Ｆ" => "F", "Ｇ" => "G", "Ｈ" => "H", "Ｉ" => "I", "Ｊ" => "J", "Ｋ" => "K", "Ｌ" => "L", "Ｍ" => "M", "Ｎ" => "N", "Ｏ" => "O", "Ｐ" => "P", "Ｑ" => "Q", "Ｒ" => "R", "Ｓ" => "S", "Ｔ" => "T", "Ｕ" => "U", "Ｖ" => "V", "Ｗ" => "W", "Ｘ" => "X", "Ｙ" => "Y", "Ｚ" => "Z", "ａ" => "a", "ｂ" => "b", "ｃ" => "c", "ｄ" => "d", "ｅ" => "e", "ｆ" => "f", "ｇ" => "g", "ｈ" => "h", "ｉ" => "i", "ｊ" => "j", "ｋ" => "k", "ｌ" => "l", "ｍ" => "m", "ｎ" => "n", "ｏ" => "o", "ｐ" => "p", "ｑ" => "q", "ｒ" => "r", "ｓ" => "s", "ｔ" => "t", "ｕ" => "u", "ｖ" => "v", "ｗ" => "w", "ｘ" => "x", "ｙ" => "y", "ｚ" => "z", "％" => '%', "．" => '.', '：' => ':', "＃" => '#', "＄" => "$", "＆" => "&", "＋" => "+", "－" => "-", "／" => "/", "＼" => '\\', '＝' => '=', "；" => ";", "＜" => "<", "＞" => ">" }
Instance Method Summary collapse
- #convert_regex(regex) ⇒ Object
- #has_zh?(str = nil) ⇒ Boolean
- #has_zh_punctuation?(str = nil) ⇒ Boolean
- #strip_zh_punctuation(str = nil) ⇒ Object
- #to_halfwidth(str = nil) ⇒ Object
-
#to_utf8(encoding = nil, encodings = nil) ⇒ Object
TODO: replace deprecated constant UNICODE_REGEX.
- #zh?(str = nil) ⇒ Boolean
Instance Method Details
#convert_regex(regex) ⇒ Object
41 42 43 44 45 |
# File 'lib/zhongwen_tools/string/ruby18.rb', line 41
#
# Rebuilds +regex+ with every "\uXXXX" escape in its source text replaced by
# whatever String#from_codepoint returns for that token.
#
# Only escapes written as "u" followed by four characters from 0-9 / A-Z are
# recognised; lower-case hex digits are left untouched, as before.
#
# @param regex [Regexp, #to_s] pattern whose textual form may contain
#   "\uXXXX" escapes
# @return [Regexp] a new pattern compiled from the substituted text
def convert_regex(regex)
  # A single gsub pass replaces the earlier scan + one-sub-per-token loop.
  # The block form also guarantees the replacement is inserted verbatim
  # rather than being interpreted as a substitution template.
  source = regex.to_s.gsub(/\\u[0-9A-Z]{4}/) do |escape|
    # from_codepoint is defined elsewhere in the project; as before, it
    # receives the token without its leading backslash (e.g. "u4E00").
    escape[1..-1].from_codepoint
  end

  /#{source}/
end
#has_zh?(str = nil) ⇒ Boolean
47 48 49 50 51 52 53 54 55 56 |
# File 'lib/zhongwen_tools/string/ruby18.rb', line 47
#
# Reports whether +str+ (the receiver when omitted) contains at least one
# match for the converted UNICODE_REGEX[:zh] or UNICODE_REGEX[:punc] pattern,
# provided fullwidth?(str) is false.
#
# @param str [String, nil] text to inspect; defaults to +self+
# @return [Boolean]
def has_zh?(str = nil)
  str ||= self
  zh_pattern   = self.convert_regex(UNICODE_REGEX[:zh])
  punc_pattern = self.convert_regex(UNICODE_REGEX[:punc])

  # Alternative check kept from the original listing:
  # str.scan(/#{regex[:zh]}|#{regex[:punc]}|\s/).join == str
  return false if self.fullwidth?(str)

  # True as soon as either pattern finds something in the string.
  !(str[zh_pattern].nil? && str[punc_pattern].nil?)
end
#has_zh_punctuation?(str = nil) ⇒ Boolean
69 70 71 72 73 74 75 76 77 |
# File 'lib/zhongwen_tools/string/ruby18.rb', line 69
#
# Reports whether +str+ (the receiver when omitted) contains at least one
# match for the converted UNICODE_REGEX[:punc] pattern.
#
# @param str [String, nil] text to inspect; defaults to +self+
# @return [Boolean]
def has_zh_punctuation?(str = nil)
  str ||= self
  # Only the punctuation pattern is consulted, so the :zh pattern is no
  # longer converted on every call.
  punctuation = convert_regex(UNICODE_REGEX[:punc])

  # String#[] with a Regexp returns nil when nothing matches.
  !str[punctuation].nil?
end
#strip_zh_punctuation(str = nil) ⇒ Object
79 80 81 82 83 |
# File 'lib/zhongwen_tools/string/ruby18.rb', line 79
#
# Returns a copy of +str+ (the receiver when omitted) with every match of the
# converted UNICODE_REGEX[:punc] pattern removed.
#
# @param str [String, nil] text to clean; defaults to +self+
# @return [String] result of the gsub call
def strip_zh_punctuation(str = nil)
  target = str || self
  punctuation = self.convert_regex(UNICODE_REGEX[:punc])
  target.gsub(punctuation, '')
end
#to_halfwidth(str = nil) ⇒ Object
85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/zhongwen_tools/string/ruby18.rb', line 85
#
# Returns a copy of +str+ (the receiver when omitted) in which each fullwidth
# digit, Latin letter, or listed fullwidth symbol is replaced by its FW_HW
# entry.
#
# @param str [String, nil] text to convert; defaults to +self+
# @return [String] the converted copy
def to_halfwidth(str = nil)
  str ||= self

  # The character class must list the FULLWIDTH forms. An ASCII rendering of
  # it ("#$&", unescaped "/") is not the intended pattern.
  # A single gsub with a block replaces the former scan/uniq/gsub-per-match
  # loop. Characters without an FW_HW entry are left unchanged instead of
  # raising on a nil replacement.
  str.gsub(/[０-９Ａ-Ｚａ-ｚ％．：＃＄＆＋－／＼＝；＜＞]/u) do |char|
    FW_HW.fetch(char, char)
  end
end
#to_utf8(encoding = nil, encodings = nil) ⇒ Object
TODO: replace deprecated constant UNICODE_REGEX.
27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/zhongwen_tools/string/ruby18.rb', line 27
#
# Converts the receiver to UTF-8 by trying candidate source encodings in
# order with Iconv.conv and returning the first conversion that does not
# raise.
#
# @param encoding [String, Array<String>, nil] encoding name(s) to try before
#   the others
# @param encodings [Array<String>, nil] remaining candidates; when nil the
#   list 'utf-8', 'GB18030', 'BIG5', 'GBK', 'GB2312' is used
# @return [String] the result of the first successful Iconv.conv call
# @raise [RuntimeError] 'Unable to Convert' when every candidate fails or the
#   candidate list is empty
def to_utf8(encoding = nil, encodings = nil)
  # FIXME: should substitute out known bad actors like space
  candidates = encodings.nil? ? ['utf-8', 'GB18030', 'BIG5', 'GBK', 'GB2312'] : encodings

  # Wrap and flatten so that a single encoding name (String) works as well as
  # an Array; String + Array would raise TypeError.
  candidates = [encoding].flatten + candidates unless encoding.nil?

  candidates.each do |candidate|
    begin
      return Iconv.conv('utf-8', candidate, self)
    rescue StandardError
      # This candidate failed; fall through to the next one.
      next
    end
  end

  raise 'Unable to Convert'
end
#zh?(str = nil) ⇒ Boolean
58 59 60 61 62 63 64 65 66 67 |
# File 'lib/zhongwen_tools/string/ruby18.rb', line 58
#
# Reports whether +str+ (the receiver when omitted) consists entirely of runs
# matching the converted UNICODE_REGEX[:zh] pattern, the converted
# UNICODE_REGEX[:punc] pattern, or whitespace, provided str.fullwidth? is
# false.
#
# @param str [String, nil] text to inspect; defaults to +self+
# @return [Boolean]
def zh?(str = nil)
  str ||= self
  zh_pattern   = self.convert_regex(UNICODE_REGEX[:zh])
  punc_pattern = self.convert_regex(UNICODE_REGEX[:punc])

  return false if str.fullwidth?

  # If joining every matched run reproduces the input, nothing else was in it.
  matched_runs = str.scan(/(#{zh_pattern}+|#{punc_pattern}+|\s+)/)
  matched_runs.join == str
end