Module: JDict::Unicode

Defined in:
lib/unicode.rb

Defined Under Namespace

Modules: CodepointRanges

Instance Method Summary collapse

Instance Method Details

#english?(unicode_string) ⇒ Boolean

Returns:

  • (Boolean)


58
59
60
61
# File 'lib/unicode.rb', line 58

# Report whether +unicode_string+ is classified as English text.
#
# The classification itself is delegated to #script_type?; this predicate
# only checks whether that result is :english.
#
# @param unicode_string [String] the text to classify
# @return [Boolean] true only when #script_type? yields :english
def english?(unicode_string)
  script_type?(unicode_string) == :english
end

#hex_codepoint(unicode_char) ⇒ Object

Get the Unicode codepoint (as an Integer) of the first character of a string



17
18
19
# File 'lib/unicode.rb', line 17

# Look up the Unicode codepoint of the first character of +unicode_char+.
#
# The string is decoded with String#unpack using the "U0U*" template, which
# yields one Integer codepoint per character; only the first is returned.
# Despite the method name, the result is an Integer, not a hex string.
#
# @param unicode_char [String] a string whose first character is inspected
# @return [Integer, nil] the codepoint, or nil when the string is empty
def hex_codepoint(unicode_char)
  codepoints = unicode_char.unpack("U0U*")
  codepoints.first
end

#japanese?(unicode_string) ⇒ Boolean

Returns:

  • (Boolean)


54
55
56
57
# File 'lib/unicode.rb', line 54

# Report whether +unicode_string+ is classified as Japanese text.
#
# The classification itself is delegated to #script_type?; both the :kanji
# and :kana results count as Japanese.
#
# @param unicode_string [String] the text to classify
# @return [Boolean] true when #script_type? yields :kanji or :kana
def japanese?(unicode_string)
  [:kanji, :kana].include?(script_type?(unicode_string))
end

#script_type?(unicode_string) ⇒ Symbol

TODO: write unit tests with a variety of strings to ensure this method returns the expected output.

Determine the script of the specified string:

:kanji
:kana
:english

Returns:

  • (Symbol) — one of :kanji, :kana, or :english (an empty String for empty input)


27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/unicode.rb', line 27

# Determine the script of +unicode_string+.
#
# Each character's codepoint (via #hex_codepoint) is checked against the
# CodepointRanges tables, and the result follows a fixed precedence:
#
# 1. :kana    - any character falls in the hiragana, katakana, katakana
#               phonetic, halfwidth katakana or punctuation ranges
#               (scanning stops at the first such character)
# 2. :kanji   - no kana, but at least one character falls in a unified CJK
#               range
# 3. :english - every character is outside all of the ranges above
#
# The result does not depend on character order: once a kanji has been seen,
# later non-CJK characters (e.g. trailing ASCII or whitespace) do not
# downgrade the result to :english.
#
# @param unicode_string [String] the text to classify
# @return [Symbol, String] :kana, :kanji or :english; an empty String ('')
#   when +unicode_string+ has no characters
def script_type?(unicode_string)
  kana_ranges = [
    CodepointRanges::HIRAGANA,
    CodepointRanges::KATAKANA,
    CodepointRanges::KATAKANA_PHONETIC,
    CodepointRanges::HALFWIDTH_KATAKANA,
    CodepointRanges::PUNCTUATION
  ]
  kanji_ranges = [
    CodepointRanges::UNIFIED_CJK,
    CodepointRanges::UNIFIED_CJK_EXT_A,
    CodepointRanges::UNIFIED_CJK_EXT_B
  ]

  # Stays '' only when the string is empty (kept for backward compatibility).
  type = ''

  unicode_string.each_char do |char|
    code = hex_codepoint(char)

    # Kana wins outright, so there is no need to look at the rest.
    return :kana if kana_ranges.any? { |range| range.include?(code) }

    if kanji_ranges.any? { |range| range.include?(code) }
      type = :kanji
    elsif type != :kanji
      # Only fall back to :english while no kanji has been seen.
      type = :english
    end
  end

  type
end