Class: Unicoder::Builder::Name

Inherits:

Object

Object
Unicoder::Builder::Name

show all

Includes: Unicoder::Builder, ReplaceCommonWords

Defined in: lib/unicoder/builders/name.rb

Constant Summary collapse

JAMO_INITIAL =

JAMO_MEDIAL =

JAMO_FINAL =

JAMO_END =

CJK =

"CJK UNIFIED IDEOGRAPH-"

TANGUT =

"TANGUT IDEOGRAPH-"

REPLACE_COUNT =

REPLACE_BASE =

?[.ord

Instance Attribute Summary

Attributes included from Unicoder::Builder

#formats, #index, #option

Instance Method Summary collapse

Methods included from ReplaceCommonWords

#replace_common_words!

Methods included from Unicoder::Builder

#assign, #assign_codepoint, build, #export, #initialize, #meta, #parse_file

Instance Method Details

#initialize_index ⇒ `Object`

# File 'lib/unicoder/builders/name.rb', line 19

# Resets the builder state: installs a fresh @index skeleton and clears the
# bookkeeping variables (@words, @range_start) that parse! relies on.
#
# @return [nil]
def initialize_index
  # Maps a character-name prefix to a list of [first, last] codepoint pairs.
  # The CJK and TANGUT lists start empty and are filled while parsing; the
  # remaining ranges are fixed here.
  codepoint_ranges = {
    CJK => [],
    TANGUT => [],
    "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
    "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
    "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
    "CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
  }

  # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
  # FINAL is seeded with one empty string
  # (presumably the "no final consonant" slot - TODO confirm).
  jamo_short_names = { INITIAL: [], MEDIAL: [], FINAL: [""] }

  @index = {}
  @index[:NAMES] = {}
  @index[:ALIASES] = {}
  # A :HANGUL list of ranges is intentionally not part of the index right now.
  @index[:CP_RANGES] = codepoint_ranges
  @index[:JAMO] = jamo_short_names

  @words = []
  @range_start = nil
end

#parse! ⇒ `Object`

# File 'lib/unicoder/builders/name.rb', line 43

# Reads the :unicode_data, :name_aliases and :jamo inputs (via parse_file) and
# fills the corresponding sections of @index:
#
# * :NAMES     - one entry per named codepoint (stored through `assign`);
#                afterwards `replace_common_words!` is run over the names
#                with every word collected in @words
# * :CP_RANGES - [first, last] pairs for the CJK and Tangut "<..., First>" /
#                "<..., Last>" range markers
# * :ALIASES   - key => { type (Symbol) => [alias, ...] }
# * :JAMO      - short names, split into :INITIAL / :MEDIAL / :FINAL by codepoint
#
# @raise [ArgumentError] if a "<...>" entry is neither a First/Last range
#   marker with a matching start nor "<control>"
def parse!
  # Alias keys are the character itself when the option mentions "charkeys",
  # otherwise the plain integer codepoint.
  get_key =
    if option =~ /charkeys/
      ->(codepoint) { [codepoint].pack("U*") }
    else
      ->(codepoint) { codepoint }
    end

  # Names starting with one of these prefixes are derivable from the codepoint
  # and therefore not stored. The prefix list depends only on the CP_RANGES
  # keys, which do not change while parsing, so it is fetched once instead of
  # rebuilding a regexp for every input line.
  range_prefixes = @index[:CP_RANGES].keys

  parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<name>.+?);.*$/ do |line|
    name = line["name"]
    codepoint = line["codepoint"].to_i(16)

    if name[0] == "<" && name[-1] == ">"
      if name =~ /First/
        # remember where the range begins until the matching "Last" line shows up
        @range_start = codepoint
      elsif name =~ /Last/ && @range_start
        case name
        when /Hangul/
          # currently not necessary
          # @index[:HANGUL] << [@range_start, codepoint]
        when /CJK/
          @index[:CP_RANGES][CJK] << [@range_start, codepoint]
        when /Tangut/
          @index[:CP_RANGES][TANGUT] << [@range_start, codepoint]
        else
          # no name
          warn "ignoring range: #{name}"
        end
        @range_start = nil
      elsif name != "<control>"
        raise ArgumentError, "inconsistent range found in data, don't know what to do"
      end
    elsif name.start_with?(*range_prefixes)
      # ignore
    else
      assign :NAMES, codepoint, name
      # append in place; `+=` would copy the whole (growing) word list per line
      @words.concat(name.split)
    end
  end

  replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE

  parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
    key = get_key[line["codepoint"].to_i(16)]
    aliases_by_type = (@index[:ALIASES][key] ||= {})
    (aliases_by_type[line["type"].to_sym] ||= []) << line["alias"]
  end

  parse_file :jamo, :line, regex: /^(?<codepoint>.+?); (?<short_name>.*?) +#.*$/ do |line|
    case line["codepoint"].to_i(16)
    when JAMO_INITIAL...JAMO_MEDIAL
      @index[:JAMO][:INITIAL] << line["short_name"]
    when JAMO_MEDIAL...JAMO_FINAL
      @index[:JAMO][:MEDIAL] << line["short_name"]
    when JAMO_FINAL..JAMO_END
      @index[:JAMO][:FINAL] << line["short_name"]
    end
  end
end

Class: Unicoder::Builder::Name

Constant Summary collapse

Instance Attribute Summary

Attributes included from Unicoder::Builder

Instance Method Summary collapse

Methods included from ReplaceCommonWords

Methods included from Unicoder::Builder

Instance Method Details

#initialize_index ⇒ Object

#parse! ⇒ Object

#initialize_index ⇒ `Object`

#parse! ⇒ `Object`