Module: UnicodeNamecode::DataLoader

Defined in:
lib/unicode_namecode/data_loader.rb

Overview

Handles data loading, caching, and parallel parsing

Constant Summary collapse

DATA_PATH =
File.expand_path('../../../data/UnicodeData.txt', __FILE__)
CACHE_PATH =
File.expand_path('../../../data/unicode_trie.cache', __FILE__)

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.all_names ⇒ Object (readonly)

Returns the value of attribute all_names.



19
20
21
# File 'lib/unicode_namecode/data_loader.rb', line 19

# Reader for the @all_names instance variable.
#
# load_data assigns it the result of collect_all_names; this returns
# whatever is currently stored (nil if nothing has assigned it yet).
def all_names
  @all_names
end

.codepoint_to_name ⇒ Object (readonly)

Returns the value of attribute codepoint_to_name.



19
20
21
# File 'lib/unicode_namecode/data_loader.rb', line 19

# Reader for the @codepoint_to_name instance variable.
#
# load_data fills it as a Hash keyed by integer codepoint with the
# upcased name as value; this returns whatever is currently stored
# (nil if nothing has assigned it yet).
def codepoint_to_name
  @codepoint_to_name
end

.fuzzy ⇒ Object (readonly)

Returns the value of attribute fuzzy.



19
20
21
# File 'lib/unicode_namecode/data_loader.rb', line 19

# Reader for the @fuzzy instance variable.
#
# load_data assigns it FuzzyMatch.new(@all_names); this returns whatever
# is currently stored (nil if nothing has assigned it yet).
def fuzzy
  @fuzzy
end

.trie ⇒ Object (readonly)

Returns the value of attribute trie.



19
20
21
# File 'lib/unicode_namecode/data_loader.rb', line 19

# Reader for the @trie instance variable.
#
# load_data assigns it either a Trie unmarshalled from CACHE_PATH or a
# freshly built Trie.new; this returns whatever is currently stored
# (nil if nothing has assigned it yet).
def trie
  @trie
end

Class Method Details

.collect_all_names ⇒ Object

Collect all Unicode names from the Trie for fuzzy matching



72
73
74
75
76
# File 'lib/unicode_namecode/data_loader.rb', line 72

# Gathers every complete name stored in @trie (used as the fuzzy-matching
# corpus by load_data).
#
# The walk starts at the Trie's root node, which is read directly from the
# Trie's @root instance variable, and is delegated to
# collect_names_recursive.
#
# @return [Array<String>] the names found, in traversal order
def collect_all_names
  root_node = @trie.instance_variable_get(:@root)
  [].tap { |found| collect_names_recursive(root_node, "", found) }
end

.collect_codepoint_to_name(node, current) ⇒ Object

Build the reverse lookup map: codepoint -> Unicode name



87
88
89
90
91
92
93
94
# File 'lib/unicode_namecode/data_loader.rb', line 87

# Fills @codepoint_to_name (codepoint => upcased name) by walking the Trie
# depth-first from +node+.
#
# @param node [Object] Trie node responding to is_end, codepoint and children
# @param current [String] the characters accumulated on the path to +node+
# @return [Object] the result of iterating node.children
def collect_codepoint_to_name(node, current)
  # Only nodes that terminate a name AND carry a codepoint are recorded.
  terminal_with_codepoint = node.is_end && node.codepoint
  @codepoint_to_name[node.codepoint] = current.upcase if terminal_with_codepoint

  node.children.each do |letter, subtree|
    collect_codepoint_to_name(subtree, current + letter)
  end
end

.collect_names_recursive(node, current, names) ⇒ Object

Recursively traverse the Trie to collect all complete Unicode names



79
80
81
82
83
84
# File 'lib/unicode_namecode/data_loader.rb', line 79

# Depth-first walk of the Trie that appends every complete name to +names+.
#
# @param node [Object] Trie node responding to is_end and children
# @param current [String] the characters accumulated on the path to +node+
# @param names [Array<String>] accumulator, mutated in place
# @return [Object] the result of iterating node.children
def collect_names_recursive(node, current, names)
  # A node flagged is_end marks the end of a stored name.
  names.push(current) if node.is_end

  node.children.each do |letter, subtree|
    extended_prefix = current + letter
    collect_names_recursive(subtree, extended_prefix, names)
  end
end

.load_data ⇒ Object

Main data loading method - handles cache loading and fresh parsing



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/unicode_namecode/data_loader.rb', line 22

# Main data loading method: populates @trie, @codepoint_to_name, @all_names
# and @fuzzy.
#
# Fast path: when CACHE_PATH exists, the Trie is unmarshalled from it and the
# derived structures are rebuilt by walking the Trie.
#
# Slow path (first run): DATA_PATH is read, its lines are parsed in worker
# threads, the Trie is built from the parsed records and then marshalled to
# CACHE_PATH for later runs.
#
# Lines without a name field (e.g. blank lines), with an empty name, or whose
# name contains an angle-bracketed placeholder such as "<control>" are skipped.
#
# @return [nil, Object] nil on the cache path, the FuzzyMatch instance otherwise
def load_data
  if File.exist?(CACHE_PATH)
    # NOTE(review): Marshal.load can instantiate arbitrary objects, so
    # CACHE_PATH must only ever point at a file written by this method.
    File.open(CACHE_PATH, 'rb') { |f| @trie = Marshal.load(f) }
    @all_names = collect_all_names
    @fuzzy = FuzzyMatch.new(@all_names)
    @codepoint_to_name = {}
    collect_codepoint_to_name(@trie.instance_variable_get(:@root), "")
    return
  end

  # First run: parse UnicodeData.txt and build everything from scratch
  @trie = Trie.new
  @codepoint_to_name = {}

  lines = File.readlines(DATA_PATH)
  n_threads = [Etc.nprocessors, 2].max
  # each_slice(0) raises ArgumentError, so keep the slice size >= 1 even
  # when the data file is empty.
  chunk_size = [(lines.size.to_f / n_threads).ceil, 1].max

  # Each worker parses its own chunk into a thread-local array; no shared
  # state is touched inside the threads.
  workers = lines.each_slice(chunk_size).map do |chunk|
    Thread.new do
      chunk.each_with_object([]) do |line, parsed|
        codepoint, name = line.chomp.split(';')
        next if name.nil? || name.empty? || name.match?(/<.*>/)

        parsed << [name.upcase, codepoint.to_i(16)]
      end
    end
  end

  # Thread#value joins the worker and re-raises anything it raised. Results
  # are merged on the calling thread, in chunk order, so both structures are
  # filled deterministically.
  workers.flat_map(&:value).each do |name, codepoint|
    @codepoint_to_name[codepoint] = name
    @trie.insert(name, codepoint)
  end

  # Cache the built Trie for future fast loads
  File.open(CACHE_PATH, 'wb') { |f| Marshal.dump(@trie, f) }

  # Build additional data structures
  @all_names = collect_all_names
  @fuzzy = FuzzyMatch.new(@all_names)
end