Class: NanoGPT::CharTokenizer

Inherits:
Tokenizer show all
Defined in:
lib/nano_gpt/tokenizer.rb

Overview

Character-level tokenizer

Instance Attribute Summary collapse

Attributes inherited from Tokenizer

#vocab_size

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Tokenizer

for_dataset

Constructor Details

#initialize(stoi: nil, itos: nil) ⇒ CharTokenizer

Returns a new instance of CharTokenizer.



34
35
36
37
38
39
# File 'lib/nano_gpt/tokenizer.rb', line 34

# Build a tokenizer from optional lookup tables.
#
# @param stoi [Hash, nil] character => id table; an empty Hash is used when omitted
# @param itos [Hash, nil] id => character table; an empty Hash is used when omitted
def initialize(stoi: nil, itos: nil)
  super()
  # Substitute fresh empty tables for any that were not supplied.
  @stoi, @itos = (stoi || {}), (itos || {})
  # The vocabulary size is taken from the character => id table only.
  @vocab_size = @stoi.size
end

Instance Attribute Details

#itos ⇒ Object (readonly)

Returns the value of attribute itos.



32
33
34
# File 'lib/nano_gpt/tokenizer.rb', line 32

# Read-only accessor for the @itos table.
# #decode looks ids up in this table, and .from_text builds it as
# index => character.
def itos
  @itos
end

#stoi ⇒ Object (readonly)

Returns the value of attribute stoi.



32
33
34
# File 'lib/nano_gpt/tokenizer.rb', line 32

# Read-only accessor for the @stoi table.
# #encode looks characters up in this table, and .from_text builds it as
# character => index.
def stoi
  @stoi
end

Class Method Details

.from_file(path) ⇒ Object

Load from meta.json file



50
51
52
53
54
55
# File 'lib/nano_gpt/tokenizer.rb', line 50

# Load a tokenizer from a meta.json file.
#
# The file is expected to hold a JSON object with "stoi" and "itos" entries.
#
# @param path [String] path to the JSON metadata file
# @return [Object] a new instance built from the file's "stoi" and "itos" tables
def self.from_file(path)
  # JSON text is UTF-8 by specification. Read it as UTF-8 explicitly instead
  # of relying on Encoding.default_external: under a non-UTF-8 locale
  # (e.g. LANG=C) a vocabulary containing non-ASCII characters would
  # otherwise be mis-tagged and could fail to parse.
  meta = JSON.parse(File.read(path, encoding: Encoding::UTF_8))
  # JSON object keys are always strings; convert them back to integer ids.
  itos = meta["itos"].transform_keys(&:to_i)
  new(stoi: meta["stoi"], itos: itos)
end

.from_text(text) ⇒ Object

Build vocabulary from text



42
43
44
45
46
47
# File 'lib/nano_gpt/tokenizer.rb', line 42

# Build a tokenizer whose vocabulary is the set of distinct characters
# occurring in the given text.
#
# @param text [String] text to derive the vocabulary from
# @return [Object] a new instance with matching stoi / itos tables
def self.from_text(text)
  # Distinct characters in sorted order; a character's position is its id.
  alphabet = text.each_char.uniq.sort
  char_to_id = alphabet.each_with_index.to_h
  # Characters are unique, so inverting yields the exact reverse mapping.
  id_to_char = char_to_id.invert
  new(stoi: char_to_id, itos: id_to_char)
end

Instance Method Details

#decode(ids) ⇒ Object

Decode list of integers to string



63
64
65
# File 'lib/nano_gpt/tokenizer.rb', line 63

# Turn a list of token ids back into a string.
#
# @param ids [Array] ids to look up in the itos table
# @return [String] the looked-up entries concatenated in order
def decode(ids)
  pieces = ids.map { |token_id| @itos[token_id] }
  # Ids missing from the table look up as nil, which join renders as "".
  pieces.join
end

#encode(text) ⇒ Object

Encode string to list of integers



58
59
60
# File 'lib/nano_gpt/tokenizer.rb', line 58

# Encode a string as a list of token ids, one per character.
#
# @param text [String] text to encode
# @return [Array] the stoi entry for each character of text, in order
# @raise [KeyError] if text contains a character that is not in the vocabulary
def encode(text)
  text.each_char.map do |char|
    # A plain @stoi[char] lookup would put nil into the result for an unknown
    # character. A nil id is never a valid token, so fail here with a message
    # that names the offending character.
    @stoi.fetch(char) do
      raise KeyError, "character #{char.inspect} is not in the tokenizer vocabulary"
    end
  end
end

#save(path) ⇒ Object

Save to meta.json file



68
69
70
71
72
73
74
75
# File 'lib/nano_gpt/tokenizer.rb', line 68

# Write the tokenizer's tables to a JSON file.
#
# The file holds a JSON object with "vocab_size", "stoi" and "itos" entries.
#
# @param path [String] destination file path
# @return [Integer] the value returned by File.write
def save(path)
  # Stringify the itos keys explicitly before serializing.
  string_keyed_itos = @itos.transform_keys { |token_id| token_id.to_s }
  payload = {
    "vocab_size" => @vocab_size,
    "stoi" => @stoi,
    "itos" => string_keyed_itos
  }
  serialized = JSON.pretty_generate(payload)
  File.write(path, serialized)
end