Module: Riktoken

Defined in:
lib/riktoken.rb,
lib/riktoken/bpe.rb,
lib/riktoken/version.rb,
lib/riktoken/encoding.rb,
lib/riktoken/encodings.rb,
lib/riktoken/tiktoken_file.rb,
lib/riktoken/encodings/p50k_base.rb,
lib/riktoken/encodings/p50k_edit.rb,
lib/riktoken/encodings/r50k_base.rb,
lib/riktoken/encodings/o200k_base.rb,
lib/riktoken/encodings/cl100k_base.rb

Defined Under Namespace

Modules: Encodings

Classes: BPE, Encoding, TiktokenFile, UnknownEncodingError, UnknownModelError

Constant Summary collapse

VERSION =
"0.0.1"

Class Method Summary collapse

Class Method Details

.default_tiktoken_base_dir ⇒ Object



151
152
153
# File 'lib/riktoken.rb', line 151

# Resolves the base directory used when a caller does not pass
# +tiktoken_base_dir+ explicitly.
#
# The environment variable named by TIKTOKEN_BASE_DIR_ENV_KEY wins when it is
# set (even to an empty string); DEFAULT_TIKTOKEN_BASE_DIR is consulted only
# when the variable is absent.
#
# @return [Object] the environment value (a String) or DEFAULT_TIKTOKEN_BASE_DIR
def default_tiktoken_base_dir
  # Block form keeps the fallback lazy, like the right-hand side of `||`.
  ENV.fetch(TIKTOKEN_BASE_DIR_ENV_KEY) { DEFAULT_TIKTOKEN_BASE_DIR }
end

.encoding_for_model(model_name, tiktoken_base_dir: default_tiktoken_base_dir) ⇒ Object

Raises:

(UnknownModelError) — if model_name has no entry in MODEL_TO_ENCODING



98
99
100
101
102
103
# File 'lib/riktoken.rb', line 98

# Looks up the encoding registered for a model name and loads it.
#
# @param model_name [Object] key looked up in MODEL_TO_ENCODING
# @param tiktoken_base_dir [Object] forwarded unchanged to get_encoding
# @return [Object] whatever get_encoding returns for the mapped encoding name
# @raise [UnknownModelError] when MODEL_TO_ENCODING has no truthy entry for
#   +model_name+
def encoding_for_model(model_name, tiktoken_base_dir: default_tiktoken_base_dir)
  mapped_name = MODEL_TO_ENCODING[model_name]
  return get_encoding(mapped_name, tiktoken_base_dir: tiktoken_base_dir) if mapped_name

  raise UnknownModelError, "Unknown model: #{model_name}"
end

.encoding_from_file(path:, name:, pattern:, special_tokens: {}) ⇒ Object



124
125
126
127
128
129
130
131
132
133
134
# File 'lib/riktoken.rb', line 124

# Builds an Encoding whose ranks are loaded from a file.
#
# @param path [Object] handed to TiktokenFile#load
# @param name [Object] passed through to Encoding.new
# @param pattern [Object] passed through to Encoding.new
# @param special_tokens [Hash] passed through to Encoding.new (default: {})
# @return [Encoding] the newly constructed encoding
def encoding_from_file(path:, name:, pattern:, special_tokens: {})
  # The ranks come from a fresh TiktokenFile parser created for this call.
  loaded_ranks = TiktokenFile.new.load(path)

  Encoding.new(name: name, ranks: loaded_ranks, special_tokens: special_tokens, pattern: pattern)
end

.get_encoding(encoding_name, tiktoken_base_dir: default_tiktoken_base_dir) ⇒ Object

Get the encoding by name (like “cl100k_base”).



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/riktoken.rb', line 76

# Get the encoding by name (like "cl100k_base").
#
# @param encoding_name [String] one of "cl100k_base", "p50k_base",
#   "p50k_edit", "r50k_base" or "o200k_base"
# @param tiktoken_base_dir [Object] forwarded to the encoding class's
#   load_encoding
# @return [Object] the result of calling load_encoding on the matching class
# @raise [UnknownEncodingError] when +encoding_name+ is none of the names above
def get_encoding(encoding_name, tiktoken_base_dir: default_tiktoken_base_dir)
  # Only the matching branch is evaluated, so just one Encodings constant is
  # resolved per call.
  loader =
    case encoding_name
    when "cl100k_base" then Encodings::Cl100kBase
    when "p50k_base"   then Encodings::P50kBase
    when "p50k_edit"   then Encodings::P50kEdit
    when "r50k_base"   then Encodings::R50kBase
    when "o200k_base"  then Encodings::O200kBase
    else raise UnknownEncodingError, "Unknown encoding: #{encoding_name}"
    end

  loader.load_encoding(tiktoken_base_dir: tiktoken_base_dir)
end

.list_encoding_names ⇒ Object



137
138
139
# File 'lib/riktoken.rb', line 137

# Lists the encoding names as a fixed set of five strings.
#
# @return [Array<String>] a newly built array on every call
def list_encoding_names
  %w[
    cl100k_base
    p50k_base
    p50k_edit
    r50k_base
    o200k_base
  ]
end

.list_model_names ⇒ Object



142
143
144
# File 'lib/riktoken.rb', line 142

# Lists every model name that has an entry in MODEL_TO_ENCODING.
#
# @return [Array] the keys of MODEL_TO_ENCODING
def list_model_names = MODEL_TO_ENCODING.keys

.make_encoding(name:, ranks:, pattern:, special_tokens: {}) ⇒ Object



110
111
112
113
114
115
116
117
# File 'lib/riktoken.rb', line 110

# Builds an Encoding directly from already-loaded ranks.
#
# @param name [Object] passed through to Encoding.new
# @param ranks [Object] passed through to Encoding.new
# @param pattern [Object] passed through to Encoding.new
# @param special_tokens [Hash] passed through to Encoding.new (default: {})
# @return [Encoding] the newly constructed encoding
def make_encoding(name:, ranks:, pattern:, special_tokens: {})
  Encoding.new(name: name, ranks: ranks, special_tokens: special_tokens, pattern: pattern)
end