Module: Riktoken::Encodings::Cl100kBase

Includes:
Riktoken::Encodings
Defined in:
lib/riktoken/encodings/cl100k_base.rb

Class Method Summary

Methods included from Riktoken::Encodings

included

Class Method Details

.load_encoding(tiktoken_base_dir:) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/riktoken/encodings/cl100k_base.rb', line 15

def self.load_encoding(tiktoken_base_dir:)
  ranks = TiktokenFile.new.load(find_tiktoken_file(name: ENCODING_NAME, base_dir: tiktoken_base_dir))
  special_tokens = {
    "<|endoftext|>" => 100257,
    "<|fim_prefix|>" => 100258,
    "<|fim_middle|>" => 100259,
    "<|fim_suffix|>" => 100260,
    "<|endofprompt|>" => 100276
  }
  pattern = /'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s/

  Riktoken::Encoding.new(
    name: ENCODING_NAME,
    ranks: ranks,
    special_tokens: special_tokens,
    pattern: pattern
  )
end