15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
# File 'lib/riktoken/encodings/cl100k_base.rb', line 15
# Builds the encoding identified by ENCODING_NAME.
#
# The rank table is read with TiktokenFile from the path that
# find_tiktoken_file resolves for ENCODING_NAME under +tiktoken_base_dir+;
# the special tokens and the split pattern are defined inline below.
#
# @param tiktoken_base_dir base directory handed to find_tiktoken_file
#   (presumably a filesystem path -- confirm against find_tiktoken_file)
# @return [Riktoken::Encoding] encoding built from the loaded ranks, the
#   special-token ids and the split pattern
def self.load_encoding(tiktoken_base_dir:)
  ranks = TiktokenFile.new.load(find_tiktoken_file(name: ENCODING_NAME, base_dir: tiktoken_base_dir))
  special_tokens = {
    "<|endoftext|>" => 100257,
    "<|fim_prefix|>" => 100258,
    "<|fim_middle|>" => 100259,
    "<|fim_suffix|>" => 100260,
    "<|endofprompt|>" => 100276
  }
  # Digit runs must be split into chunks of at most three digits. Ruby's
  # regex engine does not treat `{1,3}+` as a possessive quantifier: the
  # trailing `+` repeats the whole `{1,3}` group, which would let a digit run
  # of any length match as one piece. A plain greedy `{1,3}` gives the
  # intended result because nothing follows it inside its alternative.
  pattern = /'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s/
  Riktoken::Encoding.new(
    name: ENCODING_NAME,
    ranks: ranks,
    special_tokens: special_tokens,
    pattern: pattern
  )
end
|