16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
# File 'lib/riktoken/encodings/p50k_edit.rb', line 16
# Builds the Riktoken::Encoding for this module's ENCODING_NAME.
#
# The merge ranks are read from the tiktoken file that find_tiktoken_file
# resolves for TIKTOKEN_SIGNATURE_NAME; they are combined with a fixed set of
# special-token ids and the pre-tokenization split pattern.
#
# @param tiktoken_base_dir [String] passed to find_tiktoken_file as +base_dir+
#   (presumably the directory holding the .tiktoken files -- confirm against
#   find_tiktoken_file)
# @return [Riktoken::Encoding] the encoding built from ranks, special tokens
#   and split pattern
def self.load_encoding(tiktoken_base_dir:)
  tiktoken_path = find_tiktoken_file(name: TIKTOKEN_SIGNATURE_NAME, base_dir: tiktoken_base_dir)
  ranks = TiktokenFile.new.load(tiktoken_path)

  special_tokens = {
    "<|endoftext|>" => 50256,
    "<|fim_prefix|>" => 50281,
    "<|fim_middle|>" => 50282,
    "<|fim_suffix|>" => 50283
  }

  # Ruby's \s / \S only recognise ASCII whitespace, while the regex engines
  # the upstream tiktoken pattern is written for treat \s as Unicode
  # whitespace. \p{Space} / \P{Space} keep code points such as U+00A0 and
  # U+3000 in the whitespace alternatives instead of letting them fall into
  # the "other symbols" run. ASCII-only input splits exactly as before.
  pattern = /'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\p{Space}\p{L}\p{N}]++|\p{Space}++$|\p{Space}+(?!\P{Space})|\p{Space}/

  Riktoken::Encoding.new(
    name: ENCODING_NAME,
    ranks: ranks,
    special_tokens: special_tokens,
    pattern: pattern
  )
end
|