15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
# File 'lib/riktoken/encodings/o200k_base.rb', line 15
# Builds the encoding object for ENCODING_NAME.
#
# The rank table is read with TiktokenFile#load from the path that
# find_tiktoken_file returns for ENCODING_NAME under +tiktoken_base_dir+.
#
# @param tiktoken_base_dir forwarded unchanged to find_tiktoken_file as +base_dir+
# @return [Riktoken::Encoding] encoding built from the loaded ranks, the two
#   special tokens below, and the pre-tokenization split pattern
def self.load_encoding(tiktoken_base_dir:)
  ranks_path = find_tiktoken_file(name: ENCODING_NAME, base_dir: tiktoken_base_dir)
  ranks = TiktokenFile.new.load(ranks_path)
  special_tokens = {
    "<|endoftext|>" => 199999,
    "<|endofprompt|>" => 200018
  }
  # Alternatives are tried left to right at each position.
  #
  # Whitespace is written as [[:space:]] rather than \s: Ruby's \s and \S only
  # know ASCII whitespace, so characters such as U+00A0 or U+3000 would be
  # classified as punctuation by the fourth alternative instead of whitespace.
  # POSIX bracket classes are Unicode-aware, which keeps splitting consistent
  # with regex engines whose \s covers all Unicode whitespace.
  pattern = Regexp.union([
    # Optional leading non-letter, then a word ending in lowercase letters, plus optional contraction.
    /[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?/,
    # Same, but for words that start with uppercase/titlecase letters.
    /[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?/,
    # Digits in groups of at most three.
    /\p{N}{1,3}/,
    # Optional space, a run of non-space/non-letter/non-digit characters, then trailing newlines or slashes.
    / ?[^[:space:]\p{L}\p{N}]+[\r\n\/]*/,
    # Whitespace ending in one or more line breaks.
    /[[:space:]]*[\r\n]+/,
    # Whitespace not followed by a non-whitespace character.
    /[[:space:]]+(?![^[:space:]])/,
    # Any remaining whitespace.
    /[[:space:]]+/
  ])
  Riktoken::Encoding.new(
    name: ENCODING_NAME,
    ranks: ranks,
    special_tokens: special_tokens,
    pattern: pattern
  )
end
|