Module: Riktoken::Encodings::O200kBase

Includes:
Riktoken::Encodings
Defined in:
lib/riktoken/encodings/o200k_base.rb

Class Method Summary collapse

Methods included from Riktoken::Encodings

included

Class Method Details

.load_encoding(tiktoken_base_dir:) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/riktoken/encodings/o200k_base.rb', line 15

# Builds the Riktoken::Encoding registered under ENCODING_NAME.
#
# Ranks are read with TiktokenFile#load from the file that find_tiktoken_file
# locates for ENCODING_NAME under +tiktoken_base_dir+.
#
# @param tiktoken_base_dir [Object] forwarded unchanged as +base_dir:+ to
#   find_tiktoken_file (presumably a directory path; confirm against that helper)
# @return [Riktoken::Encoding] encoding built from the loaded ranks, the two
#   special tokens below and the pre-tokenization split pattern
def self.load_encoding(tiktoken_base_dir:)
  ranks = TiktokenFile.new.load(find_tiktoken_file(name: ENCODING_NAME, base_dir: tiktoken_base_dir))
  special_tokens = {
    "<|endoftext|>" => 199999,
    "<|endofprompt|>" => 200018
  }

  # Whitespace is written as [[:space:]] rather than \s on purpose: Ruby's \s
  # and \S only know ASCII whitespace, whereas the reference pattern is defined
  # for an engine whose \s covers every Unicode whitespace character (NBSP,
  # U+3000, U+2028, ...). With \s, text containing such characters is split
  # into different pieces than the reference tokenizer produces.
  pattern = Regexp.union([
    # Optional leading non-letter, then a word ending in lower-case letters,
    # with an optional English contraction suffix.
    /[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?/,
    # Same, for a word starting with upper-case/title-case letters.
    /[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?/,
    # Runs of at most three numeric characters.
    /\p{N}{1,3}/,
    # Punctuation/symbol run, optionally preceded by one space and followed
    # by line breaks or slashes.
    %r{ ?[^[:space:]\p{L}\p{N}]+[\r\n/]*},
    # Whitespace ending in one or more line breaks.
    /[[:space:]]*[\r\n]+/,
    # Whitespace not directly followed by a non-whitespace character.
    /[[:space:]]+(?![^[:space:]])/,
    # Any remaining whitespace.
    /[[:space:]]+/
  ])

  Riktoken::Encoding.new(
    name: ENCODING_NAME,
    ranks: ranks,
    special_tokens: special_tokens,
    pattern: pattern
  )
end