Module: Riktoken::Encodings::P50kEdit

Includes:
Riktoken::Encodings
Defined in:
lib/riktoken/encodings/p50k_edit.rb

Constant Summary collapse

TIKTOKEN_SIGNATURE_NAME =
"p50k_base"

Class Method Summary collapse

Methods included from Riktoken::Encodings

included

Class Method Details

.load_encoding(tiktoken_base_dir:) ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/riktoken/encodings/p50k_edit.rb', line 16

# Builds the encoding object for this module.
#
# The rank table is read by TiktokenFile from the file that
# +find_tiktoken_file+ locates for TIKTOKEN_SIGNATURE_NAME under
# +tiktoken_base_dir+. It is combined with the "<|endoftext|>" and
# "<|fim_*|>" special tokens and the text-splitting pattern below.
#
# NOTE(review): ENCODING_NAME and find_tiktoken_file are not defined in this
# method; presumably they are provided through Riktoken::Encodings — verify.
#
# @param tiktoken_base_dir passed through unchanged as +base_dir:+ to
#   find_tiktoken_file
# @return [Riktoken::Encoding] the encoding built from the loaded ranks
def self.load_encoding(tiktoken_base_dir:)
  ranks = TiktokenFile.new.load(find_tiktoken_file(name: TIKTOKEN_SIGNATURE_NAME, base_dir: tiktoken_base_dir))
  special_tokens = {
    "<|endoftext|>" => 50256,
    "<|fim_prefix|>" => 50281,
    "<|fim_middle|>" => 50282,
    "<|fim_suffix|>" => 50283
  }
  # Ruby's \s and \S only consider ASCII whitespace, which would make Unicode
  # whitespace (e.g. U+3000, U+00A0) fall into the "other symbols" branch.
  # \p{Space} / \P{Space} are Unicode-aware, so such characters are split as
  # whitespace. ASCII-only input is split exactly as before.
  # The `$` follows a possessive whitespace run, so the next character can
  # never be a newline: it effectively matches only at the end of the string.
  pattern = /'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\p{Space}\p{L}\p{N}]++|\p{Space}++$|\p{Space}+(?!\P{Space})|\p{Space}/

  Riktoken::Encoding.new(
    name: ENCODING_NAME,
    ranks: ranks,
    special_tokens: special_tokens,
    pattern: pattern
  )
end