Module: Riktoken::Encodings::P50kBase

Includes:
Riktoken::Encodings
Defined in:
lib/riktoken/encodings/p50k_base.rb

Class Method Summary collapse

Methods included from Riktoken::Encodings

included

Class Method Details

.load_encoding(tiktoken_base_dir:) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/riktoken/encodings/p50k_base.rb', line 15

# Builds the encoding named by ENCODING_NAME from its tiktoken rank file.
#
# The rank file is located via +find_tiktoken_file+ (given ENCODING_NAME and
# the supplied base directory) and read with +TiktokenFile#load+.
#
# @param tiktoken_base_dir forwarded as +base_dir:+ to +find_tiktoken_file+
# @return [Riktoken::Encoding] encoding built from the loaded ranks, the single
#   "<|endoftext|>" special token (id 50256) and the splitting regex below
def self.load_encoding(tiktoken_base_dir:)
  loader = TiktokenFile.new
  rank_file = find_tiktoken_file(name: ENCODING_NAME, base_dir: tiktoken_base_dir)
  mergeable_ranks = loader.load(rank_file)

  # Splitting regex, alternatives tried in order:
  #   - English contraction suffixes ('s 'd 'm 't 'll 've 're)
  #   - optional leading space + run of letters
  #   - optional leading space + run of numbers
  #   - optional leading space + run of anything that is not space/letter/number
  #   - whitespace runs (trailing, or not followed by a non-space), then a
  #     single whitespace character as the last resort
  # NOTE(review): presumably mirrors the upstream tiktoken pattern — confirm.
  split_regex = /'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s/

  Riktoken::Encoding.new(
    name: ENCODING_NAME,
    ranks: mergeable_ranks,
    special_tokens: { "<|endoftext|>" => 50256 },
    pattern: split_regex
  )
end