Class: DiscourseAi::Tokenizer::OpenAiTokenizer
Overview
Wrapper around the OpenAI tokenizer library (Tiktoken) that makes it compatible with the Discourse AI tokenizer API.
Class Method Summary
collapse
available_llm_tokenizers, size
Class Method Details
.below_limit?(text, limit, strict: false) ⇒ Boolean
70
71
72
73
74
75
76
|
# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 70
# Reports whether +text+ encodes to fewer than +limit+ tokens.
#
# @param text [String] the text to measure
# @param limit [Integer] exclusive upper bound on the token count
# @param strict [Boolean] when true, always tokenize instead of using the
#   character-count shortcut
# @return [Boolean] true when the text is considered below the limit
def below_limit?(text, limit, strict: false)
  unless strict
    # Shortcut: skip tokenizing when the character count is under half the
    # limit (presumably a safety margin for characters that take more than
    # one token; verify if exactness matters, or pass strict: true).
    return true if text.size < limit / 2
  end

  token_count = tokenizer.encode(text).length
  token_count < limit
end
|
.decode(token_ids) ⇒ Object
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 20
# Converts token ids back into text, tolerating sequences that cannot be
# decoded as a whole because of a Tiktoken::UnicodeError.
#
# @param token_ids [Array] token ids as produced by the tokenizer; the
#   caller's array is never mutated
# @return [String] the decoded text; tokens that cannot be decoded are
#   omitted
def decode(token_ids)
  tokenizer.decode(token_ids)
rescue Tiktoken::UnicodeError
  # Easy case: a valid sequence was cut on an invalid boundary. Retry with
  # one to four trailing tokens removed and return the first prefix that
  # decodes cleanly.
  1.upto(4) do |dropped|
    prefix = token_ids.take([token_ids.length - dropped, 0].max)
    begin
      return tokenizer.decode(prefix)
    rescue Tiktoken::UnicodeError
      # This prefix is still undecodable; try a shorter one.
    end
  end

  # Otherwise the sequence may be corrupted somewhere other than its tail.
  # Decode every ORIGINAL token on its own and skip the ones that fail.
  # Working from the original ids (not the shortened prefix) keeps trailing
  # tokens that are perfectly decodable by themselves.
  token_ids
    .map do |id|
      begin
        tokenizer.decode([id])
      rescue Tiktoken::UnicodeError
        ""
      end
    end
    .join
end
|
.encode(text) ⇒ Object
16
17
18
|
# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 16
# Encodes +text+ with the underlying tokenizer.
#
# @param text [String] the text to encode
# @return [Object] whatever the tokenizer's +encode+ returns for +text+
def encode(text)
  encoded = tokenizer.encode(text)
  encoded
end
|
.tokenize(text) ⇒ Object
12
13
14
|
# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 12
# Tokenizes +text+ with the underlying tokenizer. In this class the result
# is exactly what the tokenizer's +encode+ returns.
#
# @param text [String] the text to tokenize
# @return [Object] the tokenizer's +encode+ result for +text+
def tokenize(text)
  tokens = tokenizer.encode(text)
  tokens
end
|
.tokenizer ⇒ Object
8
9
10
|
# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 8
# Returns the Tiktoken encoding used by this class, looking up the
# "o200k_base" encoding on first use and caching it in @tokenizer.
#
# @return [Object] the cached result of Tiktoken.get_encoding("o200k_base")
def tokenizer
  return @tokenizer if @tokenizer

  @tokenizer = Tiktoken.get_encoding("o200k_base")
end
|
.truncate(text, max_length, strict: false) ⇒ Object
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 49
# Shortens +text+ so that it tokenizes to at most +max_length+ tokens.
#
# @param text [String] the text to shorten
# @param max_length [Integer] maximum number of tokens to keep
# @param strict [Boolean] when true, always tokenize instead of using the
#   character-count shortcut
# @return [String] "" when max_length is not positive, the original text when
#   the shortcut applies, otherwise the decoded truncated tokens
def truncate(text, max_length, strict: false)
  return "" if max_length <= 0

  unless strict
    # Shortcut: text whose character count is under half the token budget is
    # returned untouched without tokenizing.
    return text if text.size < max_length / 2
  end

  kept_ids = tokenize(text).take(max_length)
  result = decode(kept_ids)

  # Decoding and re-tokenizing may not round-trip to the same token count, so
  # keep dropping the last token until the result fits (or nothing is left).
  until tokenize(result).length <= max_length
    kept_ids = kept_ids[0..-2]
    result = decode(kept_ids)
    break if kept_ids.empty?
  end

  result
end
|