Class: DiscourseAi::Tokenizer::OpenAiTokenizer

Inherits:
BasicTokenizer
Defined in:
lib/discourse_ai/tokenizer/open_ai_tokenizer.rb

Overview

Wrapper around OpenAI's Tiktoken tokenizer library, providing compatibility with the Discourse AI tokenizer API.
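
A minimal usage sketch (the strings are illustrative; each class method is documented below):

tokenizer = DiscourseAi::Tokenizer::OpenAiTokenizer

ids = tokenizer.encode("Hello, world!") # => array of integer token ids
tokenizer.decode(ids)                   # => "Hello, world!"
tokenizer.below_limit?("Hello", 10)     # => true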

Direct Known Subclasses

OpenAiCl100kTokenizer

Class Method Summary

Methods inherited from BasicTokenizer

available_llm_tokenizers, size

Class Method Details

.below_limit?(text, limit, strict: false) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 70

def below_limit?(text, limit, strict: false)
  # fast track common case, /2 to handle unicode chars
  # that can take more than 1 token per char
  return true if !strict && text.size < limit / 2

  tokenizer.encode(text).length < limit
end
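
A usage sketch (strings and limits are illustrative). Without strict:, short inputs can pass on the character-count heuristic alone; with strict: true the text is always encoded and its token count compared to the limit:

tok = DiscourseAi::Tokenizer::OpenAiTokenizer

tok.below_limit?("short text", 100)               # => true (fast path: 10 chars < 100 / 2)
tok.below_limit?("short text", 100, strict: true) # => true (encodes and counts tokens)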

.decode(token_ids) ⇒ Object



# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 20

def decode(token_ids)
  tokenizer.decode(token_ids)
rescue Tiktoken::UnicodeError
  token_ids = token_ids.dup

  # the easy case: we started with a valid sequence but truncated it on an invalid boundary
  # work backwards removing tokens until we can decode again
  tries = 4
  while tries > 0
    begin
      token_ids.pop
      return tokenizer.decode(token_ids)
    rescue Tiktoken::UnicodeError
      tries -= 1
    end
  end

  # at this point we may have a corrupted sequence so just decode what we can
  token_ids
    .map do |id|
      begin
        tokenizer.decode([id])
      rescue Tiktoken::UnicodeError
        ""
      end
    end
    .join
end
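
A sketch of the recovery path (the emoji and slice point are illustrative). Cutting a token sequence on a multi-byte character boundary would normally raise Tiktoken::UnicodeError; decode instead pops trailing ids until the remainder decodes cleanly:

tok = DiscourseAi::Tokenizer::OpenAiTokenizer
ids = tok.encode("hello 👨‍👩‍👧‍👦")

# dropping the last id can split the emoji's byte sequence mid-character
tok.decode(ids[0...-1]) # => "hello " plus whatever still decodes cleanly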

.encode(text) ⇒ Object



# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 16

def encode(text)
  tokenizer.encode(text)
end
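
For example (token ids depend on the encoding, so none are shown literally):

ids = DiscourseAi::Tokenizer::OpenAiTokenizer.encode("hello world")
ids.all? { |id| id.is_a?(Integer) }                 # => true
DiscourseAi::Tokenizer::OpenAiTokenizer.decode(ids) # => "hello world"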

.tokenize(text) ⇒ Object



# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 12

def tokenize(text)
  tokenizer.encode(text)
end
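
Note that in this class tokenize delegates to the same encoder as encode, so both return integer token ids rather than token strings:

tok = DiscourseAi::Tokenizer::OpenAiTokenizer
tok.tokenize("hello") == tok.encode("hello") # => true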

.tokenizer ⇒ Object



# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 8

def tokenizer
  @tokenizer ||= Tiktoken.get_encoding("o200k_base")
end
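
The encoding is memoized with ||=, so every call reuses a single Tiktoken instance:

a = DiscourseAi::Tokenizer::OpenAiTokenizer.tokenizer
b = DiscourseAi::Tokenizer::OpenAiTokenizer.tokenizer
a.equal?(b) # => true, same o200k_base encoding object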

.truncate(text, max_length, strict: false) ⇒ Object



# File 'lib/discourse_ai/tokenizer/open_ai_tokenizer.rb', line 49

def truncate(text, max_length, strict: false)
  return "" if max_length <= 0

  # fast track common case, /2 to handle unicode chars
  # that can take more than 1 token per char
  return text if !strict && text.size < max_length / 2

  # Take tokens up to max_length, decode, then ensure we don't exceed limit
  truncated_tokens = tokenize(text).take(max_length)
  truncated_text = decode(truncated_tokens)

  # If re-encoding exceeds the limit, we need to further truncate
  while tokenize(truncated_text).length > max_length
    truncated_tokens = truncated_tokens[0...-1]
    truncated_text = decode(truncated_tokens)
    break if truncated_tokens.empty?
  end

  truncated_text
end
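
A usage sketch (the inputs are illustrative). Short text is returned unchanged on the fast path; longer text is cut to a token prefix, and the re-encode loop guarantees the result never exceeds max_length tokens:

tok = DiscourseAi::Tokenizer::OpenAiTokenizer

tok.truncate("hi", 100)                       # => "hi" (fast path, nothing encoded)
out = tok.truncate("some long text " * 50, 10, strict: true)
tok.encode(out).length <= 10                  # => true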