Class: DiscourseAi::Tokenizer::BasicTokenizer

Inherits:
Object
Defined in:
lib/discourse_ai/tokenizer/basic_tokenizer.rb

Overview

Base class for tokenizers to inherit from. Subclasses implement .tokenizer to return the underlying tokenizer object, whose #encode result must respond to #ids and #tokens and which must itself respond to #decode.

Class Method Summary

  .available_llm_tokenizers ⇒ Object
  .below_limit?(text, limit, strict: false) ⇒ Boolean
  .decode(token_ids) ⇒ Object
  .encode(tokens) ⇒ Object
  .size(text) ⇒ Object
  .tokenize(text) ⇒ Object
  .tokenizer ⇒ Object
  .truncate(text, max_length, strict: false) ⇒ Object

Class Method Details

.available_llm_tokenizers ⇒ Object



# File 'lib/discourse_ai/tokenizer/basic_tokenizer.rb', line 8

def available_llm_tokenizers
  [
    DiscourseAi::Tokenizer::AnthropicTokenizer,
    DiscourseAi::Tokenizer::GeminiTokenizer,
    DiscourseAi::Tokenizer::Llama3Tokenizer,
    DiscourseAi::Tokenizer::MistralTokenizer,
    DiscourseAi::Tokenizer::OpenAiTokenizer,
    DiscourseAi::Tokenizer::QwenTokenizer
  ]
end
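
A quick usage sketch; the map to .name is illustrative, not part of the API:

DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
# => names of the six tokenizer classes listed above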

.below_limit?(text, limit, strict: false) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/discourse_ai/tokenizer/basic_tokenizer.rb', line 60

def below_limit?(text, limit, strict: false)
  # fast track common case, /2 to handle unicode chars
  # that can take more than 1 token per char
  return true if !strict && text.size < limit / 2

  tokenizer.encode(text).ids.length < limit
end
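
A usage sketch with OpenAiTokenizer, one of the concrete subclasses listed above; exact token counts depend on the model's vocabulary:

DiscourseAi::Tokenizer::OpenAiTokenizer.below_limit?("hello world", 100)
# => true ("hello world".size is 11, under 100 / 2, so the fast path
# answers without encoding)

DiscourseAi::Tokenizer::OpenAiTokenizer.below_limit?("hello world", 100, strict: true)
# strict: true skips the character-count heuristic and always encodes;
# still true unless the text encodes to 100 or more tokens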

.decode(token_ids) ⇒ Object



# File 'lib/discourse_ai/tokenizer/basic_tokenizer.rb', line 31

def decode(token_ids)
  tokenizer.decode(token_ids)
end
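
A round-trip sketch (OpenAiTokenizer is illustrative; any concrete subclass works):

ids = DiscourseAi::Tokenizer::OpenAiTokenizer.encode("hello world")
DiscourseAi::Tokenizer::OpenAiTokenizer.decode(ids)
# => "hello world" (round-trips for typical text; the ids themselves
# depend on the model's vocabulary)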

.encode(tokens) ⇒ Object



# File 'lib/discourse_ai/tokenizer/basic_tokenizer.rb', line 35

def encode(tokens)
  tokenizer.encode(tokens).ids
end
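
For example, with a concrete subclass (the id values depend on its vocabulary):

DiscourseAi::Tokenizer::OpenAiTokenizer.encode("hello world")
# => an array of integer token ids; below_limit? compares the length
# of this array against the limit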

.size(text) ⇒ Object



# File 'lib/discourse_ai/tokenizer/basic_tokenizer.rb', line 27

def size(text)
  tokenize(text).size
end
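
For example (the exact count is model-dependent):

DiscourseAi::Tokenizer::OpenAiTokenizer.size("hello world")
# => the number of tokens the text encodes to, typically a small
# integer for a short ASCII phrase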

.tokenize(text) ⇒ Object



# File 'lib/discourse_ai/tokenizer/basic_tokenizer.rb', line 23

def tokenize(text)
  tokenizer.encode(text).tokens
end
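
For example (the exact pieces depend on the model's vocabulary):

DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize("hello world")
# => an array of token strings, something like ["hello", " world"]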

.tokenizer ⇒ Object

Raises:

  • (NotImplementedError)


# File 'lib/discourse_ai/tokenizer/basic_tokenizer.rb', line 19

def tokenizer
  raise NotImplementedError
end
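
A minimal sketch of a concrete subclass; MyTokenizer is hypothetical and assumes the tokenizers gem, whose #encode result exposes #ids and #tokens and whose tokenizer responds to #decode, matching the interface the methods above rely on:

require "tokenizers"

class MyTokenizer < DiscourseAi::Tokenizer::BasicTokenizer
  def self.tokenizer
    # memoize the loaded tokenizer; the model name is a placeholder
    @tokenizer ||= Tokenizers.from_pretrained("bert-base-uncased")
  end
end

MyTokenizer.size("hello world") # => token count under the loaded vocabulary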

.truncate(text, max_length, strict: false) ⇒ Object



# File 'lib/discourse_ai/tokenizer/basic_tokenizer.rb', line 39

def truncate(text, max_length, strict: false)
  return "" if max_length <= 0

  # fast track common case, /2 to handle unicode chars
  # that can take more than 1 token per char
  return text if !strict && text.size < max_length / 2

  # Take tokens up to max_length, decode, then ensure we don't exceed limit
  truncated_tokens = tokenizer.encode(text).ids.take(max_length)
  truncated_text = tokenizer.decode(truncated_tokens)

  # If re-encoding exceeds the limit, we need to further truncate
  while tokenizer.encode(truncated_text).ids.length > max_length
    truncated_tokens = truncated_tokens[0...-1]
    truncated_text = tokenizer.decode(truncated_tokens)
    break if truncated_tokens.empty?
  end

  truncated_text
end
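
A usage sketch (OpenAiTokenizer is illustrative); the result is a prefix of the input whose token count is at most max_length:

long_text = "lorem ipsum " * 1_000
short = DiscourseAi::Tokenizer::OpenAiTokenizer.truncate(long_text, 10)
DiscourseAi::Tokenizer::OpenAiTokenizer.size(short)
# => at most 10

DiscourseAi::Tokenizer::OpenAiTokenizer.truncate("hi", 10)
# => "hi" (fast path: 2 characters < 10 / 2, returned unchanged)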