Class: TokenizerRuby::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/tokenizer_ruby/tokenizer.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path_or_internal) ⇒ Tokenizer

Returns a new instance of Tokenizer.



5
6
7
8
9
10
11
# File 'lib/tokenizer_ruby/tokenizer.rb', line 5

# Wraps a native tokenizer. Accepts either a String path (loaded via
# InternalTokenizer.from_file) or an already-constructed internal tokenizer.
#
# @param path_or_internal [String, Object] file path or internal tokenizer instance
def initialize(path_or_internal)
  @inner =
    if path_or_internal.is_a?(String)
      InternalTokenizer.from_file(path_or_internal)
    else
      path_or_internal
    end
end

Class Method Details

.from_file(path) ⇒ Object



17
18
19
# File 'lib/tokenizer_ruby/tokenizer.rb', line 17

# Builds a Tokenizer whose native engine is loaded from a tokenizer file.
#
# @param path [String] path to the tokenizer definition file
# @return [Tokenizer]
def self.from_file(path)
  internal = InternalTokenizer.from_file(path)
  new(internal)
end

.from_pretrained(identifier) ⇒ Object



13
14
15
# File 'lib/tokenizer_ruby/tokenizer.rb', line 13

# Builds a Tokenizer from a pretrained model identifier.
# NOTE(review): resolution of the identifier (local cache vs. download)
# happens inside the native extension — confirm there.
#
# @param identifier [String] pretrained model name
# @return [Tokenizer]
def self.from_pretrained(identifier)
  internal = InternalTokenizer.from_pretrained(identifier)
  new(internal)
end

Instance Method Details

#count(text) ⇒ Object



81
82
83
# File 'lib/tokenizer_ruby/tokenizer.rb', line 81

# Number of tokens produced by encoding +text+ (no special tokens added,
# since #encode defaults add_special_tokens to false).
#
# @param text [String]
# @return [Integer]
def count(text)
  encoding = encode(text)
  encoding.length
end

#decode(ids, skip_special_tokens: true) ⇒ Object

Raises:

(TokenizerRuby::Error) — if ids is not an Array
(TokenizerRuby::TokenizationError) — if the underlying decoder fails



40
41
42
43
44
45
46
47
48
# File 'lib/tokenizer_ruby/tokenizer.rb', line 40

# Converts an array of token ids back into text.
#
# @param ids [Array<Integer>] token ids to decode
# @param skip_special_tokens [Boolean] whether special tokens are omitted
# @return [Object] whatever the native extension returns — presumably a String
# @raise [TokenizerRuby::Error] if ids is not an Array
# @raise [TokenizerRuby::TokenizationError] if the native decoder fails
def decode(ids, skip_special_tokens: true)
  unless ids.is_a?(Array)
    raise TokenizerRuby::Error, "decode expects an Array, got #{ids.class}"
  end

  begin
    @inner._decode(ids, skip_special_tokens)
  rescue => decode_error
    # Re-wrap native failures in the library's own error type.
    raise TokenizerRuby::TokenizationError, "failed to decode ids: #{decode_error.message}"
  end
end

#decode_batch(ids_array, skip_special_tokens: true) ⇒ Object



65
66
67
# File 'lib/tokenizer_ruby/tokenizer.rb', line 65

# Decodes a batch of id arrays in a single native call.
#
# Mirrors #decode's validation and error wrapping so batch and single-item
# paths fail the same way (previously this method validated nothing and
# leaked raw native exceptions).
#
# @param ids_array [Array<Array<Integer>>] one id array per sequence
# @param skip_special_tokens [Boolean] whether special tokens are omitted
# @return [Object] whatever the native extension returns — presumably an Array<String>
# @raise [TokenizerRuby::Error] if ids_array is not an Array
# @raise [TokenizerRuby::TokenizationError] if the native decoder fails
def decode_batch(ids_array, skip_special_tokens: true)
  raise TokenizerRuby::Error, "decode_batch expects an Array, got #{ids_array.class}" unless ids_array.is_a?(Array)

  begin
    @inner._decode_batch(ids_array, skip_special_tokens)
  rescue => e
    raise TokenizerRuby::TokenizationError, "failed to decode batch: #{e.message}"
  end
end

#disable_paddingObject



107
108
109
# File 'lib/tokenizer_ruby/tokenizer.rb', line 107

# Turns padding off on the underlying native tokenizer.
# Semantics live entirely in the native extension's _disable_padding.
def disable_padding
  @inner._disable_padding
end

#disable_truncationObject



99
100
101
# File 'lib/tokenizer_ruby/tokenizer.rb', line 99

# Turns truncation off on the underlying native tokenizer.
# Semantics live entirely in the native extension's _disable_truncation.
def disable_truncation
  @inner._disable_truncation
end

#enable_padding(length:, pad_token: "[PAD]") ⇒ Object



103
104
105
# File 'lib/tokenizer_ruby/tokenizer.rb', line 103

# Enables padding on the native tokenizer.
#
# @param length [Integer] target sequence length — assumed from the name; semantics
#   live in the native extension's _enable_padding, confirm there
# @param pad_token [String] token used to pad (defaults to "[PAD]")
def enable_padding(length:, pad_token: "[PAD]")
  @inner._enable_padding(length, pad_token)
end

#enable_truncation(max_length:) ⇒ Object



95
96
97
# File 'lib/tokenizer_ruby/tokenizer.rb', line 95

# Enables truncation on the native tokenizer.
#
# @param max_length [Integer] maximum sequence length — assumed from the name;
#   semantics live in the native extension's _enable_truncation, confirm there
def enable_truncation(max_length:)
  @inner._enable_truncation(max_length)
end

#encode(text, add_special_tokens: false) ⇒ Object

Raises:

(TokenizerRuby::Error) — if text is not a String
(TokenizerRuby::TokenizationError) — if the underlying encoder fails



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/tokenizer_ruby/tokenizer.rb', line 21

# Tokenizes +text+ and wraps the raw native result in an Encoding.
#
# @param text [String] input to tokenize
# @param add_special_tokens [Boolean] whether special tokens are inserted
# @return [Encoding]
# @raise [TokenizerRuby::Error] if text is not a String
# @raise [TokenizerRuby::TokenizationError] if the native encoder fails
def encode(text, add_special_tokens: false)
  unless text.is_a?(String)
    raise TokenizerRuby::Error, "encode expects a String, got #{text.class}"
  end

  begin
    raw = @inner._encode(text, add_special_tokens)
  rescue => encode_error
    raise TokenizerRuby::TokenizationError, "failed to encode text: #{encode_error.message}"
  end

  # Lift each field of the raw result hash into the Encoding wrapper.
  fields = %i[ids tokens offsets attention_mask type_ids special_tokens_mask word_ids]
  Encoding.new(**fields.to_h { |key| [key, raw[key]] })
end

#encode_batch(texts, add_special_tokens: false) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/tokenizer_ruby/tokenizer.rb', line 50

# Tokenizes a batch of strings in a single native call.
#
# Mirrors #encode's validation and error wrapping so batch and single-item
# paths fail the same way (previously this method validated nothing and
# leaked raw native exceptions).
#
# @param texts [Array<String>] inputs to tokenize
# @param add_special_tokens [Boolean] whether special tokens are inserted
# @return [Array<Encoding>] one Encoding per input
# @raise [TokenizerRuby::Error] if texts is not an Array
# @raise [TokenizerRuby::TokenizationError] if the native encoder fails
def encode_batch(texts, add_special_tokens: false)
  raise TokenizerRuby::Error, "encode_batch expects an Array, got #{texts.class}" unless texts.is_a?(Array)

  begin
    results = @inner._encode_batch(texts, add_special_tokens)
  rescue => e
    raise TokenizerRuby::TokenizationError, "failed to encode batch: #{e.message}"
  end

  results.map do |result|
    Encoding.new(
      ids: result[:ids],
      tokens: result[:tokens],
      offsets: result[:offsets],
      attention_mask: result[:attention_mask],
      type_ids: result[:type_ids],
      special_tokens_mask: result[:special_tokens_mask],
      word_ids: result[:word_ids]
    )
  end
end

#id_to_token(id) ⇒ Object



77
78
79
# File 'lib/tokenizer_ruby/tokenizer.rb', line 77

# Looks up the surface token for +id+ via the native tokenizer.
# NOTE(review): likely returns nil for unknown ids — confirm against InternalTokenizer.
def id_to_token(id)
  @inner.id_to_token(id)
end

#token_to_id(token) ⇒ Object



73
74
75
# File 'lib/tokenizer_ruby/tokenizer.rb', line 73

# Looks up the vocabulary id for +token+ via the native tokenizer.
# NOTE(review): likely returns nil for out-of-vocabulary tokens — confirm against InternalTokenizer.
def token_to_id(token)
  @inner.token_to_id(token)
end

#truncate(text, max_tokens:) ⇒ Object

Raises:

(TokenizerRuby::ConfigurationError) — if max_tokens is not positive



85
86
87
88
89
90
91
92
93
# File 'lib/tokenizer_ruby/tokenizer.rb', line 85

# Returns +text+ unchanged when it fits in +max_tokens+, otherwise decodes
# the first +max_tokens+ token ids back into a (possibly lossy) prefix.
#
# The guard now requires a positive Integer: previously `max_tokens > 0`
# raised NoMethodError for nil and ArgumentError for non-numeric values
# instead of the intended ConfigurationError (and a Float would later fail
# inside Array#[] with a TypeError).
#
# @param text [String] input text
# @param max_tokens [Integer] positive token budget
# @return [String] original text if within budget, otherwise the decoded prefix
# @raise [TokenizerRuby::ConfigurationError] if max_tokens is not a positive Integer
def truncate(text, max_tokens:)
  unless max_tokens.is_a?(Integer) && max_tokens.positive?
    raise TokenizerRuby::ConfigurationError, "max_tokens must be a positive Integer, got #{max_tokens.inspect}"
  end

  encoding = encode(text)
  # Within budget: return the input untouched to avoid a lossy decode round-trip.
  return text if encoding.length <= max_tokens

  decode(encoding.ids[0, max_tokens])
end

#vocab_sizeObject



69
70
71
# File 'lib/tokenizer_ruby/tokenizer.rb', line 69

# Vocabulary size as reported by the native tokenizer.
#
# @return [Integer] presumably; the value comes straight from the native extension
def vocab_size
  @inner.vocab_size
end