Class: TokenizerRuby::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/tokenizer_ruby/tokenizer.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path_or_internal) ⇒ Tokenizer

Returns a new instance of Tokenizer.



5
6
7
8
9
10
11
# File 'lib/tokenizer_ruby/tokenizer.rb', line 5

# Wraps a native tokenizer. Accepts either a String path (loaded via
# InternalTokenizer.from_file) or an already-constructed internal tokenizer.
#
# @param path_or_internal [String, Object] file path or internal tokenizer instance
def initialize(path_or_internal)
  @inner =
    if path_or_internal.is_a?(String)
      InternalTokenizer.from_file(path_or_internal)
    else
      path_or_internal
    end
end

Class Method Details

.from_file(path) ⇒ Object



17
18
19
# File 'lib/tokenizer_ruby/tokenizer.rb', line 17

# Builds a Tokenizer whose native engine is loaded from a tokenizer file.
#
# @param path [String] path to the tokenizer definition file
# @return [Tokenizer]
def self.from_file(path)
  internal = InternalTokenizer.from_file(path)
  new(internal)
end

.from_pretrained(identifier) ⇒ Object



13
14
15
# File 'lib/tokenizer_ruby/tokenizer.rb', line 13

# Builds a Tokenizer from a pretrained model identifier.
# NOTE(review): resolution of the identifier (local cache vs. download)
# happens inside the native extension — confirm there.
#
# @param identifier [String] pretrained model name
# @return [Tokenizer]
def self.from_pretrained(identifier)
  internal = InternalTokenizer.from_pretrained(identifier)
  new(internal)
end

Instance Method Details

#count(text) ⇒ Object



81
82
83
# File 'lib/tokenizer_ruby/tokenizer.rb', line 81

# Number of tokens produced by encoding +text+ (no special tokens added,
# since #encode defaults add_special_tokens to false).
#
# @param text [String]
# @return [Integer]
def count(text)
  encoding = encode(text)
  encoding.length
end

#decode(ids, skip_special_tokens: true) ⇒ Object

Raises:

(TokenizerRuby::Error) — if ids is not an Array
(TokenizerRuby::TokenizationError) — if the underlying decoder fails



40
41
42
43
44
45
46
47
48
# File 'lib/tokenizer_ruby/tokenizer.rb', line 40

# Converts an array of token ids back into text.
#
# @param ids [Array<Integer>] token ids to decode
# @param skip_special_tokens [Boolean] whether special tokens are omitted
# @return [Object] whatever the native extension returns — presumably a String
# @raise [TokenizerRuby::Error] if ids is not an Array
# @raise [TokenizerRuby::TokenizationError] if the native decoder fails
def decode(ids, skip_special_tokens: true)
  unless ids.is_a?(Array)
    raise TokenizerRuby::Error, "decode expects an Array, got #{ids.class}"
  end

  begin
    @inner._decode(ids, skip_special_tokens)
  rescue => decode_error
    # Re-wrap native failures in the library's own error type.
    raise TokenizerRuby::TokenizationError, "failed to decode ids: #{decode_error.message}"
  end
end

#decode_batch(ids_array, skip_special_tokens: true) ⇒ Object



65
66
67
# File 'lib/tokenizer_ruby/tokenizer.rb', line 65

# Decodes a batch of id arrays in a single native call.
#
# Mirrors #decode's validation and error wrapping so batch and single-item
# paths fail the same way (previously this method validated nothing and
# leaked raw native exceptions).
#
# @param ids_array [Array<Array<Integer>>] one id array per sequence
# @param skip_special_tokens [Boolean] whether special tokens are omitted
# @return [Object] whatever the native extension returns — presumably an Array<String>
# @raise [TokenizerRuby::Error] if ids_array is not an Array
# @raise [TokenizerRuby::TokenizationError] if the native decoder fails
def decode_batch(ids_array, skip_special_tokens: true)
  raise TokenizerRuby::Error, "decode_batch expects an Array, got #{ids_array.class}" unless ids_array.is_a?(Array)

  begin
    @inner._decode_batch(ids_array, skip_special_tokens)
  rescue => e
    raise TokenizerRuby::TokenizationError, "failed to decode batch: #{e.message}"
  end
end

#disable_paddingObject



107
108
109
# File 'lib/tokenizer_ruby/tokenizer.rb', line 107

# Turns padding off on the underlying native tokenizer.
# Semantics live entirely in the native extension's _disable_padding.
def disable_padding
  @inner._disable_padding
end

#disable_truncationObject



99
100
101
# File 'lib/tokenizer_ruby/tokenizer.rb', line 99

# Turns truncation off on the underlying native tokenizer.
# Semantics live entirely in the native extension's _disable_truncation.
def disable_truncation
  @inner._disable_truncation
end

#enable_padding(length:, pad_token: "[PAD]") ⇒ Object



103
104
105
# File 'lib/tokenizer_ruby/tokenizer.rb', line 103

# Enables padding on the native tokenizer.
#
# @param length [Integer] target sequence length — assumed from the name; semantics
#   live in the native extension's _enable_padding, confirm there
# @param pad_token [String] token used to pad (defaults to "[PAD]")
def enable_padding(length:, pad_token: "[PAD]")
  @inner._enable_padding(length, pad_token)
end

#enable_truncation(max_length:) ⇒ Object



95
96
97
# File 'lib/tokenizer_ruby/tokenizer.rb', line 95

# Enables truncation on the native tokenizer.
#
# @param max_length [Integer] maximum sequence length — assumed from the name;
#   semantics live in the native extension's _enable_truncation, confirm there
def enable_truncation(max_length:)
  @inner._enable_truncation(max_length)
end

#encode(text, add_special_tokens: false) ⇒ Object

Raises:

(TokenizerRuby::Error) — if text is not a String
(TokenizerRuby::TokenizationError) — if the underlying encoder fails



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/tokenizer_ruby/tokenizer.rb', line 21

# Tokenizes +text+ and wraps the raw native result in an Encoding.
#
# @param text [String] input to tokenize
# @param add_special_tokens [Boolean] whether special tokens are inserted
# @return [Encoding]
# @raise [TokenizerRuby::Error] if text is not a String
# @raise [TokenizerRuby::TokenizationError] if the native encoder fails
def encode(text, add_special_tokens: false)
  unless text.is_a?(String)
    raise TokenizerRuby::Error, "encode expects a String, got #{text.class}"
  end

  begin
    raw = @inner._encode(text, add_special_tokens)
  rescue => encode_error
    raise TokenizerRuby::TokenizationError, "failed to encode text: #{encode_error.message}"
  end

  # Lift each field of the raw result hash into the Encoding wrapper.
  fields = %i[ids tokens offsets attention_mask type_ids special_tokens_mask word_ids]
  Encoding.new(**fields.to_h { |key| [key, raw[key]] })
end

#encode_batch(texts, add_special_tokens: false) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/tokenizer_ruby/tokenizer.rb', line 50

# Tokenizes a batch of strings in a single native call.
#
# Mirrors #encode's validation and error wrapping so batch and single-item
# paths fail the same way (previously this method validated nothing and
# leaked raw native exceptions).
#
# @param texts [Array<String>] inputs to tokenize
# @param add_special_tokens [Boolean] whether special tokens are inserted
# @return [Array<Encoding>] one Encoding per input
# @raise [TokenizerRuby::Error] if texts is not an Array
# @raise [TokenizerRuby::TokenizationError] if the native encoder fails
def encode_batch(texts, add_special_tokens: false)
  raise TokenizerRuby::Error, "encode_batch expects an Array, got #{texts.class}" unless texts.is_a?(Array)

  begin
    results = @inner._encode_batch(texts, add_special_tokens)
  rescue => e
    raise TokenizerRuby::TokenizationError, "failed to encode batch: #{e.message}"
  end

  results.map do |result|
    Encoding.new(
      ids: result[:ids],
      tokens: result[:tokens],
      offsets: result[:offsets],
      attention_mask: result[:attention_mask],
      type_ids: result[:type_ids],
      special_tokens_mask: result[:special_tokens_mask],
      word_ids: result[:word_ids]
    )
  end
end

#id_to_token(id) ⇒ Object



77
78
79
# File 'lib/tokenizer_ruby/tokenizer.rb', line 77

# Looks up the surface token for +id+ via the native tokenizer.
# NOTE(review): likely returns nil for unknown ids — confirm against InternalTokenizer.
def id_to_token(id)
  @inner.id_to_token(id)
end

#token_to_id(token) ⇒ Object



73
74
75
# File 'lib/tokenizer_ruby/tokenizer.rb', line 73

# Looks up the vocabulary id for +token+ via the native tokenizer.
# NOTE(review): likely returns nil for out-of-vocabulary tokens — confirm against InternalTokenizer.
def token_to_id(token)
  @inner.token_to_id(token)
end

#truncate(text, max_tokens:) ⇒ Object

Raises:

(TokenizerRuby::ConfigurationError) — if max_tokens is not positive



85
86
87
88
89
90
91
92
93
# File 'lib/tokenizer_ruby/tokenizer.rb', line 85

# Returns +text+ unchanged when it fits in +max_tokens+, otherwise decodes
# the first +max_tokens+ token ids back into a (possibly lossy) prefix.
#
# The guard now requires a positive Integer: previously `max_tokens > 0`
# raised NoMethodError for nil and ArgumentError for non-numeric values
# instead of the intended ConfigurationError (and a Float would later fail
# inside Array#[] with a TypeError).
#
# @param text [String] input text
# @param max_tokens [Integer] positive token budget
# @return [String] original text if within budget, otherwise the decoded prefix
# @raise [TokenizerRuby::ConfigurationError] if max_tokens is not a positive Integer
def truncate(text, max_tokens:)
  unless max_tokens.is_a?(Integer) && max_tokens.positive?
    raise TokenizerRuby::ConfigurationError, "max_tokens must be a positive Integer, got #{max_tokens.inspect}"
  end

  encoding = encode(text)
  # Within budget: return the input untouched to avoid a lossy decode round-trip.
  return text if encoding.length <= max_tokens

  decode(encoding.ids[0, max_tokens])
end

#vocab_sizeObject



69
70
71
# File 'lib/tokenizer_ruby/tokenizer.rb', line 69

# Vocabulary size as reported by the native tokenizer.
#
# @return [Integer] presumably; the value comes straight from the native extension
def vocab_size
  @inner.vocab_size
end