Class: TokenizerRuby::Tokenizer
- Inherits: Object
- Class hierarchy: Object → TokenizerRuby::Tokenizer
- Defined in:
- lib/tokenizer_ruby/tokenizer.rb
Class Method Summary collapse
Instance Method Summary collapse
- #count(text) ⇒ Object
- #decode(ids, skip_special_tokens: true) ⇒ Object
- #decode_batch(ids_array, skip_special_tokens: true) ⇒ Object
- #disable_padding ⇒ Object
- #disable_truncation ⇒ Object
- #enable_padding(length:, pad_token: "[PAD]") ⇒ Object
- #enable_truncation(max_length:) ⇒ Object
- #encode(text, add_special_tokens: false) ⇒ Object
- #encode_batch(texts, add_special_tokens: false) ⇒ Object
- #id_to_token(id) ⇒ Object
-
#initialize(path_or_internal) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
- #token_to_id(token) ⇒ Object
- #truncate(text, max_tokens:) ⇒ Object
- #vocab_size ⇒ Object
Constructor Details
#initialize(path_or_internal) ⇒ Tokenizer
Returns a new instance of Tokenizer.
5 6 7 8 9 10 11 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 5 def initialize(path_or_internal) if path_or_internal.is_a?(String) @inner = InternalTokenizer.from_file(path_or_internal) else @inner = path_or_internal end end |
Class Method Details
.from_file(path) ⇒ Object
17 18 19 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 17
#
# Loads a serialized tokenizer from +path+ and wraps it in a Tokenizer.
def self.from_file(path)
  inner = InternalTokenizer.from_file(path)
  new(inner)
end
.from_pretrained(identifier) ⇒ Object
13 14 15 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 13
#
# Downloads/loads a pretrained tokenizer by +identifier+ and wraps it in a
# Tokenizer.
def self.from_pretrained(identifier)
  inner = InternalTokenizer.from_pretrained(identifier)
  new(inner)
end
Instance Method Details
#count(text) ⇒ Object
81 82 83 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 81
#
# Returns the number of tokens produced when encoding +text+.
#
# Generalized with an +add_special_tokens+ keyword (default false, matching
# the previous behavior) so callers can count with special tokens included —
# previously the flag could not be forwarded to #encode at all.
def count(text, add_special_tokens: false)
  encode(text, add_special_tokens: add_special_tokens).length
end
#decode(ids, skip_special_tokens: true) ⇒ Object
40 41 42 43 44 45 46 47 48 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 40
#
# Decodes an Array of token ids back into a String.
#
# Raises TokenizerRuby::Error when +ids+ is not an Array (raised outside the
# rescue so a type error is never re-wrapped), and wraps any native decoding
# failure in TokenizerRuby::TokenizationError.
def decode(ids, skip_special_tokens: true)
  unless ids.is_a?(Array)
    raise TokenizerRuby::Error, "decode expects an Array, got #{ids.class}"
  end

  begin
    @inner._decode(ids, skip_special_tokens)
  rescue => err
    raise TokenizerRuby::TokenizationError, "failed to decode ids: #{err.message}"
  end
end
#decode_batch(ids_array, skip_special_tokens: true) ⇒ Object
65 66 67 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 65
#
# Decodes an Array of id-Arrays back into Strings.
#
# Brought in line with #decode: validates that +ids_array+ is an Array
# (raising TokenizerRuby::Error) and wraps native failures in
# TokenizerRuby::TokenizationError — previously native exceptions leaked
# out untranslated and non-Array input produced an opaque error.
def decode_batch(ids_array, skip_special_tokens: true)
  raise TokenizerRuby::Error, "decode_batch expects an Array, got #{ids_array.class}" unless ids_array.is_a?(Array)

  begin
    @inner._decode_batch(ids_array, skip_special_tokens)
  rescue => e
    raise TokenizerRuby::TokenizationError, "failed to decode batch: #{e.message}"
  end
end
#disable_padding ⇒ Object
107 108 109 |
# Turns off padding on the underlying native tokenizer.
# Pure delegation to @inner._disable_padding; return value is whatever the
# native call returns (not specified here — presumably nil/self; confirm
# against the native binding if callers depend on it).
# File 'lib/tokenizer_ruby/tokenizer.rb', line 107 def disable_padding @inner._disable_padding end
#disable_truncation ⇒ Object
99 100 101 |
# Turns off truncation on the underlying native tokenizer.
# Pure delegation to @inner._disable_truncation; return value is whatever
# the native call returns (unspecified from this view).
# File 'lib/tokenizer_ruby/tokenizer.rb', line 99 def disable_truncation @inner._disable_truncation end
#enable_padding(length:, pad_token: "[PAD]") ⇒ Object
103 104 105 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 103
#
# Enables fixed-length padding on the underlying tokenizer.
#
# +length+::    target padded length; must be a positive Integer.
# +pad_token+:: token used for padding (defaults to "[PAD]").
#
# Now validates +length+ up front, raising TokenizerRuby::ConfigurationError
# (consistent with #truncate) instead of passing a bad value through to the
# native layer where it would fail with an opaque error.
def enable_padding(length:, pad_token: "[PAD]")
  unless length.is_a?(Integer) && length > 0
    raise TokenizerRuby::ConfigurationError, "length must be a positive Integer, got #{length.inspect}"
  end

  @inner._enable_padding(length, pad_token)
end
#enable_truncation(max_length:) ⇒ Object
95 96 97 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 95
#
# Enables truncation at +max_length+ tokens on the underlying tokenizer.
#
# Now validates +max_length+ up front, raising
# TokenizerRuby::ConfigurationError (consistent with #truncate) instead of
# handing an invalid value to the native layer.
def enable_truncation(max_length:)
  unless max_length.is_a?(Integer) && max_length > 0
    raise TokenizerRuby::ConfigurationError, "max_length must be a positive Integer, got #{max_length.inspect}"
  end

  @inner._enable_truncation(max_length)
end
#encode(text, add_special_tokens: false) ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 21
#
# Encodes +text+ into an Encoding value object.
#
# Raises TokenizerRuby::Error for non-String input (outside the rescue, so
# it is never re-wrapped), and wraps native encoding failures in
# TokenizerRuby::TokenizationError. Missing result keys pass through as nil,
# exactly as with explicit result[:key] lookups.
def encode(text, add_special_tokens: false)
  unless text.is_a?(String)
    raise TokenizerRuby::Error, "encode expects a String, got #{text.class}"
  end

  result =
    begin
      @inner._encode(text, add_special_tokens)
    rescue => err
      raise TokenizerRuby::TokenizationError, "failed to encode text: #{err.message}"
    end

  fields = %i[ids tokens offsets attention_mask type_ids special_tokens_mask word_ids]
  Encoding.new(**fields.to_h { |key| [key, result[key]] })
end
#encode_batch(texts, add_special_tokens: false) ⇒ Object
50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 50
#
# Encodes an Array of Strings into an Array of Encoding value objects.
#
# Brought in line with #encode: validates that +texts+ is an Array (raising
# TokenizerRuby::Error) and wraps native failures in
# TokenizerRuby::TokenizationError — previously native exceptions leaked out
# untranslated and non-Array input produced an opaque error.
def encode_batch(texts, add_special_tokens: false)
  raise TokenizerRuby::Error, "encode_batch expects an Array, got #{texts.class}" unless texts.is_a?(Array)

  begin
    results = @inner._encode_batch(texts, add_special_tokens)
  rescue => e
    raise TokenizerRuby::TokenizationError, "failed to encode batch: #{e.message}"
  end

  results.map do |result|
    Encoding.new(
      ids: result[:ids],
      tokens: result[:tokens],
      offsets: result[:offsets],
      attention_mask: result[:attention_mask],
      type_ids: result[:type_ids],
      special_tokens_mask: result[:special_tokens_mask],
      word_ids: result[:word_ids]
    )
  end
end
#id_to_token(id) ⇒ Object
77 78 79 |
# Looks up the token string for a numeric +id+ in the vocabulary.
# Pure delegation to @inner.id_to_token; presumably returns nil for an
# unknown id — confirm against the native binding.
# File 'lib/tokenizer_ruby/tokenizer.rb', line 77 def id_to_token(id) @inner.id_to_token(id) end
#token_to_id(token) ⇒ Object
73 74 75 |
# Looks up the numeric vocabulary id for a +token+ string.
# Pure delegation to @inner.token_to_id; presumably returns nil for an
# unknown token — confirm against the native binding.
# File 'lib/tokenizer_ruby/tokenizer.rb', line 73 def token_to_id(token) @inner.token_to_id(token) end
#truncate(text, max_tokens:) ⇒ Object
85 86 87 88 89 90 91 92 93 |
# File 'lib/tokenizer_ruby/tokenizer.rb', line 85
#
# Truncates +text+ so its encoding contains at most +max_tokens+ tokens and
# returns the decoded result; returns +text+ unchanged when it already fits.
#
# Fix: +max_tokens+ is now required to be a positive Integer. Previously a
# nil or non-numeric value raised NoMethodError from the `> 0` comparison
# instead of the intended TokenizerRuby::ConfigurationError.
#
# NOTE(review): decoding a truncated id slice can cut a word at a subword
# boundary, so the returned text is best-effort.
def truncate(text, max_tokens:)
  unless max_tokens.is_a?(Integer) && max_tokens > 0
    raise TokenizerRuby::ConfigurationError, "max_tokens must be positive, got #{max_tokens}"
  end

  encoding = encode(text)
  return text if encoding.length <= max_tokens

  decode(encoding.ids.first(max_tokens))
end
#vocab_size ⇒ Object
69 70 71 |
# Returns the size of the tokenizer's vocabulary.
# Pure delegation to @inner.vocab_size.
# File 'lib/tokenizer_ruby/tokenizer.rb', line 69 def vocab_size @inner.vocab_size end