Class: Informers::AutoTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/informers/tokenizers.rb

Constant Summary collapse

# Lookup table from a tokenizer class name (the "tokenizer_class" value read
# from a model's tokenizer_config.json) to the Ruby class that implements it.
# AutoTokenizer.from_pretrained strips a trailing "Fast" from the configured
# name before consulting this table, and falls back to PreTrainedTokenizer
# (with a warning) for names that are not listed here.
TOKENIZER_CLASS_MAPPING =
{
  "T5Tokenizer" => T5Tokenizer,
  "BertTokenizer" => BertTokenizer,
  "DebertaV2Tokenizer" => DebertaV2Tokenizer,
  "DistilBertTokenizer" => DistilBertTokenizer,
  "BartTokenizer" => BartTokenizer,
  "RobertaTokenizer" => RobertaTokenizer,
  "XLMRobertaTokenizer" => XLMRobertaTokenizer,
  "MPNetTokenizer" => MPNetTokenizer,
  "CLIPTokenizer" => CLIPTokenizer,
  "GPT2Tokenizer" => GPT2Tokenizer,
  "NllbTokenizer" => NllbTokenizer,
  "M2M100Tokenizer" => M2M100Tokenizer,
  "SpeechT5Tokenizer" => SpeechT5Tokenizer,
  # Generic base class; also the name assumed when the config declares no class.
  "PreTrainedTokenizer" => PreTrainedTokenizer
}

Class Method Summary collapse

Class Method Details

.from_pretrained(pretrained_model_name_or_path, quantized: true, progress_callback: nil, config: nil, cache_dir: nil, local_files_only: false, revision: "main", legacy: nil, **kwargs) ⇒ Object



268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# File 'lib/informers/tokenizers.rb', line 268

# Loads the tokenizer data for a model and instantiates the tokenizer class
# named in its configuration.
#
# The named keyword options are forwarded unchanged to +load_tokenizer+.
# Any additional keywords (+kwargs+) are accepted but not used by this method.
#
# @param pretrained_model_name_or_path model identifier or path, passed
#   through to +load_tokenizer+
# @param quantized forwarded to +load_tokenizer+ (default: true)
# @param progress_callback forwarded to +load_tokenizer+ (default: nil)
# @param config forwarded to +load_tokenizer+ (default: nil)
# @param cache_dir forwarded to +load_tokenizer+ (default: nil)
# @param local_files_only forwarded to +load_tokenizer+ (default: false)
# @param revision forwarded to +load_tokenizer+ (default: "main")
# @param legacy forwarded to +load_tokenizer+ (default: nil)
# @return an instance of the class registered in TOKENIZER_CLASS_MAPPING for
#   the configured name, or of PreTrainedTokenizer when the name is missing
#   or not registered
def self.from_pretrained(
  pretrained_model_name_or_path,
  quantized: true,
  progress_callback: nil,
  config: nil,
  cache_dir: nil,
  local_files_only: false,
  revision: "main",
  legacy: nil,
  **kwargs
)
  load_options = {
    quantized: quantized,
    progress_callback: progress_callback,
    config: config,
    cache_dir: cache_dir,
    local_files_only: local_files_only,
    revision: revision,
    legacy: legacy
  }
  tokenizer_json, tokenizer_config = load_tokenizer(pretrained_model_name_or_path, **load_options)

  # The config may name the "Fast" variant (e.g. "BertTokenizerFast"); the
  # mapping is keyed by the name without that suffix. A config with no
  # declared class resolves to the generic base tokenizer.
  declared_class = tokenizer_config["tokenizer_class"]
  tokenizer_name =
    if declared_class.nil?
      "PreTrainedTokenizer"
    else
      declared_class.delete_suffix("Fast")
    end

  tokenizer_class = TOKENIZER_CLASS_MAPPING[tokenizer_name]
  unless tokenizer_class
    # Unregistered names are not fatal: warn and build the base class instead.
    warn "Unknown tokenizer class #{tokenizer_name.inspect}, attempting to construct from base class."
    tokenizer_class = PreTrainedTokenizer
  end

  tokenizer_class.new(tokenizer_json, tokenizer_config)
end

.load_tokenizer(pretrained_model_name_or_path, **options) ⇒ Object



301
302
303
304
305
306
307
308
309
310
311
312
# File 'lib/informers/tokenizers.rb', line 301

# Fetches the two tokenizer artifacts for a model through Utils::Hub.
#
# "tokenizer.json" is requested with +get_model_file+ and
# "tokenizer_config.json" with +get_model_json+; both calls receive +true+ as
# their third argument along with every entry of +options+.
# NOTE(review): the third argument presumably makes a missing file fatal;
# confirm against Utils::Hub.
#
# @param pretrained_model_name_or_path model identifier or path handed to
#   Utils::Hub
# @param options keyword options forwarded to both Utils::Hub calls; a
#   non-nil +:legacy+ value is additionally written into the loaded config
# @return [Array] two elements: the +get_model_file+ result followed by the
#   +get_model_json+ result
def self.load_tokenizer(pretrained_model_name_or_path, **options)
  tokenizer_file = Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options)
  tokenizer_config = Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)

  # An explicit legacy setting (including false) overrides whatever the
  # loaded config contains; nil leaves the config untouched.
  legacy_override = options[:legacy]
  tokenizer_config["legacy"] = legacy_override unless legacy_override.nil?

  [tokenizer_file, tokenizer_config]
end