Class: Informers::AutoTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/informers/tokenizers.rb

Constant Summary collapse

# Lookup table from a tokenizer class name (String) to the tokenizer class
# registered under that name.
#
# `from_pretrained` indexes this hash with the "tokenizer_class" value read
# from the tokenizer config (after removing a trailing "Fast") and uses
# PreTrainedTokenizer for any name that has no entry here.
#
# NOTE(review): the hash is not frozen, so it can be modified at runtime —
# confirm whether callers rely on that before freezing it.
TOKENIZER_CLASS_MAPPING =
{
  "T5Tokenizer" => T5Tokenizer,
  "BertTokenizer" => BertTokenizer,
  "DebertaV2Tokenizer" => DebertaV2Tokenizer,
  "DistilBertTokenizer" => DistilBertTokenizer,
  "BartTokenizer" => BartTokenizer,
  "RobertaTokenizer" => RobertaTokenizer,
  "XLMRobertaTokenizer" => XLMRobertaTokenizer,
  "MPNetTokenizer" => MPNetTokenizer,
  "CLIPTokenizer" => CLIPTokenizer,
  "GPT2Tokenizer" => GPT2Tokenizer,
  "NllbTokenizer" => NllbTokenizer,
  "M2M100Tokenizer" => M2M100Tokenizer
}

Class Method Summary collapse

Class Method Details

.from_pretrained(pretrained_model_name_or_path, quantized: true, progress_callback: nil, config: nil, cache_dir: nil, local_files_only: false, revision: "main", legacy: nil, **kwargs) ⇒ Object



263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# File 'lib/informers/tokenizers.rb', line 263

# Loads a model's tokenizer data and builds the matching tokenizer object.
#
# The tokenizer data comes from `load_tokenizer`. The class to instantiate is
# taken from the "tokenizer_class" entry of the tokenizer config: a trailing
# "Fast" is dropped and the result is looked up in TOKENIZER_CLASS_MAPPING.
# Names without an entry are built with PreTrainedTokenizer.
#
# @param pretrained_model_name_or_path model identifier, forwarded to `load_tokenizer`
# @param quantized forwarded to `load_tokenizer` (default: true)
# @param progress_callback forwarded to `load_tokenizer` (default: nil)
# @param config forwarded to `load_tokenizer` (default: nil)
# @param cache_dir forwarded to `load_tokenizer` (default: nil)
# @param local_files_only forwarded to `load_tokenizer` (default: false)
# @param revision forwarded to `load_tokenizer` (default: "main")
# @param legacy forwarded to `load_tokenizer`, which writes any non-nil value
#   into the tokenizer config under "legacy" (default: nil)
# @param kwargs extra keyword arguments; accepted but not read by this method
# @return [Object] a new instance of the selected tokenizer class, constructed
#   with the tokenizer data and the tokenizer config
def self.from_pretrained(
  pretrained_model_name_or_path,
  quantized: true,
  progress_callback: nil,
  config: nil,
  cache_dir: nil,
  local_files_only: false,
  revision: "main",
  legacy: nil,
  **kwargs
)
  tokenizer_json, tokenizer_config = load_tokenizer(
    pretrained_model_name_or_path,
    quantized:,
    progress_callback:,
    config:,
    cache_dir:,
    local_files_only:,
    revision:,
    legacy:
  )

  base_class_name = "PreTrainedTokenizer"

  # Some tokenizers are saved with the "Fast" suffix, so we remove that if present.
  # A config without a "tokenizer_class" entry resolves to the base class name.
  tokenizer_name = tokenizer_config["tokenizer_class"]&.delete_suffix("Fast") || base_class_name

  cls = TOKENIZER_CLASS_MAPPING[tokenizer_name]
  unless cls
    # Only warn for names that are genuinely unrecognized. When the name is the
    # base class itself (explicitly, or because the config had no class),
    # falling back to it is the expected outcome, not an "unknown class".
    if tokenizer_name != base_class_name
      warn "Unknown tokenizer class #{tokenizer_name.inspect}, attempting to construct from base class."
    end
    cls = PreTrainedTokenizer
  end
  cls.new(tokenizer_json, tokenizer_config)
end

.load_tokenizer(pretrained_model_name_or_path, **options) ⇒ Object



296
297
298
299
300
301
302
303
304
305
306
307
# File 'lib/informers/tokenizers.rb', line 296

# Fetches the two tokenizer resources for a model and returns them as a pair.
#
# "tokenizer.json" is requested through Utils::Hub.get_model_file and
# "tokenizer_config.json" through Utils::Hub.get_model_json; both calls
# receive `true` as their third argument and all of `options`.
#
# @param pretrained_model_name_or_path model identifier, forwarded to both Hub calls
# @param options keyword options forwarded to both Hub calls; a non-nil
#   `:legacy` value is additionally stored in the config under "legacy"
# @return [Array] two elements: the result of get_model_file, then the
#   (possibly updated) result of get_model_json
def self.load_tokenizer(pretrained_model_name_or_path, **options)
  tokenizer_file = Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options)
  tokenizer_config = Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)

  # An explicitly supplied :legacy option (including false) takes precedence
  # over whatever the config file contains; nil leaves the config untouched.
  legacy_override = options[:legacy]
  tokenizer_config["legacy"] = legacy_override unless legacy_override.nil?

  [tokenizer_file, tokenizer_config]
end