Class: Transformers::PreTrainedTokenizerFast
- Inherits: PreTrainedTokenizerBase
  - Object
  - PreTrainedTokenizerBase
  - Transformers::PreTrainedTokenizerFast
- Defined in: lib/transformers/tokenization_utils_fast.rb
Direct Known Subclasses
Bert::BertTokenizerFast, Distilbert::DistilBertTokenizerFast
Constant Summary
Constants included from SpecialTokensMixin
SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES
Instance Attribute Summary
Attributes inherited from PreTrainedTokenizerBase
#init_kwargs, #model_max_length
Instance Method Summary
- #_convert_token_to_id_with_added_voc(token) ⇒ Object
- #convert_ids_to_tokens(ids, skip_special_tokens: false) ⇒ Object
- #convert_tokens_to_ids(tokens) ⇒ Object
- #get_vocab ⇒ Object
- #initialize(*args, **kwargs) ⇒ PreTrainedTokenizerFast (constructor)
  A new instance of PreTrainedTokenizerFast.
- #is_fast ⇒ Object
- #vocab ⇒ Object
Methods inherited from PreTrainedTokenizerBase
#_eventual_warn_about_too_long_sequence, _from_pretrained, #call, from_pretrained
Methods included from ClassAttribute
Methods included from SpecialTokensMixin
#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id
Constructor Details
#initialize(*args, **kwargs) ⇒ PreTrainedTokenizerFast
Returns a new instance of PreTrainedTokenizerFast.
# File 'lib/transformers/tokenization_utils_fast.rb', line 17

def initialize(*args, **kwargs)
  tokenizer_object = kwargs.delete(:tokenizer_object)
  slow_tokenizer = kwargs.delete(:__slow_tokenizer)
  fast_tokenizer_file = kwargs.delete(:tokenizer_file)
  from_slow = kwargs.delete(:from_slow) { false }
  _added_tokens_decoder = kwargs.delete(:added_tokens_decoder)

  if !tokenizer_object.nil?
    fast_tokenizer = Copy.deepcopy(tokenizer_object)
  elsif !fast_tokenizer_file.nil? && !from_slow
    # We have a serialization from tokenizers which let us directly build the backend
    fast_tokenizer = Tokenizers::Tokenizer.from_file(fast_tokenizer_file)
  elsif !slow_tokenizer.nil?
    # We need to convert a slow tokenizer to build the backend
    fast_tokenizer = ConvertSlowTokenizer.convert_slow_tokenizer(slow_tokenizer)
  elsif !@slow_tokenizer_class.nil?
    # We need to create and convert a slow tokenizer to build the backend
    slow_tokenizer = @slow_tokenizer_class.new(*args, **kwargs)
    fast_tokenizer = ConvertSlowTokenizer.convert_slow_tokenizer(slow_tokenizer)
  else
    raise ArgumentError, <<~MSG
      Couldn't instantiate the backend tokenizer from one of:
      (1) a `tokenizers` library serialization file,
      (2) a slow tokenizer instance to convert or
      (3) an equivalent slow tokenizer class to instantiate and convert.
      You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
    MSG
  end

  @tokenizer = fast_tokenizer

  if !slow_tokenizer.nil?
    kwargs.merge!(slow_tokenizer.init_kwargs)
  end

  @decode_use_source_tokenizer = false

  _truncation = @tokenizer.truncation

  if !_truncation.nil?
    _truncation = _truncation.transform_keys(&:to_sym)
    @tokenizer.enable_truncation(_truncation[:max_length], **_truncation.except(:max_length))
    kwargs[:max_length] ||= _truncation[:max_length]
    kwargs[:truncation_side] ||= _truncation[:direction]
    kwargs[:stride] ||= _truncation[:stride]
    kwargs[:truncation_strategy] ||= _truncation[:strategy]
  else
    @tokenizer.no_truncation
  end

  _padding = @tokenizer.padding
  if !_padding.nil?
    _padding = _padding.transform_keys(&:to_sym)
    @tokenizer.enable_padding(**_padding)
    kwargs[:pad_token] ||= _padding[:pad_token]
    kwargs[:pad_token_type_id] ||= _padding[:pad_token_type_id]
    kwargs[:padding_side] ||= _padding[:direction]
    kwargs[:max_length] ||= _padding[:length]
    kwargs[:pad_to_multiple_of] ||= _padding[:pad_to_multiple_of]
  end

  # We call this after having initialized the backend tokenizer because we update it.
  super(**kwargs)
end
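The constructor must be able to build a backend tokenizer from one of the three sources named in the error message above. A minimal sketch of the two most common entry points (the file path and checkpoint name are illustrative, not part of this API):

# Build directly from a `tokenizers` serialization file (illustrative path):
tokenizer = Transformers::PreTrainedTokenizerFast.new(
  tokenizer_file: "path/to/tokenizer.json"
)

# More commonly, a subclass is loaded through the inherited from_pretrained:
tokenizer = Transformers::Bert::BertTokenizerFast.from_pretrained("bert-base-uncased")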
Instance Method Details
#_convert_token_to_id_with_added_voc(token) ⇒ Object
# File 'lib/transformers/tokenization_utils_fast.rb', line 110

def _convert_token_to_id_with_added_voc(token)
  index = @tokenizer.token_to_id(token)
  if index.nil?
    # Token is not in the vocabulary (added tokens included): fall back to the unknown-token id
    return unk_token_id
  end
  index
end
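A usage sketch, assuming `tokenizer` is a loaded fast tokenizer as in the constructor example above (the token strings are illustrative):

tokenizer._convert_token_to_id_with_added_voc("hello")
# => an Integer id from the backend vocabulary

tokenizer._convert_token_to_id_with_added_voc("no-such-token")
# => tokenizer.unk_token_id, since token_to_id returned nil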
#convert_ids_to_tokens(ids, skip_special_tokens: false) ⇒ Object
# File 'lib/transformers/tokenization_utils_fast.rb', line 118

def convert_ids_to_tokens(ids, skip_special_tokens: false)
  # A single Integer id maps to a single token string
  if ids.is_a?(Integer)
    return @tokenizer.id_to_token(ids)
  end
  tokens = []
  ids.each do |index|
    index = index.to_i
    # Optionally skip ids registered as special tokens
    if skip_special_tokens && @all_special_ids.include?(index)
      next
    end
    tokens << @tokenizer.id_to_token(index)
  end
  tokens
end
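A sketch of both call shapes, assuming a BERT-style checkpoint (the ids and outputs are illustrative and depend on the vocabulary):

tokenizer.convert_ids_to_tokens(7592)
# => "hello" (a single id returns a single token)

tokenizer.convert_ids_to_tokens([101, 7592, 102], skip_special_tokens: true)
# => ["hello"] (the [CLS] and [SEP] ids are skipped)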
#convert_tokens_to_ids(tokens) ⇒ Object
# File 'lib/transformers/tokenization_utils_fast.rb', line 94

def convert_tokens_to_ids(tokens)
  if tokens.nil?
    return nil
  end

  # A single token string maps to a single id
  if tokens.is_a?(String)
    return _convert_token_to_id_with_added_voc(tokens)
  end

  ids = []
  tokens.each do |token|
    ids << _convert_token_to_id_with_added_voc(token)
  end
  ids
end
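The method mirrors its inverse above: nil passes through, a String returns a single id, and an Array returns an Array. A sketch (token strings are illustrative):

tokenizer.convert_tokens_to_ids(nil)                 # => nil
tokenizer.convert_tokens_to_ids("hello")             # => a single Integer id
tokenizer.convert_tokens_to_ids(["hello", "world"])  # => an Array of Integer ids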
#get_vocab ⇒ Object
# File 'lib/transformers/tokenization_utils_fast.rb', line 86

def get_vocab
  @tokenizer.vocab(with_added_tokens: true)
end
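A sketch of reading the vocabulary (the token and size shown are illustrative of a BERT-style checkpoint):

vocab = tokenizer.get_vocab   # Hash mapping token => id, added tokens included
vocab["hello"]                # => an Integer id; the value depends on the checkpoint
vocab.size                    # => e.g. 30522 for bert-base-uncased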
#is_fast ⇒ Object
# File 'lib/transformers/tokenization_utils_fast.rb', line 82

def is_fast
  true
end
#vocab ⇒ Object
# File 'lib/transformers/tokenization_utils_fast.rb', line 90

def vocab
  get_vocab
end
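Since #vocab simply delegates to #get_vocab, the two calls are interchangeable:

tokenizer.vocab == tokenizer.get_vocab
# => true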