Class: Informers::PreTrainedTokenizer

Inherits: Object
Defined in:
lib/informers/tokenizers.rb

Instance Attribute Summary

  #sep_token_id ⇒ Object (readonly)

Instance Method Summary

  #initialize(tokenizer_json, tokenizer_config) ⇒ PreTrainedTokenizer (constructor)
  #call(text, text_pair: nil, add_special_tokens: true, padding: false, truncation: nil, max_length: nil, return_tensor: true, return_token_type_ids: true, return_offsets: false) ⇒ Object
  #convert_tokens_to_string(tokens) ⇒ Object
  #decode(tokens, skip_special_tokens:) ⇒ Object

Constructor Details

#initialize(tokenizer_json, tokenizer_config) ⇒ PreTrainedTokenizer

Returns a new instance of PreTrainedTokenizer. tokenizer_json is the path to a tokenizer.json file, and tokenizer_config is the parsed tokenizer_config.json hash.



# File 'lib/informers/tokenizers.rb', line 5

def initialize(tokenizer_json, tokenizer_config)
  super()

  @tokenizer = Tokenizers::Tokenizer.from_file(tokenizer_json)

  @sep_token = tokenizer_config["sep_token"]
  @sep_token_id = @tokenizer.token_to_id(@sep_token)

  @model_max_length = tokenizer_config["model_max_length"]
end
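
For illustration, a minimal construction sketch; the file paths are hypothetical, and the config is assumed to be the parsed tokenizer_config.json from a Hugging Face model repo:

require "json"

# Hypothetical paths to files downloaded from a model repo
tokenizer_config = JSON.parse(File.read("path/to/tokenizer_config.json"))
tokenizer = Informers::PreTrainedTokenizer.new(
  "path/to/tokenizer.json",
  tokenizer_config
)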

Instance Attribute Details

#sep_token_id ⇒ Object (readonly)

Returns the id of the separator token, resolved from the sep_token entry of the tokenizer config.



# File 'lib/informers/tokenizers.rb', line 3

def sep_token_id
  @sep_token_id
end
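
For example, with a BERT-style config where "sep_token" is "[SEP]", this is the vocabulary id of "[SEP]"; the value varies by model:

tokenizer.sep_token_id # => 102 with bert-base-uncased's vocabulary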

Instance Method Details

#call(text, text_pair: nil, add_special_tokens: true, padding: false, truncation: nil, max_length: nil, return_tensor: true, return_token_type_ids: true, return_offsets: false) ⇒ Object



# File 'lib/informers/tokenizers.rb', line 16

def call(
  text,
  text_pair: nil,
  add_special_tokens: true,
  padding: false,
  truncation: nil,
  max_length: nil,
  return_tensor: true,
  return_token_type_ids: true, # TODO change default
  return_offsets: false
)
  is_batched = text.is_a?(Array)

  if is_batched
    if text.length == 0
      raise Error, "text array must be non-empty"
    end

    if !text_pair.nil?
      if !text_pair.is_a?(Array)
        raise Error, "text_pair must also be an array"
      elsif text.length != text_pair.length
        raise Error, "text and text_pair must have the same length"
      end
    end
  end

  if padding
    @tokenizer.enable_padding
  else
    @tokenizer.no_padding
  end

  if truncation
    @tokenizer.enable_truncation(max_length || @model_max_length)
  else
    @tokenizer.no_truncation
  end

  if is_batched
    input = text_pair ? text.zip(text_pair) : text
    encoded = @tokenizer.encode_batch(input, add_special_tokens: add_special_tokens)
  else
    encoded = [@tokenizer.encode(text, text_pair, add_special_tokens: add_special_tokens)]
  end

  result = {input_ids: encoded.map(&:ids), attention_mask: encoded.map(&:attention_mask)}
  if return_token_type_ids
    result[:token_type_ids] = encoded.map(&:type_ids)
  end
  if return_offsets
    result[:offsets] = encoded.map(&:offsets)
  end
  result
end
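
A usage sketch, assuming the tokenizer constructed above; the exact ids and lengths depend on the vocabulary:

# A single string is still returned as a batch of one.
enc = tokenizer.call("Hello world")
enc[:input_ids].size # => 1, one array of ids per input
enc[:attention_mask] # => e.g. [[1, 1, 1, 1]] for a four-token encoding

# Batched input with sentence pairs, padding, and truncation.
enc = tokenizer.call(
  ["How many people live in London?"],
  text_pair: ["Around 9 million people live in London."],
  padding: true,
  truncation: true
)
enc[:token_type_ids] # 0 for first-segment tokens, 1 for the pair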

#convert_tokens_to_string(tokens) ⇒ Object



# File 'lib/informers/tokenizers.rb', line 76

def convert_tokens_to_string(tokens)
  @tokenizer.decoder.decode(tokens)
end
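
A sketch assuming a WordPiece-style decoder, where "##" marks a subword continuation; the exact joining rule depends on the tokenizer's configured decoder:

tokenizer.convert_tokens_to_string(["hello", "##world"]) # => "helloworld"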

#decode(tokens, skip_special_tokens:) ⇒ Object



# File 'lib/informers/tokenizers.rb', line 72

def decode(tokens, skip_special_tokens:)
  @tokenizer.decode(tokens, skip_special_tokens: skip_special_tokens)
end
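
A round-trip sketch; note that tokens here are token ids and skip_special_tokens is a required keyword argument:

ids = tokenizer.call("Hello world")[:input_ids][0]
tokenizer.decode(ids, skip_special_tokens: true)
# => "hello world" for a lowercasing tokenizer (output depends on the model)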