Class: Informers::PreTrainedTokenizer

Inherits: Object
Defined in:
lib/informers/tokenizers.rb

Instance Attribute Summary

  #sep_token_id ⇒ Object (readonly)

Instance Method Summary

  #initialize(tokenizer_json, tokenizer_config) ⇒ PreTrainedTokenizer (constructor)
  #call(text, text_pair: nil, add_special_tokens: true, padding: false, truncation: nil, max_length: nil, return_tensor: true, return_token_type_ids: true, return_offsets: false) ⇒ Object
  #convert_tokens_to_string(tokens) ⇒ Object
  #decode(tokens, skip_special_tokens:) ⇒ Object

Constructor Details

#initialize(tokenizer_json, tokenizer_config) ⇒ PreTrainedTokenizer

Returns a new instance of PreTrainedTokenizer. tokenizer_json is the path to a tokenizer.json file, and tokenizer_config is the parsed tokenizer_config.json hash.



# File 'lib/informers/tokenizers.rb', line 5

def initialize(tokenizer_json, tokenizer_config)
  super()

  @tokenizer = Tokenizers::Tokenizer.from_file(tokenizer_json)

  @sep_token = tokenizer_config["sep_token"]
  @sep_token_id = @tokenizer.token_to_id(@sep_token)

  @model_max_length = tokenizer_config["model_max_length"]
end
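
For illustration, a minimal construction sketch; the file paths are hypothetical, and the config is assumed to be the parsed tokenizer_config.json from a Hugging Face model repo:

require "json"

# Hypothetical paths to files downloaded from a model repo
tokenizer_config = JSON.parse(File.read("path/to/tokenizer_config.json"))
tokenizer = Informers::PreTrainedTokenizer.new(
  "path/to/tokenizer.json",
  tokenizer_config
)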

Instance Attribute Details

#sep_token_id ⇒ Object (readonly)

Returns the id of the separator token, resolved from the sep_token entry of the tokenizer config.



# File 'lib/informers/tokenizers.rb', line 3

def sep_token_id
  @sep_token_id
end
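
For example, with a BERT-style config where "sep_token" is "[SEP]", this is the vocabulary id of "[SEP]"; the value varies by model:

tokenizer.sep_token_id # => 102 with bert-base-uncased's vocabulary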

Instance Method Details

#call(text, text_pair: nil, add_special_tokens: true, padding: false, truncation: nil, max_length: nil, return_tensor: true, return_token_type_ids: true, return_offsets: false) ⇒ Object



# File 'lib/informers/tokenizers.rb', line 16

def call(
  text,
  text_pair: nil,
  add_special_tokens: true,
  padding: false,
  truncation: nil,
  max_length: nil,
  return_tensor: true,
  return_token_type_ids: true, # TODO change default
  return_offsets: false
)
  is_batched = text.is_a?(Array)

  if is_batched
    if text.length == 0
      raise Error, "text array must be non-empty"
    end

    if !text_pair.nil?
      if !text_pair.is_a?(Array)
        raise Error, "text_pair must also be an array"
      elsif text.length != text_pair.length
        raise Error, "text and text_pair must have the same length"
      end
    end
  end

  if padding
    @tokenizer.enable_padding
  else
    @tokenizer.no_padding
  end

  if truncation
    @tokenizer.enable_truncation(max_length || @model_max_length)
  else
    @tokenizer.no_truncation
  end

  if is_batched
    input = text_pair ? text.zip(text_pair) : text
    encoded = @tokenizer.encode_batch(input, add_special_tokens: add_special_tokens)
  else
    encoded = [@tokenizer.encode(text, text_pair, add_special_tokens: add_special_tokens)]
  end

  result = {input_ids: encoded.map(&:ids), attention_mask: encoded.map(&:attention_mask)}
  if return_token_type_ids
    result[:token_type_ids] = encoded.map(&:type_ids)
  end
  if return_offsets
    result[:offsets] = encoded.map(&:offsets)
  end
  result
end
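
A usage sketch, assuming the tokenizer constructed above; the exact ids and lengths depend on the vocabulary:

# A single string is still returned as a batch of one.
enc = tokenizer.call("Hello world")
enc[:input_ids].size # => 1, one array of ids per input
enc[:attention_mask] # => e.g. [[1, 1, 1, 1]] for a four-token encoding

# Batched input with sentence pairs, padding, and truncation.
enc = tokenizer.call(
  ["How many people live in London?"],
  text_pair: ["Around 9 million people live in London."],
  padding: true,
  truncation: true
)
enc[:token_type_ids] # 0 for first-segment tokens, 1 for the pair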

#convert_tokens_to_string(tokens) ⇒ Object



# File 'lib/informers/tokenizers.rb', line 76

def convert_tokens_to_string(tokens)
  @tokenizer.decoder.decode(tokens)
end
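
A sketch assuming a WordPiece-style decoder, where "##" marks a subword continuation; the exact joining rule depends on the tokenizer's configured decoder:

tokenizer.convert_tokens_to_string(["hello", "##world"]) # => "helloworld"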

#decode(tokens, skip_special_tokens:) ⇒ Object



# File 'lib/informers/tokenizers.rb', line 72

def decode(tokens, skip_special_tokens:)
  @tokenizer.decode(tokens, skip_special_tokens: skip_special_tokens)
end
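
A round-trip sketch; note that tokens here are token ids and skip_special_tokens is a required keyword argument:

ids = tokenizer.call("Hello world")[:input_ids][0]
tokenizer.decode(ids, skip_special_tokens: true)
# => "hello world" for a lowercasing tokenizer (output depends on the model)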