Class: Transformers::ConvertSlowTokenizer::BertConverter

Inherits:
Converter
  • Object
Defined in:
lib/transformers/convert_slow_tokenizer.rb

Instance Method Summary

  • #converted ⇒ Object

Methods inherited from Converter

#initialize

Constructor Details

This class inherits a constructor from Transformers::ConvertSlowTokenizer::Converter

Instance Method Details

#converted ⇒ Object

Converts the slow BERT tokenizer into an equivalent fast Tokenizers::Tokenizer: a WordPiece model built from the original vocab, a BertNormalizer and BertPreTokenizer configured from the basic tokenizer's settings, a TemplateProcessing post-processor that adds the cls/sep special tokens, and a WordPiece decoder.



# File 'lib/transformers/convert_slow_tokenizer.rb', line 28

def converted
  vocab = @original_tokenizer.vocab
  tokenizer = Tokenizers::Tokenizer.new(
    Tokenizers::Models::WordPiece.new(vocab: vocab, unk_token: @original_tokenizer.unk_token.to_s)
  )

  tokenize_chinese_chars = false
  strip_accents = false
  do_lower_case = false
  if @original_tokenizer.basic_tokenizer
    tokenize_chinese_chars = @original_tokenizer.basic_tokenizer.tokenize_chinese_chars
    strip_accents = @original_tokenizer.basic_tokenizer.strip_accents
    do_lower_case = @original_tokenizer.basic_tokenizer.do_lower_case
  end

  tokenizer.normalizer =
    Tokenizers::Normalizers::BertNormalizer.new(
      clean_text: true,
      handle_chinese_chars: tokenize_chinese_chars,
      strip_accents: strip_accents,
      lowercase: do_lower_case,
    )
  tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::BertPreTokenizer.new

  cls = @original_tokenizer.cls_token.to_s
  sep = @original_tokenizer.sep_token.to_s
  cls_token_id = @original_tokenizer.cls_token_id
  sep_token_id = @original_tokenizer.sep_token_id

  tokenizer.post_processor =
    Tokenizers::Processors::TemplateProcessing.new(
      single: "#{cls}:0 $A:0 #{sep}:0",
      pair: "#{cls}:0 $A:0 #{sep}:0 $B:1 #{sep}:1",
      special_tokens: [
        [cls, cls_token_id],
        [sep, sep_token_id]
      ]
    )
  tokenizer.decoder = Tokenizers::Decoders::WordPiece.new(prefix: "##")

  tokenizer
end
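
A minimal usage sketch follows. It is not taken from the library's documentation: the loader call (Transformers::BertTokenizer.from_pretrained) and the assumption that the inherited Converter constructor accepts the slow tokenizer and stores it in @original_tokenizer are illustrative, inferred from the method body above; the encode and tokens calls come from the tokenizers gem.

require "transformers"

# Load a slow (pure Ruby) BERT tokenizer. The loader class here is an
# assumption for illustration; any object exposing vocab, unk_token,
# cls_token, sep_token, their ids, and basic_tokenizer would work.
slow_tokenizer = Transformers::BertTokenizer.from_pretrained("bert-base-uncased")

# Convert it to a fast tokenizer backed by the tokenizers gem. This assumes
# the inherited Converter constructor stores its argument in @original_tokenizer.
fast_tokenizer = Transformers::ConvertSlowTokenizer::BertConverter.new(slow_tokenizer).converted

# The result is a Tokenizers::Tokenizer with a WordPiece model, BertNormalizer,
# BertPreTokenizer, TemplateProcessing post-processor, and WordPiece decoder.
encoding = fast_tokenizer.encode("Hello world!")
p encoding.tokens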