Class: Tokenizers::CharBPETokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/tokenizers/char_bpe_tokenizer.rb

Instance Method Summary collapse

Constructor Details

#initialize(vocab, merges, unk_token: "<unk>", suffix: "</w>") ⇒ CharBPETokenizer

Returns a new instance of CharBPETokenizer.



3
4
5
6
7
8
9
10
11
12
# File 'lib/tokenizers/char_bpe_tokenizer.rb', line 3

# Builds a character-level BPE tokenizer and wires up its pipeline:
# BPE model, Bert normalizer, Bert pre-tokenizer and BPE decoder.
#
# @param vocab [Object] vocabulary source handed to Models::BPE._from_file
#   (presumably a file path; confirm against the native binding)
# @param merges [Object] merges source handed to Models::BPE._from_file
#   (presumably a file path; confirm against the native binding)
# @param unk_token [String] passed to the model as its unknown token and
#   registered as a special token on the tokenizer
# @param suffix [String] end-of-word suffix given to both the model and the decoder
def initialize(vocab, merges, unk_token: "<unk>", suffix: "</w>")
  model = Models::BPE._from_file(vocab, merges, {unk_token: unk_token, end_of_word_suffix: suffix})
  @tokenizer = Tokenizer.new(model)
  @tokenizer.add_special_tokens([unk_token])
  @tokenizer.normalizer = Normalizers::BertNormalizer.new
  @tokenizer.pre_tokenizer = PreTokenizers::BertPreTokenizer.new
  # Keep the decoder's suffix in sync with the model's end-of-word suffix so a
  # caller-supplied suffix is honored when decoding, not only when encoding.
  @tokenizer.decoder = Decoders::BPEDecoder.new(suffix: suffix)
end

Instance Method Details

#decode(ids) ⇒ Object



18
19
20
# File 'lib/tokenizers/char_bpe_tokenizer.rb', line 18

# Decodes token ids by delegating to the wrapped tokenizer.
#
# @param ids [Object] ids forwarded unchanged to the underlying tokenizer's #decode
# @return [Object] whatever the underlying tokenizer's #decode returns
def decode(ids)
  decoded = @tokenizer.decode(ids)
  decoded
end

#encode(text, **options) ⇒ Object



14
15
16
# File 'lib/tokenizers/char_bpe_tokenizer.rb', line 14

# Encodes text by delegating to the wrapped tokenizer.
#
# @param text [Object] input forwarded unchanged to the underlying tokenizer's #encode
# @param options [Hash] keyword options forwarded unchanged to the underlying tokenizer
# @return [Object] whatever the underlying tokenizer's #encode returns
def encode(text, **options)
  encoding = @tokenizer.encode(text, **options)
  encoding
end