Class: YouTokenToMe::BPE

Inherits:
Object
  • Object
show all
Defined in:
lib/youtokentome/bpe.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(model, n_threads: -1) ⇒ BPE

Returns a new instance of BPE.



3
4
5
# File 'lib/youtokentome/bpe.rb', line 3

# Constructs an Ext::BaseEncoder from the given arguments and keeps it in
# @encoder; every other instance method delegates to that object.
#
# @param model passed through as the first argument of Ext::BaseEncoder.new
#   (presumably the path of a trained model file; confirm against the extension)
# @param n_threads passed through as the second argument of
#   Ext::BaseEncoder.new; defaults to -1
#   (NOTE(review): -1 presumably lets the extension choose; confirm)
def initialize(model, n_threads: -1)
  @encoder = Ext::BaseEncoder.new(model, n_threads)
end

Class Method Details

.train(data:, model:, vocab_size:, coverage: 1.0, n_threads: -1, pad_id: 0, unk_id: 1, bos_id: 2, eos_id: 3) ⇒ Object



49
50
51
52
# File 'lib/youtokentome/bpe.rb', line 49

# Runs Ext.train_bpe and then returns a new instance created from the same
# model argument.
#
# The keyword arguments are forwarded positionally to Ext.train_bpe in the
# order data, model, vocab_size, coverage, n_threads, pad_id, unk_id, bos_id,
# eos_id. Their exact meaning is defined by the extension and is not visible
# here (presumably data is the training input and model the output location;
# confirm against the extension).
#
# @return the result of new(model, n_threads: n_threads)
def self.train(data:, model:, vocab_size:, coverage: 1.0, n_threads: -1, pad_id: 0, unk_id: 1, bos_id: 2, eos_id: 3)
  Ext.train_bpe(data, model, vocab_size, coverage, n_threads, pad_id, unk_id, bos_id, eos_id)
  # Build the instance only after training has returned, using the same model argument.
  new(model, n_threads: n_threads)
end

Instance Method Details

#decode(ids) ⇒ Object

TODO: add ignore_ids



45
46
47
# File 'lib/youtokentome/bpe.rb', line 45

# Delegates to the underlying encoder's decode and returns its result unchanged.
#
# TODO: add ignore_ids
#
# @param ids forwarded as-is to @encoder.decode
#   (presumably token ids to turn back into text; confirm against the extension)
# @return whatever @encoder.decode returns
def decode(ids)
  @encoder.decode(ids)
end

#encode(sentences, output_type: :id, bos: false, eos: false, reverse: false, dropout_prob: 0) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/youtokentome/bpe.rb', line 27

# Encodes +sentences+ through the underlying encoder, in the form selected by
# +output_type+.
#
# @param sentences forwarded as-is to the underlying encoder
# @param output_type [Symbol] :id to call encode_as_ids, :subword to call
#   encode_as_subwords
# @param bos forwarded positionally to the underlying encoder
# @param eos forwarded positionally to the underlying encoder
# @param reverse forwarded positionally to the underlying encoder
# @param dropout_prob forwarded positionally to the underlying encoder
# @return for :id, the result of encode_as_ids; for :subword, the collection
#   returned by encode_as_subwords with every string re-tagged as UTF-8 in place
# @raise [ArgumentError] when output_type is neither :id nor :subword
def encode(sentences, output_type: :id, bos: false, eos: false, reverse: false, dropout_prob: 0)
  case output_type
  when :id
    @encoder.encode_as_ids(sentences, bos, eos, reverse, dropout_prob)
  when :subword
    @encoder.encode_as_subwords(sentences, bos, eos, reverse, dropout_prob).tap do |encoded|
      # Re-tag each piece in place; the same outer collection is handed back.
      encoded.each do |pieces|
        pieces.each { |piece| piece.force_encoding(Encoding::UTF_8) }
      end
    end
  else
    raise ArgumentError, "Unknown output type"
  end
end

#id_to_subword(id) ⇒ Object



23
24
25
# File 'lib/youtokentome/bpe.rb', line 23

# Delegates to the underlying encoder's id_to_subword and returns its result unchanged.
#
# @param id forwarded as-is to @encoder.id_to_subword
# @return whatever @encoder.id_to_subword returns
#   (NOTE(review): unlike #vocab, the result is not re-tagged as UTF-8 here)
def id_to_subword(id)
  @encoder.id_to_subword(id)
end

#subword_to_id(subword) ⇒ Object



19
20
21
# File 'lib/youtokentome/bpe.rb', line 19

# Delegates to the underlying encoder's subword_to_id and returns its result unchanged.
#
# @param subword forwarded as-is to @encoder.subword_to_id
# @return whatever @encoder.subword_to_id returns
def subword_to_id(subword)
  @encoder.subword_to_id(subword)
end

#vocab ⇒ Object



11
12
13
14
15
16
17
# File 'lib/youtokentome/bpe.rb', line 11

# Returns the underlying encoder's vocabulary with every entry re-tagged as
# UTF-8.
#
# The entries are modified in place via force_encoding, and the very collection
# obtained from the encoder is what gets returned.
#
# @return the collection returned by @encoder.vocab
def vocab
  @encoder.vocab.tap do |tokens|
    tokens.each { |token| token.force_encoding(Encoding::UTF_8) }
  end
end

#vocab_size ⇒ Object



7
8
9
# File 'lib/youtokentome/bpe.rb', line 7

# Delegates to the underlying encoder's vocab_size and returns its result unchanged.
#
# @return whatever @encoder.vocab_size returns
def vocab_size
  @encoder.vocab_size
end