Class: YouTokenToMe::BPE
- Inherits:
-
Object
- Object
- YouTokenToMe::BPE
- Defined in:
- lib/youtokentome/bpe.rb
Class Method Summary collapse
Instance Method Summary collapse
-
#decode(ids) ⇒ Object
TODO add ignore_ids.
- #encode(sentences, output_type: :id, bos: false, eos: false, reverse: false, dropout_prob: 0) ⇒ Object
- #id_to_subword(id) ⇒ Object
-
#initialize(model, n_threads: -1) ⇒ BPE
constructor
A new instance of BPE.
- #subword_to_id(subword) ⇒ Object
- #vocab ⇒ Object
- #vocab_size ⇒ Object
Constructor Details
#initialize(model, n_threads: -1) ⇒ BPE
Returns a new instance of BPE.
3 4 5 |
# File 'lib/youtokentome/bpe.rb', line 3 def initialize(model, n_threads: -1) @encoder = Ext::BaseEncoder.new(model, n_threads) end |
Class Method Details
.train(data:, model:, vocab_size:, coverage: 1.0, n_threads: -1, pad_id: 0, unk_id: 1, bos_id: 2, eos_id: 3) ⇒ Object
49 50 51 52 |
# File 'lib/youtokentome/bpe.rb', line 49 def self.train(data:, model:, vocab_size:, coverage: 1.0, n_threads: -1, pad_id: 0, unk_id: 1, bos_id: 2, eos_id: 3) Ext.train_bpe(data, model, vocab_size, coverage, n_threads, pad_id, unk_id, bos_id, eos_id) new(model, n_threads: n_threads) end |
Instance Method Details
#decode(ids) ⇒ Object
TODO add ignore_ids
45 46 47 |
# File 'lib/youtokentome/bpe.rb', line 45 def decode(ids) @encoder.decode(ids) end |
#encode(sentences, output_type: :id, bos: false, eos: false, reverse: false, dropout_prob: 0) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/youtokentome/bpe.rb', line 27 def encode(sentences, output_type: :id, bos: false, eos: false, reverse: false, dropout_prob: 0) case output_type when :id @encoder.encode_as_ids(sentences, bos, eos, reverse, dropout_prob) when :subword subwords = @encoder.encode_as_subwords(sentences, bos, eos, reverse, dropout_prob) subwords.each do |s| s.each do |v| v.force_encoding(Encoding::UTF_8) end end subwords else raise ArgumentError, "Unknown output type" end end |
#id_to_subword(id) ⇒ Object
23 24 25 |
# File 'lib/youtokentome/bpe.rb', line 23 def id_to_subword(id) @encoder.id_to_subword(id) end |
#subword_to_id(subword) ⇒ Object
19 20 21 |
# File 'lib/youtokentome/bpe.rb', line 19 def subword_to_id(subword) @encoder.subword_to_id(subword) end |
#vocab ⇒ Object
11 12 13 14 15 16 17 |
# File 'lib/youtokentome/bpe.rb', line 11 def vocab vocab = @encoder.vocab vocab.each do |v| v.force_encoding(Encoding::UTF_8) end vocab end |
#vocab_size ⇒ Object
7 8 9 |
# File 'lib/youtokentome/bpe.rb', line 7 def vocab_size @encoder.vocab_size end |