Class: HuggingfaceModel
- Inherits: VectorModel
- Hierarchy: Object → VectorModel → HuggingfaceModel
- Defined in:
- lib/rbbt/vector/model/huggingface.rb,
lib/rbbt/vector/model/huggingface.old.rb
Instance Attribute Summary collapse
-
#checkpoint ⇒ Object
Returns the value of attribute checkpoint.
-
#class_labels ⇒ Object
Returns the value of attribute class_labels.
-
#class_weights ⇒ Object
Returns the value of attribute class_weights.
-
#locate_tokens ⇒ Object
Returns the value of attribute locate_tokens.
-
#task ⇒ Object
Returns the value of attribute task.
-
#training_args ⇒ Object
Returns the value of attribute training_args.
Attributes inherited from VectorModel
#bar, #directory, #eval_model, #extract_features, #factor_levels, #features, #labels, #model_file, #model_options, #names, #post_process, #train_model
Class Method Summary collapse
- .call_method(name, *args) ⇒ Object
-
.run_model(model, tokenizer, elements, labels = nil, training_args = {}, class_weights = nil) ⇒ Object
def checkpoint_dir File.join(@directory, 'checkpoints') if @directory end.
- .tsv_dataset(tsv_dataset_file, elements, labels = nil) ⇒ Object
Instance Method Summary collapse
- #call_method(name, *args) ⇒ Object
- #init_model ⇒ Object
-
#initialize(task, initial_checkpoint = nil, *args) ⇒ HuggingfaceModel
constructor
A new instance of HuggingfaceModel.
- #reset_model ⇒ Object
Methods inherited from VectorModel
R_eval, R_run, R_train, #__load_method, #add, #add_list, #clear, #cross_validation, #eval, #eval_list, f1_metrics, #run, #save_models, #train
Constructor Details
#initialize(task, initial_checkpoint = nil, *args) ⇒ HuggingfaceModel
Returns a new instance of HuggingfaceModel.
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/rbbt/vector/model/huggingface.rb', line 28
#
# Sets up the model for a Huggingface +task+ starting from +checkpoint+ and
# registers the eval_model, train_model and post_process blocks that drive
# the Python helpers in "rbbt_dm.huggingface".
#
# NOTE(review): the published listing had lost several identifiers (it read
# "= args.pop", "||= {} .merge!()", "[:checkpoint]", ".values_at"). They are
# restored here as the local +options+ and the inherited +@model_options+
# attribute -- confirm against lib/rbbt/vector/model/huggingface.rb.
#
# @param task [String] task name handed to load_model_and_tokenizer; the
#   post-processing below special-cases "SequenceClassification" and "MaskedLM"
# @param checkpoint [String] checkpoint used when no saved model directory exists
# @param args [Array] forwarded to the superclass constructor; a trailing Hash
#   is taken as model options (:class_labels, :class_weights, :locate_tokens,
#   :training_args are the keys read in this method)
def initialize(task, checkpoint, *args)
  options = args.pop if Hash === args.last
  options = Misc.add_defaults options, :task => task, :checkpoint => checkpoint
  super(*args)
  @model_options ||= {}
  @model_options.merge!(options)

  eval_model do |directory,texts|
    # Prefer a previously saved model directory over the initial checkpoint
    checkpoint = directory && File.directory?(directory) ? directory : @model_options[:checkpoint]

    if @model.nil?
      @model, @tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_model_and_tokenizer, @model_options[:task], checkpoint)
    end

    if Array === texts

      if @model_options.include?(:locate_tokens)
        locate_tokens = @model_options[:locate_tokens]
      elsif @model_options[:task] == "MaskedLM"
        # Default to the tokenizer's own mask token and remember it for post_process
        @model_options[:locate_tokens] = locate_tokens = @tokenizer.special_tokens_map["mask_token"]
      end

      if @directory
        tsv_file = File.join(@directory, 'dataset.tsv')
        checkpoint_dir = File.join(@directory, 'checkpoints')
      else
        tmpdir = TmpFile.tmp_file
        Open.mkdir tmpdir
        tsv_file = File.join(tmpdir, 'dataset.tsv')
        checkpoint_dir = File.join(tmpdir, 'checkpoints')
      end

      dataset_file = HuggingfaceModel.tsv_dataset(tsv_file, texts)
      training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])

      begin
        RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, @model, @tokenizer, training_args_obj, dataset_file, locate_tokens)
      ensure
        Open.rm_rf tmpdir if tmpdir
      end
    else
      # locate_tokens is only assigned in the Array branch, so it is nil here
      RbbtPython.call_method("rbbt_dm.huggingface", :eval_model, @model, @tokenizer, [texts], locate_tokens)
    end
  end

  train_model do |directory,texts,labels|
    checkpoint = directory && File.directory?(directory) ? directory : @model_options[:checkpoint]
    @model, @tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_model_and_tokenizer, @model_options[:task], checkpoint)

    if @directory
      tsv_file = File.join(@directory, 'dataset.tsv')
      checkpoint_dir = File.join(@directory, 'checkpoints')
    else
      tmpdir = TmpFile.tmp_file
      Open.mkdir tmpdir
      tsv_file = File.join(tmpdir, 'dataset.tsv')
      checkpoint_dir = File.join(tmpdir, 'checkpoints')
    end

    begin
      training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
      dataset_file = HuggingfaceModel.tsv_dataset(tsv_file, texts, labels)

      RbbtPython.call_method("rbbt_dm.huggingface", :train_model, @model, @tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
    ensure
      # Remove the scratch directory even when training fails, as eval_model does
      Open.rm_rf tmpdir if tmpdir
    end

    @model.save_pretrained(directory) if directory
    @tokenizer.save_pretrained(directory) if directory
  end

  post_process do |result|
    # Three result shapes are handled: an object exposing #predictions, a
    # mapping with "token_positions" and "result", or a mapping with "logits"
    if result.respond_to?(:predictions)
      single = false
      predictions = result.predictions
    elsif result["token_positions"]
      predictions = result["result"].predictions
      token_positions = result["token_positions"]
    else
      single = true
      predictions = result["logits"]
    end

    task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens

    result = case task
             when "SequenceClassification"
               RbbtPython.collect(predictions) do |logits|
                 logits = RbbtPython.numpy2ruby logits
                 best_class = logits.index logits.max
                 best_class = class_labels[best_class] if class_labels
                 best_class
               end
             when "MaskedLM"
               all_token_positions = token_positions.to_a

               i = 0
               RbbtPython.collect(predictions) do |item_logits|
                 item_token_positions = all_token_positions[i]
                 i += 1

                 item_logits = RbbtPython.numpy2ruby(item_logits)
                 item_masks = item_token_positions.collect do |positions|

                   # Highest-scoring token index at every located position
                   best = item_logits.values_at(*positions).collect do |logits|
                     best_token, best_score = nil
                     logits.each_with_index do |score,token|
                       if best_score.nil? || score > best_score
                         best_token, best_score = token, score
                       end
                     end
                     best_token
                   end

                   best.collect{|b| @tokenizer.decode(b) } * "|"
                 end

                 Array === locate_tokens ? item_masks : item_masks.first
               end
             else
               # The listing returned `logits` here, a name that only exists
               # inside the blocks above, so this branch raised NameError.
               # Hand back the raw predictions for other tasks instead.
               predictions
             end

    single ? result.first : result
  end

  save_models if @directory
end
Instance Attribute Details
#checkpoint ⇒ Object
Returns the value of attribute checkpoint.
9 10 11 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9
#
# Reader for the +checkpoint+ attribute.
#
# @return [Object] whatever is currently stored in @checkpoint
def checkpoint
  @checkpoint
end
#class_labels ⇒ Object
Returns the value of attribute class_labels.
9 10 11 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9
#
# Reader for the +class_labels+ attribute.
#
# @return [Object] whatever is currently stored in @class_labels
def class_labels
  @class_labels
end
#class_weights ⇒ Object
Returns the value of attribute class_weights.
9 10 11 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9
#
# Reader for the +class_weights+ attribute.
#
# @return [Object] whatever is currently stored in @class_weights
def class_weights
  @class_weights
end
#locate_tokens ⇒ Object
Returns the value of attribute locate_tokens.
9 10 11 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9
#
# Reader for the +locate_tokens+ attribute.
#
# @return [Object] whatever is currently stored in @locate_tokens
def locate_tokens
  @locate_tokens
end
#task ⇒ Object
Returns the value of attribute task.
9 10 11 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9
#
# Reader for the +task+ attribute.
#
# @return [Object] whatever is currently stored in @task
def task
  @task
end
#training_args ⇒ Object
Returns the value of attribute training_args.
9 10 11 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9
#
# Reader for the +training_args+ attribute.
#
# @return [Object] whatever is currently stored in @training_args
def training_args
  @training_args
end
Class Method Details
.call_method(name, *args) ⇒ Object
30 31 32 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 30
#
# Looks up +name+ in the "rbbt_dm.huggingface" Python module through
# RbbtPython.import_method and invokes it with +args+.
#
# @param name [Symbol, String] name of the Python function to import
# @param args [Array] arguments passed through to the imported function
# @return [Object] whatever the imported function returns
def self.call_method(name, *args)
  python_function = RbbtPython.import_method("rbbt_dm.huggingface", name)
  python_function.call(*args)
end
.run_model(model, tokenizer, elements, labels = nil, training_args = {}, class_weights = nil) ⇒ Object
def checkpoint_dir
File.join(@directory, 'checkpoints') if @directory
end
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 46
#
# Runs the Python helpers for one of three modes inside a temporary location
# supplied by TmpFile.with_file:
#
# * +labels+ given: train on +elements+ / +labels+ (train_model).
# * no +labels+, +elements+ is an Array: batch prediction (predict_model).
# * no +labels+, single element: direct evaluation (eval_model).
#
# The fifth positional argument is overloaded: when training it is a Hash of
# training arguments; when predicting or evaluating it is forwarded as
# +locate_tokens+.
#
# @param model [Object] model object handed to the Python helpers
# @param tokenizer [Object] tokenizer object handed to the Python helpers
# @param elements [Array, Object] texts to train on or predict
# @param labels [Array, nil] training labels; nil selects prediction/evaluation
# @param training_args [Hash, Object, nil] training arguments (may contain
#   :checkpoint_dir) when training, otherwise the locate_tokens value
# @param class_weights [Object, nil] forwarded to train_model
# @return [Object] the value returned by the TmpFile.with_file block
def self.run_model(model, tokenizer, elements, labels = nil, training_args = {}, class_weights = nil)
  TmpFile.with_file do |tmpfile|
    tsv_file = File.join(tmpfile, 'dataset.tsv')

    if labels
      # Work on a copy so the caller's Hash keeps its :checkpoint_dir entry;
      # nil (or any non-Hash) is treated as "no extra training arguments".
      training_args = Hash === training_args ? training_args.dup : {}
      checkpoint_dir = training_args.delete(:checkpoint_dir)
    else
      # In prediction mode this argument carries locate_tokens (possibly a
      # String), so it must not be dup'ed or have Hash methods called on it.
      locate_tokens = training_args
    end

    # Previously assigned unconditionally, which discarded a caller-supplied
    # :checkpoint_dir; fall back to the temporary location only when needed.
    checkpoint_dir ||= File.join(tmpfile, 'checkpoints')

    Open.mkdir File.dirname(tsv_file)
    Open.mkdir File.dirname(checkpoint_dir)

    if labels
      training_args_obj = call_method(:training_args, checkpoint_dir, **training_args)
      call_method(:train_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements, labels), class_weights)
    elsif Array === elements
      training_args_obj = call_method(:training_args, checkpoint_dir)
      call_method(:predict_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements), locate_tokens)
    else
      call_method(:eval_model, model, tokenizer, [elements], locate_tokens)
    end
  end
end
.tsv_dataset(tsv_dataset_file, elements, labels = nil) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/rbbt/vector/model/huggingface.rb', line 9
#
# Writes +elements+ (and, when given, their +labels+) to +tsv_dataset_file+
# as a tab-separated file with a header row: "label<TAB>text" when labels are
# present, "text" otherwise.
#
# @param tsv_dataset_file [String] destination path, passed to Open.write
# @param elements [Array] one entry per row
# @param labels [Array, nil] labels paired with +elements+ by position
# @return [String] +tsv_dataset_file+, unchanged
def self.tsv_dataset(tsv_dataset_file, elements, labels = nil)
  Open.write(tsv_dataset_file) do |out|
    if labels
      out.puts ["label", "text"] * "\t"
      elements.zip(labels).each do |text, label|
        out.puts [label, text].flatten * "\t"
      end
    else
      out.puts "text"
      elements.each { |text| out.puts text }
    end
  end

  tsv_dataset_file
end
Instance Method Details
#call_method(name, *args) ⇒ Object
34 35 36 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 34
#
# Instance-level shortcut that delegates to HuggingfaceModel.call_method.
#
# @param name [Symbol, String] name passed on to the class-level method
# @param args [Array] arguments passed on unchanged
# @return [Object] the class-level method's return value
def call_method(name, *args)
  HuggingfaceModel.call_method(name, *args)
end
#init_model ⇒ Object
75 76 77 |
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 75
#
# Loads the model and tokenizer for @task from @checkpoint via
# call_method(:load_model_and_tokenizer, ...) and stores the pair in
# @model and @tokenizer.
#
# @return [Object] the value returned by load_model_and_tokenizer
def init_model
  @model, @tokenizer = call_method(
    :load_model_and_tokenizer,
    @task,
    @checkpoint
  )
end
#reset_model ⇒ Object
155 156 157 158 |
# File 'lib/rbbt/vector/model/huggingface.rb', line 155
#
# Forgets the in-memory model and tokenizer and removes the saved model from
# disk so the next use starts from the initial checkpoint again.
#
# Two changes from the published listing:
# * nothing is removed when @model_file is unset (the class supports running
#   without a directory), instead of handing nil to Open;
# * removal is recursive (Open.rm_rf, already used by this class for its
#   scratch directories) because the saved model location is treated as a
#   directory elsewhere in this class (File.directory? checks and
#   save_pretrained(directory)).
#   NOTE(review): assumes that directory is @model_file -- confirm in VectorModel.
#
# @return [Object, nil] result of Open.rm_rf, or nil when there was nothing to remove
def reset_model
  @model = @tokenizer = nil
  Open.rm_rf @model_file if @model_file
end