Class: HuggingfaceModel
- Inherits:
- 
      TorchModel
      
        - Object
- VectorModel
- PythonModel
- TorchModel
- HuggingfaceModel
 
- Defined in:
- lib/rbbt/vector/model/huggingface.rb
Direct Known Subclasses
Instance Attribute Summary collapse
- 
  
    
      #tokenizer  ⇒ Object 
    
    
  
  
  
  
    
    
  
  
  
  
  
  
    Returns the value of attribute tokenizer. 
Attributes inherited from TorchModel
#criterion, #optimizer, #training_args
Attributes inherited from PythonModel
Attributes inherited from VectorModel
#balance, #bar, #directory, #eval_model, #extract_features, #factor_levels, #features, #init_model, #labels, #model, #model_options, #model_path, #names, #post_process, #train_model
Instance Method Summary collapse
- #init ⇒ Object
- 
  
    
      #initialize(task, checkpoint, dir = nil, model_options = {})  ⇒ HuggingfaceModel 
    
    
  
  
  
    constructor
  
  
  
  
  
  
  
    A new instance of HuggingfaceModel. 
- #reset_model ⇒ Object
Methods inherited from TorchModel
device, dtype, feature_dataset, feature_tsv, freeze, #freeze_layer, freeze_layer, get_layer, #get_layer, #get_weights, get_weights, init_python, load_architecture, load_state, model_architecture, optimizer, save_architecture, save_state, tensor, text_dataset
Methods inherited from VectorModel
R_eval, R_run, R_train, #__load_method, #add, #add_list, #balance_labels, #clear, #cross_validation, #eval, #eval_list, f1_metrics, #run, #save_models, #train
Constructor Details
#initialize(task, checkpoint, dir = nil, model_options = {}) ⇒ HuggingfaceModel
Returns a new instance of HuggingfaceModel.
# File 'lib/rbbt/vector/model/huggingface.rb', line 16

# Builds a HuggingfaceModel and registers its init/eval/train/post-process
# hooks.
#
# @param task [String] task name; stored under +:task+ in +@model_options+
#   and forwarded to the Python +load_model+ / +load_tokenizer+ helpers.
#   The hooks below special-case "SequenceClassification" and "MaskedLM".
# @param checkpoint [String, Path] checkpoint to load; a +Path+ is resolved
#   with +#find+ first.
# @param dir [String, nil] forwarded to the superclass constructor
#   (presumably the model directory -- confirm against TorchModel).
# @param model_options [Hash] forwarded to the superclass constructor; the
#   +:model_args+ and +:tokenizer_args+ entries are splatted as keyword
#   arguments into the Python loaders.
# @return [HuggingfaceModel] a new instance of HuggingfaceModel
def initialize(task, checkpoint, dir = nil, model_options = {})
  super(dir, nil, model_options)

  checkpoint = checkpoint.find if Path === checkpoint

  @model_options = Misc.add_defaults @model_options, :task => task, :checkpoint => checkpoint

  # Loader hook: returns the [model, tokenizer] pair.
  init_model do
    # Prefer a previously saved model directory over the configured checkpoint.
    checkpoint = @model_path && File.directory?(@model_path) ? @model_path : @model_options[:checkpoint]

    model = RbbtPython.call_method("rbbt_dm.huggingface", :load_model,
                                   @model_options[:task], checkpoint,
                                   **(IndiferentHash.setup(model_options[:model_args]) || {}))

    tokenizer_checkpoint = @model_options[:tokenizer_checkpoint] || checkpoint

    tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
                                       @model_options[:task], tokenizer_checkpoint,
                                       **(IndiferentHash.setup(model_options[:tokenizer_args]) || {}))

    [model, tokenizer]
  end

  # Evaluation hook: list inputs and every MaskedLM input go through the
  # dataset-based predict_model path; anything else is sent to eval_model.
  eval_model do |texts,is_list|
    model, tokenizer = self.init

    if is_list || @model_options[:task] == "MaskedLM"
      texts = [texts] if ! is_list

      if @model_options.include?(:locate_tokens)
        locate_tokens = @model_options[:locate_tokens]
      elsif @model_options[:task] == "MaskedLM"
        # Default to the tokenizer's mask token and remember it so that
        # post_process sees the same value.
        @model_options[:locate_tokens] = locate_tokens = tokenizer.special_tokens_map["mask_token"]
      end

      if @directory
        tsv_file = File.join(@directory, 'dataset.tsv')
        checkpoint_dir = File.join(@directory, 'checkpoints')
      else
        tmpdir = TmpFile.tmp_file
        Open.mkdir tmpdir
        tsv_file = File.join(tmpdir, 'dataset.tsv')
        checkpoint_dir = File.join(tmpdir, 'checkpoints')
      end

      dataset_file = TorchModel.text_dataset(tsv_file, texts)
      training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])

      begin
        RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, model, tokenizer, training_args_obj, dataset_file, locate_tokens)
      ensure
        Open.rm_rf tmpdir if tmpdir
      end
    else
      # NOTE(review): locate_tokens is only assigned in the branch above, so
      # it is always nil here -- confirm that this is intended.
      RbbtPython.call_method("rbbt_dm.huggingface", :eval_model, model, tokenizer, [texts], locate_tokens)
    end
  end

  # Training hook: writes the dataset, trains, then saves model and tokenizer
  # to @model_path when one is set.
  train_model do |texts,labels|
    model, tokenizer = self.init

    if @directory
      tsv_file = File.join(@directory, 'dataset.tsv')
      checkpoint_dir = File.join(@directory, 'checkpoints')
    else
      tmpdir = TmpFile.tmp_file
      Open.mkdir tmpdir
      tsv_file = File.join(tmpdir, 'dataset.tsv')
      checkpoint_dir = File.join(tmpdir, 'checkpoints')
    end

    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
    dataset_file = HuggingfaceModel.text_dataset(tsv_file, texts, labels, @model_options[:class_labels])

    begin
      RbbtPython.call_method("rbbt_dm.huggingface", :train_model, model, tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
    ensure
      # Remove the scratch directory even when training raises, mirroring the
      # cleanup done by the eval_model hook.
      Open.rm_rf tmpdir if tmpdir
    end

    model.save_pretrained(@model_path) if @model_path
    tokenizer.save_pretrained(@model_path) if @model_path
  end

  # Post-processing hook: converts raw predictions into class labels
  # (SequenceClassification) or decoded tokens (MaskedLM).
  post_process do |result,is_list|
    model, tokenizer = self.init

    if result.respond_to?(:predictions)
      single = false
      predictions = result.predictions
    elsif result["token_positions"]
      predictions = result["result"].predictions
      token_positions = result["token_positions"]
    else
      single = true
      predictions = result["logits"]
    end

    task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens

    result = case task
             when "SequenceClassification"
               RbbtPython.collect(predictions) do |logits|
                 logits = RbbtPython.numpy2ruby logits
                 best_class = logits.index logits.max
                 best_class = class_labels[best_class] if class_labels
                 best_class
               end
             when "MaskedLM"
               all_token_positions = token_positions.to_a

               i = 0
               RbbtPython.collect(predictions) do |item_logits|
                 item_token_positions = all_token_positions[i]
                 i += 1

                 item_logits = RbbtPython.numpy2ruby(item_logits)
                 item_masks = item_token_positions.collect do |positions|
                   # For every located position keep the index of the first
                   # highest-scoring entry.
                   best = item_logits.values_at(*positions).collect do |position_logits|
                     best_token, best_score = nil
                     position_logits.each_with_index do |score,token|
                       if best_score.nil? || score > best_score
                         best_token, best_score = token, score
                       end
                     end
                     best_token
                   end

                   best.collect{|b| tokenizer.decode(b) } * "|"
                 end

                 Array === locate_tokens ? item_masks : item_masks.first
               end
             else
               predictions
             end

    # Unwrap single-item results.
    (! is_list || single) && Array === result ? result.first : result
  end

  save_models if @model_path
end
Instance Attribute Details
#tokenizer ⇒ Object
Returns the value of attribute tokenizer.
# File 'lib/rbbt/vector/model/huggingface.rb', line 5

# Returns the value of attribute tokenizer.
#
# @return [Object, nil] the current value of +@tokenizer+; nil when it has
#   not been assigned
def tokenizer
  @tokenizer
end
Instance Method Details
#init ⇒ Object
# File 'lib/rbbt/vector/model/huggingface.rb', line 6

# Lazily loads the model and tokenizer.
#
# The +@init_model+ block is evaluated in the context of this object only
# while +@model+ is nil; later calls reuse the stored pair.
#
# @return [Array] the two-element array +[@model, @tokenizer]+
def init
  @model, @tokenizer = instance_exec(&@init_model) if @model.nil?
  [@model, @tokenizer]
end
#reset_model ⇒ Object
# File 'lib/rbbt/vector/model/huggingface.rb', line 155

# Discards the loaded model and tokenizer, removes anything stored at
# +@model_path+, and loads a fresh pair through +init+.
#
# @return [Object] whatever +init+ returns
def reset_model
  @model = @tokenizer = nil
  # @model_path may be unset (the constructor guards every use of it), so
  # only remove it when present.
  Open.rm_rf @model_path if @model_path
  init
end