Class: HuggingfaceModel

Inherits:
VectorModel show all
Defined in:
lib/rbbt/vector/model/huggingface.rb,
lib/rbbt/vector/model/huggingface.old.rb

Instance Attribute Summary collapse

Attributes inherited from VectorModel

#bar, #directory, #eval_model, #extract_features, #factor_levels, #features, #labels, #model_file, #model_options, #names, #post_process, #train_model

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from VectorModel

R_eval, R_run, R_train, #__load_method, #add, #add_list, #clear, #cross_validation, #eval, #eval_list, f1_metrics, #run, #save_models, #train

Constructor Details

#initialize(task, initial_checkpoint = nil, *args) ⇒ HuggingfaceModel

Returns a new instance of HuggingfaceModel.



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/rbbt/vector/model/huggingface.rb', line 28

# Sets up a Hugging Face backed model by registering the evaluation, training
# and post-processing blocks used by the inherited VectorModel machinery.
#
# @param task [String] default for @model_options[:task]; post-processing
#   special-cases "SequenceClassification" and "MaskedLM"
# @param checkpoint [String] default for @model_options[:checkpoint]; used
#   whenever the directory handed to a block is not an existing directory
# @param args [Array] forwarded to VectorModel#initialize. A trailing Hash is
#   popped first and merged into @model_options; keys read in this method are
#   :task, :checkpoint, :locate_tokens, :training_args, :class_weights and
#   :class_labels.
def initialize(task, checkpoint, *args)
  options = args.pop if Hash === args.last
  options = Misc.add_defaults options, :task => task, :checkpoint => checkpoint
  super(*args)
  @model_options ||= {}
  @model_options.merge!(options)

  # Evaluation: an Array of texts goes through :predict_model using a TSV
  # dataset on disk; anything else is wrapped in a list for :eval_model.
  eval_model do |directory,texts|
    checkpoint = directory && File.directory?(directory) ? directory : @model_options[:checkpoint]

    # The model and tokenizer are loaded once and reused by later evaluations.
    if @model.nil?
      @model, @tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_model_and_tokenizer, @model_options[:task], checkpoint)
    end

    if Array === texts

      if @model_options.include?(:locate_tokens)
        locate_tokens = @model_options[:locate_tokens]
      elsif @model_options[:task] == "MaskedLM"
        # Default to the tokenizer's mask token and remember it for post_process.
        @model_options[:locate_tokens] = locate_tokens = @tokenizer.special_tokens_map["mask_token"]
      end

      if @directory
        tsv_file = File.join(@directory, 'dataset.tsv')
        checkpoint_dir = File.join(@directory, 'checkpoints')
      else
        tmpdir = TmpFile.tmp_file
        Open.mkdir tmpdir
        tsv_file = File.join(tmpdir, 'dataset.tsv')
        checkpoint_dir = File.join(tmpdir, 'checkpoints')
      end

      begin
        dataset_file = HuggingfaceModel.tsv_dataset(tsv_file, texts)
        training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])

        RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, @model, @tokenizer, training_args_obj, dataset_file, locate_tokens)
      ensure
        # Only the temporary directory is removed; @directory is left in place.
        Open.rm_rf tmpdir if tmpdir
      end
    else
      # NOTE(review): locate_tokens is only assigned in the Array branch above,
      # so it is always nil here -- confirm that is intended for single texts.
      RbbtPython.call_method("rbbt_dm.huggingface", :eval_model, @model, @tokenizer, [texts], locate_tokens)
    end
  end

  # Training: always reloads model and tokenizer, trains on a TSV dataset and,
  # when a directory is given, saves both into it.
  train_model do |directory,texts,labels|
    checkpoint = directory && File.directory?(directory) ? directory : @model_options[:checkpoint]
    @model, @tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_model_and_tokenizer, @model_options[:task], checkpoint)

    if @directory
      tsv_file = File.join(@directory, 'dataset.tsv')
      checkpoint_dir = File.join(@directory, 'checkpoints')
    else
      tmpdir = TmpFile.tmp_file
      Open.mkdir tmpdir
      tsv_file = File.join(tmpdir, 'dataset.tsv')
      checkpoint_dir = File.join(tmpdir, 'checkpoints')
    end

    begin
      training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
      dataset_file = HuggingfaceModel.tsv_dataset(tsv_file, texts, labels)

      RbbtPython.call_method("rbbt_dm.huggingface", :train_model, @model, @tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
    ensure
      # Clean up the temporary directory even when training raises.
      Open.rm_rf tmpdir if tmpdir
    end

    @model.save_pretrained(directory) if directory
    @tokenizer.save_pretrained(directory) if directory
  end

  # Post-processing: turns the raw result of the evaluation block into class
  # indices/labels (SequenceClassification) or decoded tokens (MaskedLM).
  post_process do |result|
    single = false
    if result.respond_to?(:predictions)
      predictions = result.predictions
    elsif result["token_positions"]
      predictions = result["result"].predictions
      token_positions = result["token_positions"]
    else
      # Result of evaluating a single text: unwrap the one-element list below.
      single = true
      predictions = result["logits"]
    end

    task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
    result = case task
             when "SequenceClassification"
               RbbtPython.collect(predictions) do |logits|
                 logits = RbbtPython.numpy2ruby logits
                 # Index of the highest logit, mapped to a label when available.
                 best_class = logits.index logits.max
                 best_class = class_labels[best_class] if class_labels
                 best_class
               end
             when "MaskedLM"
               all_token_positions = token_positions.to_a

               i = 0
               RbbtPython.collect(predictions) do |item_logits|
                 item_token_positions = all_token_positions[i]
                 i += 1

                 item_logits = RbbtPython.numpy2ruby(item_logits)
                 item_masks = item_token_positions.collect do |positions|

                   # For every located position keep the index of the highest
                   # logit (first one wins on ties).
                   best = item_logits.values_at(*positions).collect do |logits|
                     best_token, best_score = nil
                     logits.each_with_index do |score,token_id|
                       if best_score.nil? || score > best_score
                         best_token, best_score = token_id, score
                       end
                     end
                     best_token
                   end

                   best.collect{|b| @tokenizer.decode(b) } * "|"
                 end
                 Array === locate_tokens ? item_masks : item_masks.first
               end
             else
               # Other tasks get the raw predictions back. The previous code
               # referenced `logits`, which is not defined in this scope.
               predictions
             end

    single ? result.first : result
  end


  save_models if @directory
end

Instance Attribute Details

#checkpoint ⇒ Object

Returns the value of attribute checkpoint.



9
10
11
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9

# Reader for the checkpoint attribute: returns @checkpoint unchanged
# (nil if the instance variable was never assigned).
def checkpoint
  @checkpoint
end

#class_labels ⇒ Object

Returns the value of attribute class_labels.



9
10
11
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9

# Reader for the class_labels attribute: returns @class_labels unchanged
# (nil if the instance variable was never assigned).
def class_labels
  @class_labels
end

#class_weights ⇒ Object

Returns the value of attribute class_weights.



9
10
11
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9

# Reader for the class_weights attribute: returns @class_weights unchanged
# (nil if the instance variable was never assigned).
def class_weights
  @class_weights
end

#locate_tokens ⇒ Object

Returns the value of attribute locate_tokens.



9
10
11
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9

# Reader for the locate_tokens attribute: returns @locate_tokens unchanged
# (nil if the instance variable was never assigned).
def locate_tokens
  @locate_tokens
end

#task ⇒ Object

Returns the value of attribute task.



9
10
11
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9

# Reader for the task attribute: returns @task unchanged
# (nil if the instance variable was never assigned).
def task
  @task
end

#training_args ⇒ Object

Returns the value of attribute training_args.



9
10
11
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 9

# Reader for the training_args attribute: returns @training_args unchanged
# (nil if the instance variable was never assigned).
def training_args
  @training_args
end

Class Method Details

.call_method(name, *args) ⇒ Object



30
31
32
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 30

# Looks up +name+ in the "rbbt_dm.huggingface" module through
# RbbtPython.import_method and calls it with the given positional arguments.
#
# @param name [Symbol, String] name of the function to import
# @param args [Array] positional arguments passed through untouched
# @return [Object] whatever the imported callable returns
def self.call_method(name, *args)
  python_function = RbbtPython.import_method("rbbt_dm.huggingface", name)
  python_function.call(*args)
end

.run_model(model, tokenizer, elements, labels = nil, training_args = {}, class_weights = nil) ⇒ Object

# Path of the 'checkpoints' folder inside @directory, or nil when the model
# has no directory.
def checkpoint_dir
  return nil unless @directory

  File.join(@directory, 'checkpoints')
end



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 46

# Trains or evaluates a model inside a temporary location provided by
# TmpFile.with_file.
#
# With +labels+ the model is trained on +elements+; without them +elements+
# are predicted (Array) or evaluated (single element).
#
# @param model [Object] model handed to the "rbbt_dm.huggingface" functions
# @param tokenizer [Object] tokenizer handed to the same functions
# @param elements [Array, Object] texts; a non-Array is wrapped in a list and
#   sent to :eval_model
# @param labels [Array, nil] training labels; nil selects prediction mode
# @param training_args [Hash, Object, nil] when training, options for
#   :training_args (a :checkpoint_dir key overrides the temporary checkpoint
#   directory and is not forwarded). In prediction mode this argument is
#   reused as the locate_tokens value.
# @param class_weights [Object, nil] forwarded to :train_model
# @return [Object] the value returned by TmpFile.with_file for the block
def self.run_model(model, tokenizer, elements, labels = nil, training_args = {}, class_weights = nil)
  TmpFile.with_file do |tmpfile|
    tsv_file = File.join(tmpfile, 'dataset.tsv')

    # Only a Hash can carry :checkpoint_dir; in prediction mode this argument
    # may be a String or Array of tokens, which must be left untouched.
    if Hash === training_args
      training_args = training_args.dup # do not mutate the caller's hash
      checkpoint_dir = training_args.delete(:checkpoint_dir)
    end

    # Fall back to the temporary location only when no directory was requested
    # (the requested one used to be discarded unconditionally).
    checkpoint_dir ||= File.join(tmpfile, 'checkpoints')

    Open.mkdir File.dirname(tsv_file)
    Open.mkdir File.dirname(checkpoint_dir)

    if labels
      training_args ||= {}
      training_args_obj = call_method(:training_args, checkpoint_dir, **training_args)
      call_method(:train_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements, labels), class_weights)
    else
      # In prediction mode the fifth argument carries the tokens to locate.
      locate_tokens = training_args
      if Array === elements
        training_args_obj = call_method(:training_args, checkpoint_dir)
        call_method(:predict_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements), locate_tokens)
      else
        call_method(:eval_model, model, tokenizer, [elements], locate_tokens)
      end
    end
  end
end

.tsv_dataset(tsv_dataset_file, elements, labels = nil) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/rbbt/vector/model/huggingface.rb', line 9

# Writes +elements+ to +tsv_dataset_file+ as tab-separated text through
# Open.write.
#
# With +labels+ the header is "label<TAB>text" and every row holds the label
# followed by the (flattened) element; without them the header is "text" and
# every element is written with puts.
#
# @param tsv_dataset_file [String] destination path
# @param elements [Array] texts to write
# @param labels [Array, nil] labels paired with +elements+ by position
# @return [String] +tsv_dataset_file+
def self.tsv_dataset(tsv_dataset_file, elements, labels = nil)
  Open.write(tsv_dataset_file) do |out|
    if labels
      out.puts %w[label text].join("\t")
      elements.zip(labels).each do |text, label|
        out.puts [label, text].flatten.join("\t")
      end
    else
      out.puts %w[text].join("\t")
      elements.each { |text| out.puts text }
    end
  end

  tsv_dataset_file
end

Instance Method Details

#call_method(name, *args) ⇒ Object



34
35
36
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 34

# Instance-level convenience wrapper: forwards +name+ and all positional
# arguments to HuggingfaceModel.call_method and returns its result.
def call_method(name, *args)
  HuggingfaceModel.call_method(name, *args)
end

#init_model ⇒ Object



75
76
77
# File 'lib/rbbt/vector/model/huggingface.old.rb', line 75

# Calls :load_model_and_tokenizer with @task and @checkpoint and stores the
# two returned values in @model and @tokenizer.
#
# @return [Object] the value returned by call_method
def init_model
  loaded = call_method(:load_model_and_tokenizer, @task, @checkpoint)
  @model, @tokenizer = loaded
end

#reset_model ⇒ Object



155
156
157
158
# File 'lib/rbbt/vector/model/huggingface.rb', line 155

# Drops the in-memory model and tokenizer and removes @model_file through
# Open.rm.
#
# NOTE(review): @model_file is passed to Open.rm without a nil check --
# confirm it is always set when this is called.
#
# @return [Object] whatever Open.rm returns
def reset_model
  @model = nil
  @tokenizer = nil
  Open.rm @model_file
end