Class: Cnvrg::Helpers::Executer

Inherits:
Object
  • Object
show all
Defined in:
lib/cnvrg/helpers/executer.rb

Constant Summary collapse

# Port used to build the main container URL (see .main_container_url).
# Falls back to 4000 when MAIN_CONTAINER_PORT is unset OR blank; a blank value
# previously became port 0 through "".to_i, while the rest of the class treats
# a blank MAIN_CONTAINER_PORT as "not configured".
MAIN_CONTAINER_PORT =
ENV['MAIN_CONTAINER_PORT'].to_s.strip.empty? ? 4000 : ENV['MAIN_CONTAINER_PORT'].to_i
# True only when the HAS_DOCKER environment variable is exactly the string "true".
HAS_DOCKER =
ENV['HAS_DOCKER'] == "true"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(owner: nil, machine_activity: nil, poll_every: 30, job_id: nil) ⇒ Executer

This class represents a machine_activity. It polls the server for commands, communicates with the server, and lets the server know the status of this executer.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/cnvrg/helpers/executer.rb', line 10

# Sets up the executer state for one machine activity.
#
# @param owner            owner segment used when building #activity_url
# @param machine_activity machine activity identifier used in #activity_url
# @param poll_every       seconds #polling_thread sleeps between polls
# @param job_id           value #containers greps for in `docker ps` output
def initialize(owner: nil, machine_activity: nil, poll_every: 30, job_id: nil)
  # Values handed in by the caller.
  @owner, @machine_activity, @job_id = owner, machine_activity, job_id
  @poll_every = poll_every
  # Seconds between check_main_alive runs.
  @check_main_every = 10

  # Work queues; commands are pushed by #poll and consumed by #execute_cmds.
  @commands_q = Queue.new
  @files_q = Queue.new

  # Filled in later by #containers / #wait_for_main.
  @agent_id = @main_id = @main_start_time = nil

  # A configured MAIN_CONTAINER_PORT selects the "main" container name,
  # otherwise the legacy "slave" name is used.
  @is_new_main = ENV["MAIN_CONTAINER_PORT"].present?
  @main_name = @is_new_main ? "main" : "slave"
end

Instance Attribute Details

#agent_idObject (readonly)

Returns the value of attribute agent_id.



4
5
6
# File 'lib/cnvrg/helpers/executer.rb', line 4

# Read-only accessor for @agent_id: nil after construction, then the first
# element returned by #containers once #executer_stats or
# #start_tiny_if_missing has run.
def agent_id
  @agent_id
end

#machine_activityObject (readonly)

Returns the value of attribute machine_activity.



4
5
6
# File 'lib/cnvrg/helpers/executer.rb', line 4

# Read-only accessor for @machine_activity: the value passed to the
# constructor, replaced by #init when the server answers with a different
# "machine_activity".
def machine_activity
  @machine_activity
end

#main_idObject (readonly)

Returns the value of attribute main_id.



4
5
6
# File 'lib/cnvrg/helpers/executer.rb', line 4

# Read-only accessor for @main_id: nil after construction, then the second
# element returned by #containers once #executer_stats or
# #start_tiny_if_missing has run.
def main_id
  @main_id
end

Class Method Details

.get_main_conn(timeout: 4, open_timeout: 1) ⇒ Object



383
384
385
386
387
388
389
390
391
# File 'lib/cnvrg/helpers/executer.rb', line 383

# Builds a Faraday connection to the main container that sends JSON.
#
# @param timeout      value assigned to the connection's `timeout` option
# @param open_timeout value assigned to the connection's `open_timeout` option
# @return the configured Faraday connection
def self.get_main_conn(timeout: 4, open_timeout: 1)
  json_headers = {'Content-Type' => 'application/json'}
  base_url = Cnvrg::Helpers::Executer.main_container_url
  Faraday.new(url: base_url, headers: json_headers).tap do |connection|
    connection.options.timeout = timeout
    connection.options.open_timeout = open_timeout
  end
end

.main_container_urlObject



370
371
372
373
374
375
376
377
378
379
380
381
# File 'lib/cnvrg/helpers/executer.rb', line 370

# Base URL of the main container.
#
# When either CNVRG_COMPUTE_CLUSTER or KUBERNETES_SERVICE_HOST is set the
# container is addressed on localhost. Otherwise it is addressed by name:
# "main" when MAIN_CONTAINER_PORT is set, "slave" when it is not.
#
# @return [String] "http://<host>:<MAIN_CONTAINER_PORT>"
def self.main_container_url
  port = Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT
  outside_cluster = ENV["CNVRG_COMPUTE_CLUSTER"].blank? && ENV["KUBERNETES_SERVICE_HOST"].blank?
  return "http://localhost:#{port}" unless outside_cluster

  host = ENV["MAIN_CONTAINER_PORT"].blank? ? "slave" : "main"
  "http://#{host}:#{port}"
end

Instance Method Details

#activity_urlObject



39
40
41
# File 'lib/cnvrg/helpers/executer.rb', line 39

# Relative API path of this executer's machine activity.
#
# @return [String] "users/<owner>/machine_activities/<machine_activity>"
def activity_url
  "users/#{@owner}/machine_activities/#{@machine_activity}"
end

#check_main_aliveObject



343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
# File 'lib/cnvrg/helpers/executer.rb', line 343

# Compares the start time reported by the main container's `readiness`
# endpoint with the one recorded by #wait_for_main and exits the agent
# process (status 1) when they differ.
#
# A non-200 response is treated as start time 0. A raised error (connection
# refused, timeout, ...) is logged and the check is skipped, so the caller's
# loop simply tries again on its next tick; previously such an error escaped
# and terminated the monitoring thread for good.
#
# @return [nil]
def check_main_alive
  # Dont check before we got first response
  return if @main_start_time.nil?

  begin
    response = Cnvrg::Helpers::Executer.get_main_conn.get('readiness')
  rescue => e
    Cnvrg::Logger.log_error(e)
    return
  end

  main_start_time = response.to_hash[:status].to_i == 200 ? response.body.to_i : 0
  return if main_start_time == @main_start_time

  puts("Found that main restarted, restarting agent")
  Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
  exit(1)
end

#check_main_is_working_threadObject



201
202
203
204
205
206
# File 'lib/cnvrg/helpers/executer.rb', line 201

# Endless watchdog loop: runs #check_main_alive, then sleeps
# @check_main_every seconds, forever. Intended to be run in its own thread
# (see #main_thread).
def check_main_is_working_thread
  loop do
    check_main_alive
    sleep(@check_main_every)
  end
end

#containersObject



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/cnvrg/helpers/executer.rb', line 83

# Looks up the docker container ids of the agent and main containers by
# grepping `docker ps` output.
#
# Lines are filtered by @job_id, or by the shell expression "$(hostname)"
# when KUBERNETES_PORT is set. The agent is the first line containing
# "agent", the main container the first line containing @main_name.
#
# The lookup is retried every 2 seconds. It is bounded to 2 seconds overall
# when @is_new_main is set and HAS_DOCKER is false, and unbounded otherwise.
# Any error (including the timeout) is logged and whatever ids were found so
# far are returned.
#
# @return [Array] [agent_id, main_id]; either element may be nil
def containers
  agent_id = nil
  main_id = nil
  # nil disables the timeout entirely.
  timeout = (!@is_new_main || HAS_DOCKER) ? nil : 2
  grep_by = ENV['KUBERNETES_PORT'].present? ? "$(hostname)" : @job_id
  Timeout.timeout(timeout) do
    loop do
      cntrs = `docker ps --format "table {{.ID}},{{.Names}}" 2> /dev/null | grep -i #{grep_by}`.split("\n").map { |x| x.strip }
      # `find` returns nil when nothing matches; the rescue turns that into a nil id.
      agent_id = cntrs.find { |container_name| container_name.include? "agent" }.split(",").first rescue nil
      main_id = cntrs.find { |container_name| container_name.include? @main_name }.split(",").first rescue nil
      # Stop as soon as both ids are known. Sleeping first (as before) pushed a
      # successful lookup past the 2s timeout and logged a spurious error.
      break unless agent_id.blank? || main_id.blank?
      sleep(2)
    end
  end
  raise "Can't find main id" if main_id.blank?
  [agent_id, main_id]
rescue => e
  Cnvrg::Logger.log_error(e)
  [agent_id, main_id]
end

#copy_file_to_mainObject



252
253
254
255
256
257
258
259
260
# File 'lib/cnvrg/helpers/executer.rb', line 252

# Copies /cnvrg-tiny to /conf/tiny and /scripts to /conf/scripts-bin, then
# touches /conf/tiny-ready. Best effort: any error is logged, not raised.
def copy_file_to_main
  FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
  FileUtils.cp_r("/scripts", "/conf/scripts-bin")
  FileUtils.touch("/conf/tiny-ready")
rescue => e
  Cnvrg::Logger.log_error(e)
end

#create_file_cmd(path, content) ⇒ Object



25
26
27
28
29
30
31
# File 'lib/cnvrg/helpers/executer.rb', line 25

# Writes `content` to `path`, creating parent directories as needed and
# truncating any existing file. A path containing "~" is expanded with
# File.expand_path first.
#
# @param path    [String] destination file path
# @param content data written to the file
# @return the result of the write call inside the File.open block
def create_file_cmd(path, content)
  target = path.include?("~") ? File.expand_path(path) : path
  FileUtils.mkdir_p(File.dirname(target))
  File.open(target, "w+") { |io| io.write(content) }
end

#current_homedirObject



107
108
109
# File 'lib/cnvrg/helpers/executer.rb', line 107

# Home directory of the current (agent) process, taken from the HOME
# environment variable.
#
# Reads ENV directly instead of parsing `env | grep -w HOME` output: the
# parsed form returned the wrong value when HOME contained "=", when HOME was
# empty (it returned the literal "HOME"), or when another variable's line
# also matched the word HOME.
#
# @return [String, nil] value of HOME, or nil when it is not set
def current_homedir
  ENV["HOME"]
end

#execute_cmdsObject



274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
# File 'lib/cnvrg/helpers/executer.rb', line 274

# Endless loop that consumes @commands_q and runs each command in a forked
# child process via Cnvrg::Helpers::Agent#exec!.
#
# For every command:
# * when the queue is empty, sleep 5 seconds and look again;
# * when the command names a :wait_slug whose process is still running, the
#   command is pushed back onto the queue and retried after 5 seconds;
# * the command is re-fetched from the server and skipped if its status is
#   ABORTED;
# * otherwise it is forked; the loop blocks on the child unless the command
#   is marked :async, in which case the child is detached.
#
# Never returns.
def execute_cmds
  pids_by_slug = {}
  while true
    if @commands_q.empty?
      sleep(5)
      next
    end
    cmd = @commands_q.pop.symbolize_keys

    if cmd[:wait_slug].present?
      other_pid = pids_by_slug[cmd[:wait_slug]]
      if other_pid.present?
        begin
          # With WNOHANG, waitpid returns nil while the child is still alive and
          # the pid once it has exited. Only nil means "still running"; treating
          # the pid as running too (as before) cost one extra requeue cycle.
          running = Process.waitpid(other_pid, Process::WNOHANG).nil?
        rescue Errno::ECHILD
          # Child was already reaped (sync wait below, or the detach thread).
          running = false
        end
        if running
          @commands_q.push(cmd)
          sleep(5)
          next
        end
      end
    end
    command_json = Cnvrg::API.request([activity_url, "commands", cmd[:slug]].join('/'), "GET")

    # A missing/invalid response is treated as "no status".
    cmd_status = command_json["status"] rescue ""

    if cmd_status == Cnvrg::Helpers::Agent::Status::ABORTED
      Cnvrg::Logger.log_info("stopping job because command #{cmd[:slug]} with status #{cmd_status}")
      next
    end
    pid = Process.fork do
      Cnvrg::Helpers::Agent.new(executer: self, **cmd).exec!
    end
    if cmd[:async].blank?
      Process.waitpid(pid)
    else
      Process.detach(pid)
    end
    # Remembered so later commands can wait on this one through :wait_slug.
    pids_by_slug[cmd[:slug]] = pid
  end
end

#executer_statsObject



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/cnvrg/helpers/executer.rb', line 43

# Collects (once) and memoizes a description of the agent and main
# containers: container ids, working/home directories, user/group ids and
# tool versions. Agent values come from local shell commands, main values
# from #run_in_main.
#
# Side effect: assigns @agent_id and @main_id from #containers.
#
# @return [Hash] with keys :pod_name, :node_name, :agent and :slave
def executer_stats
  return @stats if @stats.present?

  Cnvrg::Logger.log_info("getting containers")
  @agent_id, @main_id = containers
  Cnvrg::Logger.log_info("got containers")
  pod_name, node_name = get_node_and_pod_names

  agent_info = {
    container_id: @agent_id,
    workdir: `pwd`.strip,
    homedir: current_homedir,
    user: `whoami`.strip,
    user_id: `id -u`.strip,
    group_id: `id -g`.strip,
    cnvrg: Cnvrg::VERSION
  }

  main_info = {
    container_id: @main_id,
    container_name: @main_name,
    workdir: run_in_main('pwd'),
    homedir: main_homedir,
    spark_path: spark_path
  }
  # Remaining entries are plain command outputs from the main container,
  # gathered in this order.
  {
    user: 'whoami',
    cnvrg: 'which cnvrg',
    has_bash: 'which bash',
    user_id: 'id -u',
    group_id: 'id -g',
    python_version: 'python --version',
    python3_version: 'python3 --version',
    pip_version: 'pip --version',
    pip3_version: 'pip3 --version'
  }.each { |key, probe| main_info[key] = run_in_main(probe) }

  # For backwards compatibility we still call this slave stats
  @stats = {pod_name: pod_name, node_name: node_name, agent: agent_info, slave: main_info}
end

#get_node_and_pod_namesObject



325
326
327
328
329
330
331
332
333
334
# File 'lib/cnvrg/helpers/executer.rb', line 325

# Determines the pod name (the machine's hostname) and, through
# `kubectl get pod`, the name of the node it runs on. Every step is best
# effort: a failing command or unparsable output yields nil for that value.
#
# @return [Array] [pod_name, node_name]
def get_node_and_pod_names
  pod_name = begin
    `hostname`.strip
  rescue
    nil
  end
  return [pod_name, nil] unless pod_name.present?

  raw_pod = begin
    `kubectl get pod #{pod_name} -o json 2> /dev/null`
  rescue
    nil
  end
  pod_spec = begin
    JSON.parse(raw_pod)
  rescue
    {}
  end
  node_name = begin
    pod_spec["spec"]["nodeName"]
  rescue
    nil
  end
  [pod_name, node_name]
end

#get_node_events(node_name) ⇒ Object



365
366
367
368
# File 'lib/cnvrg/helpers/executer.rb', line 365

# Fetches, across all namespaces, the Kubernetes events whose involved object
# is named `node_name`.
#
# @param node_name [String, nil] node to query; blank skips the lookup
# @return [String, nil] raw `kubectl ... -o json` output, or nil when
#   node_name is blank
def get_node_events(node_name)
  unless node_name.blank?
    selector = "involvedObject.name=#{node_name}"
    `kubectl get event --all-namespaces --field-selector #{selector} -o json`
  end
end

#get_pod_events(pod_name) ⇒ Object



360
361
362
363
# File 'lib/cnvrg/helpers/executer.rb', line 360

# Fetches the Kubernetes events whose involved object is named `pod_name`
# (no namespace flag is passed to kubectl).
#
# @param pod_name [String, nil] pod to query; blank skips the lookup
# @return [String, nil] raw `kubectl ... -o json` output, or nil when
#   pod_name is blank
def get_pod_events(pod_name)
  unless pod_name.blank?
    selector = "involvedObject.name=#{pod_name}"
    `kubectl get event --field-selector #{selector} -o json`
  end
end

#handle_files(files) ⇒ Object



33
34
35
36
37
# File 'lib/cnvrg/helpers/executer.rb', line 33

# Writes every path => content pair of `files` to disk via #create_file_cmd.
#
# @param files [Hash, nil] path => content pairs; nil means nothing to write
# @return the collection that was iterated (`files`, or an empty Hash)
def handle_files(files)
  pending = files || {}
  pending.each { |path, content| create_file_cmd(path, content) }
end

#initObject



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/cnvrg/helpers/executer.rb', line 162

# Registers this executer with the server.
#
# Waits for the main container (#wait_for_main), then PUTs #executer_stats to
# #activity_url until the server answers, making at most 100 attempts and
# sleeping 5 * <failed attempts so far> seconds after each failure. If the
# server replies with a different machine activity, it is written to
# /conf/.machine_activity.yml and adopted as @machine_activity.
def init
  attempts = 0
  connected = false
  puts("Agent started, connecting to #{Cnvrg::API.get_api}")
  STDOUT.flush
  wait_for_main
  until connected || attempts >= 100
    begin
      resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
      raise StandardError.new("Failed to send request to server") unless resp

      reported_activity = resp["machine_activity"]
      connected = true
      puts("Connected to server")
      STDOUT.flush
      Cnvrg::Logger.log_info("Got back machine activity #{reported_activity}")
      if reported_activity.present? && @machine_activity != reported_activity
        Cnvrg::Logger.log_info("Changing to machine activity #{reported_activity}")
        activity_yaml = {slug: reported_activity}.to_yaml
        File.open("/conf/.machine_activity.yml", "w+") { |f| f.write activity_yaml }
        @machine_activity = reported_activity
      end
    rescue => e
      Cnvrg::Logger.log_error(e)
      backoff = 5 * attempts
      Cnvrg::Logger.info("Sleeping for #{backoff}")
      sleep(backoff)
      attempts += 1
    end
  end
end

#main_envObject



119
120
121
# File 'lib/cnvrg/helpers/executer.rb', line 119

# Environment of the main container as reported by `env`.
#
# @return [Array<Array<String>>] one entry per output line, each line split
#   on every "=" character
def main_env
  env_lines = run_in_main("env").split("\n")
  env_lines.map { |entry| entry.split("=") }
end

#main_homedirObject



115
116
117
# File 'lib/cnvrg/helpers/executer.rb', line 115

# Home directory of the main container, read from its HOME environment
# variable.
#
# Only the line that actually starts with "HOME=" is used, and everything
# after the first "=" is returned. The previous `split("=").last` returned the
# wrong value when HOME contained "=", when HOME was empty (it returned the
# literal "HOME"), or when another variable's line also matched `grep -w HOME`.
#
# @return [String, nil] value of HOME in the main container, or nil when no
#   HOME line was returned
def main_homedir()
  home_line = run_in_main("env | grep -w HOME")
              .split("\n")
              .map { |line| line.strip }
              .find { |line| line.start_with?("HOME=") }
  home_line && home_line.split("=", 2).last
end

#main_threadObject



208
209
210
211
212
213
214
215
216
217
# File 'lib/cnvrg/helpers/executer.rb', line 208

# Entry point of the executer: registers with the server (#init), starts the
# polling and main-container watchdog loops in background threads, then runs
# the command execution loop on the calling thread.
def main_thread
  init
  %i[polling_thread check_main_is_working_thread].each do |background_loop|
    Thread.new { send(background_loop) }
  end
  execute_cmds
end

#merge_log_block(logs) ⇒ Object



320
321
322
323
# File 'lib/cnvrg/helpers/executer.rb', line 320

# Merges log entries that share a timestamp into one entry per timestamp.
#
# @param logs [Array<Hash>] entries with :timestamp and :log keys
# @return [Array<Hash>] one {timestamp:, logs:} hash per distinct timestamp
#   (stringified), in first-seen order, with the messages joined by newlines
def merge_log_block(logs)
  by_stamp = logs.group_by { |entry| entry[:timestamp].to_s }
  by_stamp.map do |stamp, entries|
    messages = entries.map { |entry| entry[:log] }
    {timestamp: stamp, logs: messages.join("\n")}
  end
end

#pollObject



152
153
154
155
156
157
158
159
160
# File 'lib/cnvrg/helpers/executer.rb', line 152

# Asks the server for pending work: writes any returned files to disk
# (#handle_files) and pushes every returned command onto @commands_q.
#
# A response without a "commands" entry is treated as "no commands", the same
# way #handle_files treats missing files; previously it raised and was logged
# as an error on every poll. Any other failure is logged and swallowed so the
# polling loop keeps running.
def poll
  resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
  handle_files(resp["files"])
  (resp["commands"] || []).each { |cmd| @commands_q.push(cmd) }
rescue => e
  Cnvrg::Logger.log_error(e)
end

#polling_threadObject



194
195
196
197
198
199
# File 'lib/cnvrg/helpers/executer.rb', line 194

# Endless polling loop: runs #poll, then sleeps @poll_every seconds, forever.
# Intended to be run in its own thread (see #main_thread).
def polling_thread
  loop do
    poll
    sleep(@poll_every)
  end
end

#pre_pod_stopObject



336
337
338
339
340
341
# File 'lib/cnvrg/helpers/executer.rb', line 336

# Collects the Kubernetes events of this pod and of its node and POSTs them
# to the activity's "job_events" endpoint.
#
# @return the result of Cnvrg::API.request
def pre_pod_stop
  pod_name, node_name = get_node_and_pod_names
  payload = {
    pod_events: get_pod_events(pod_name),
    node_events: get_node_events(node_name)
  }
  Cnvrg::API.request("#{activity_url}/job_events", "POST", payload)
end

#run_in_main(command) ⇒ Object



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/cnvrg/helpers/executer.rb', line 123

# Runs a shell command in the main container by POSTing it to the container's
# `command` endpoint (180 second timeout) and returns its output.
#
# Blank lines are dropped from the output. A line containing
# "cnvrg-exit-code" carries the exit status after its "=" and is never part
# of the output; a non-zero status makes the whole call return "".
#
# @param command [String] shell command to run
# @return [String] the output lines joined with "\n"; "" on a non-200
#   response, a non-zero exit code, or any raised error
def run_in_main(command)
  request_body = {cmd: command, async: false, use_sh: true}
  response = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180).post('command', request_body.to_json)

  unless response.to_hash[:status].to_i == 200
    Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
    return ""
  end

  output = []
  response.body.split("\n").each do |line|
    next if line.strip == ""

    unless line.include?("cnvrg-exit-code")
      output << line
      next
    end

    # Exit-code marker line: any non-zero status discards the output.
    unless line.split("=")[1].to_i == 0
      Cnvrg::Logger.log_info("failed to run find command #{command} on main")
      return ""
    end
  end
  output.join("\n")
rescue => e
  Cnvrg::Logger.log_error(e)
  ""
end

#spark_pathObject



111
112
113
# File 'lib/cnvrg/helpers/executer.rb', line 111

# Value of the SPARK_HOME environment variable in the main container.
#
# Only the line that actually starts with "SPARK_HOME=" is used, and
# everything after the first "=" is returned. The previous
# `split("=").last` returned the wrong value when the path contained "=" or
# when another variable's line also matched the unanchored `grep SPARK_HOME`.
#
# @return [String, nil] SPARK_HOME of the main container, or nil when no
#   SPARK_HOME line was returned
def spark_path
  spark_line = run_in_main("env | grep SPARK_HOME")
               .split("\n")
               .map { |line| line.strip }
               .find { |line| line.start_with?("SPARK_HOME=") }
  spark_line && spark_line.split("=", 2).last
end

#start_tiny_if_missingObject



262
263
264
265
266
267
268
269
270
271
272
# File 'lib/cnvrg/helpers/executer.rb', line 262

# Starts /conf/tiny inside the main container through `docker exec`, but only
# when MAIN_CONTAINER_PORT is not set; otherwise this is a no-op.
#
# The docker command runs in a forked, detached child process, so this method
# returns without waiting for it. Side effect: assigns @agent_id and @main_id
# from #containers.
def start_tiny_if_missing
  return unless ENV['MAIN_CONTAINER_PORT'].blank?
  Cnvrg::Logger.log_info("Tiny not found, starting it")
  @agent_id, @main_id = containers
  pid = Process.fork do
    # Logged text now matches the executed command (the closing quote was missing).
    Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny'")
    `docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
  end
  Process.detach(pid)
  Cnvrg::Logger.log_info("Tiny started and detached")
end

#wait_for_mainObject



219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/cnvrg/helpers/executer.rb', line 219

# Blocks until the main container answers its `readiness` endpoint with 200,
# probing every 0.1 seconds with no upper bound.
#
# Before probing it copies the helper files (#copy_file_to_main) and starts
# tiny if required (#start_tiny_if_missing). On success the response body is
# stored, as an integer, in @main_start_time. Connection errors are printed
# only from the fourth failure onwards.
def wait_for_main
  copy_file_to_main
  start_tiny_if_missing
  failures = 0
  puts("Waiting for main container")
  STDOUT.flush
  ready = false
  until ready
    begin
      readiness = Cnvrg::Helpers::Executer.get_main_conn.get('readiness')
      if readiness.to_hash[:status].to_i == 200
        puts("Client container is ready")
        STDOUT.flush
        @main_start_time = readiness.body.to_i
        ready = true
      end
    rescue => e
      failures += 1
      if failures > 3
        puts("Failed to connect to main")
        puts(e.message)
        STDOUT.flush
      end
    end
    # Pause between probes; skipped once the container is ready.
    sleep(0.1) unless ready
  end
end