Class: WorkflowManager::FGCZCluster

Inherits:
Cluster
  • Object
show all
Defined in:
lib/workflow_manager/cluster.rb

Direct Known Subclasses

FGCZCourseCluster

Instance Attribute Summary

Attributes inherited from Cluster

#log_dir, #name, #options

Instance Method Summary collapse

Methods inherited from Cluster

#default_node, #generate_new_job_script, #initialize

Constructor Details

This class inherits a constructor from WorkflowManager::Cluster

Instance Method Details

#cluster_nodes ⇒ Object



218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/workflow_manager/cluster.rb', line 218

# Static table of selectable cluster nodes.
#
# Each key is a human-readable label ("<host>: cpu .., mem .., scr ..") and
# each value is the bare host name, i.e. the part of the label before the
# first colon.
#
# @return [Hash{String => String}] label => host name, in the order listed
def cluster_nodes
  labels = [
    'fgcz-c-043: cpu 24,mem  23 GB,scr  11T',
    'fgcz-c-044: cpu 16,mem 128 GB,scr 500G',
    'fgcz-c-045: cpu 64,mem 504 GB,scr  15T',
    'fgcz-c-046: cpu 64,mem 504 GB,scr  11T',
    'fgcz-c-047: cpu 32,mem   1 TB,scr  28T',
    'fgcz-c-048: cpu 48,mem 252 GB,scr 3.5T',
    'fgcz-c-049: cpu  8,mem  63 GB,scr 1.7T',
    'fgcz-c-051: cpu  8,mem  31 GB,scr 800G',
    'fgcz-c-052: cpu  8,mem  31 GB,scr 800G',
    'fgcz-c-053: cpu  8,mem  31 GB,scr 800G',
    'fgcz-c-054: cpu  8,mem  31 GB,scr 800G',
    'fgcz-c-055: cpu  8,mem  31 GB,scr 800G',
    'fgcz-c-057: cpu  8,mem  31 GB,scr 200G',
    'fgcz-c-058: cpu  8,mem  31 GB,scr 200G',
    'fgcz-c-059: cpu  8,mem  31 GB,scr 200G',
    'fgcz-c-061: cpu  8,mem  31 GB,scr 200G',
    'fgcz-c-063: cpu 12,mem  70 GB,scr 450G',
    'fgcz-c-065: cpu 24,mem  70 GB,scr 197G',
    'fgcz-h-004: cpu 8,mem  30 GB,scr 400G',
    'fgcz-h-009: cpu 8,mem  30 GB,scr 500G',
    'fgcz-h-010: cpu 8,mem  30 GB,scr 400G'
  ]

  # The host name is always the text in front of the first ':' of the label.
  labels.each_with_object({}) do |label, table|
    table[label] = label.split(':', 2).first
  end
end

#copy_commands(org_dir, dest_parent_dir, now = nil, queue = "light") ⇒ Object



202
203
204
205
206
207
208
209
210
211
# File 'lib/workflow_manager/cluster.rb', line 202

# Builds the g-req command used to copy +org_dir+ into +dest_parent_dir+.
#
# @param org_dir [String] source path, interpolated into the command as-is
# @param dest_parent_dir [String] destination parent directory, interpolated as-is
# @param now [String, Boolean, nil] "force" selects `g-req copynow -f`,
#   any other truthy value selects `g-req copynow`, nil/false selects `g-req -w copy`
# @param queue [String] accepted for interface compatibility; not used by this implementation
# @return [Array<String>] a one-element array holding the command line
def copy_commands(org_dir, dest_parent_dir, now=nil, queue="light")
  command =
    if now == "force"
      "g-req copynow -f #{org_dir} #{dest_parent_dir}"
    elsif now
      "g-req copynow #{org_dir} #{dest_parent_dir}"
    else
      "g-req -w copy #{org_dir} #{dest_parent_dir}"
    end
  [command]
end

#delete_command(target) ⇒ Object



215
216
217
# File 'lib/workflow_manager/cluster.rb', line 215

# Builds the g-req command line that removes +target+.
#
# @param target [String] path to remove, interpolated into the command as-is
# @return [String] the command line
def delete_command(target)
  "g-req remove #{target}"
end

#job_ends?(log_file) ⇒ Boolean

Returns:

  • (Boolean)


177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/workflow_manager/cluster.rb', line 177

# Reports whether a job's log shows the end-of-script marker.
#
# Runs `tail -n 10` on +log_file+ and looks for the text "__SCRIPT END__" in
# the output. The command is passed as an argument vector rather than a shell
# string, so paths containing spaces or shell metacharacters are handled
# literally. tail's error output is discarded, so a missing or unreadable
# file simply yields false.
#
# @param log_file [String] path of the log file to inspect
# @return [Boolean] true if one of the last 10 lines contains the marker
def job_ends?(log_file)
  # "--" keeps a path that starts with "-" from being parsed as an option.
  tail_command = ['tail', '-n', '10', '--', log_file.to_s]
  IO.popen(tail_command, err: File::NULL) do |io|
    io.each_line.any? { |line| line.include?('__SCRIPT END__') }
  end
end

#job_pending?(job_id) ⇒ Boolean

Returns:

  • (Boolean)


189
190
191
192
193
194
195
196
197
198
199
200
201
# File 'lib/workflow_manager/cluster.rb', line 189

# Reports whether the given job is listed by qstat in a waiting state.
#
# Runs `qstat -u "*"` and scans the output for a row whose first
# whitespace-separated column equals +job_id+ and whose fifth column (the
# state) contains "qw".
#
# @param job_id [String, Integer] job ID to look for; compared as a string
# @return [Boolean] true if such a row was found
def job_pending?(job_id)
  wanted = job_id.to_s
  IO.popen('qstat -u "*"') do |io|
    io.each_line.any? do |line|
      jobid, _prior, _name, _user, state = line.split
      # A blank row yields no columns; jobid and state are then nil and the
      # row is treated as a non-match instead of raising.
      jobid == wanted && state.to_s.include?('qw')
    end
  end
end

#job_running?(job_id) ⇒ Boolean

Returns:

  • (Boolean)


164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/workflow_manager/cluster.rb', line 164

# Reports whether the given job is listed by qstat as running.
#
# Runs `qstat -u "*"` and scans the output for a row whose first
# whitespace-separated column equals +job_id+ and whose fifth column (the
# state) is exactly "r".
#
# @param job_id [String, Integer] job ID to look for; compared as a string
# @return [Boolean] true if such a row was found
def job_running?(job_id)
  wanted = job_id.to_s
  IO.popen('qstat -u "*"') do |io|
    io.each_line.any? do |line|
      jobid, _prior, _name, _user, state = line.split
      # A blank row yields no columns; jobid and state are then nil and the
      # row is treated as a non-match instead of raising.
      jobid == wanted && state == 'r'
    end
  end
end

#kill_command(job_id) ⇒ Object



212
213
214
# File 'lib/workflow_manager/cluster.rb', line 212

# Builds the qdel command line that cancels the given job.
#
# @param job_id [String, Integer] job ID, interpolated into the command as-is
# @return [String] the command line
def kill_command(job_id)
  "qdel #{job_id}"
end

#node_list ⇒ Object



243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# File 'lib/workflow_manager/cluster.rb', line 243

# Builds the node table from live qhost output.
#
# Two commands are parsed:
# * `qhost -F scratch` -- a row whose first column contains "fgcz" names a
#   host; the next non-host row supplies that host's scratch value (the text
#   after the last "=" of its last column, truncated to an integer).
# * `qhost -q` -- a row whose first column contains "fgcz" names a host and
#   supplies its total memory (5th column); a following row whose first
#   column is "GT" supplies the slot figure (last column). "GT" rows whose
#   last column contains "d" or "u" are ignored.
#
# @return [Hash{String => String}]
#   "<host>: cores <slots>, ram <n>G, scr <scratch>" => host name
def node_list
  # Pass 1: scratch size per host.
  scratch_by_host = {}
  current_host = nil
  IO.popen("qhost -F scratch") do |pipe|
    pipe.each_line do |row|
      columns = row.split
      if columns.first =~ /fgcz/
        current_host = columns.first
        next
      end

      reported = columns.last
      amount = reported && reported.split('=').last
      next unless amount

      # Rows seen before any host row are filed under a nil key; that entry
      # is never looked up later.
      scratch_by_host[current_host] = amount.to_i
      current_host = nil
    end
  end

  # Pass 2: slots and memory per host, taken from each host's "GT" row.
  specs_by_host = {}
  pending = nil
  IO.popen('qhost -q') do |pipe|
    pipe.each_line do |row|
      # HOSTNAME                ARCH         NCPU  LOAD  MEMTOT  MEMUSE  SWAPTO  SWAPUS
      columns = row.split
      label = columns.first
      if label =~ /fgcz/
        ram_gb = columns[4].gsub(/G/, '').to_i
        pending = [label, "#{ram_gb}G"]
      elsif label == "GT" && pending
        slots = columns.last
        # A "d" or "u" in the last column disqualifies this row; the host
        # stays pending in case another "GT" row follows.
        next if slots =~ /[du]/

        host, ram = pending
        scratch = scratch_by_host[host]
        scratch_text =
          if scratch && scratch >= 1000
            format("%.1f", scratch.to_f / 1000) + "T"
          else
            "#{scratch}G"
          end
        specs_by_host[host] = [slots, ram, scratch_text]
        pending = nil
      end
    end
  end

  # Reformat into label => host name.
  specs_by_host.each_with_object({}) do |(host, (slots, ram, scratch_text)), menu|
    # fgcz-c-047 has been tentatively taken out of use (masa, 2019-08-23).
    next if host =~ /fgcz-c-047/

    menu["#{host}: cores #{slots}, ram #{ram}, scr #{scratch_text}"] = host
  end
end

#submit_job(script_file, script_content, option = '') ⇒ Object



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/workflow_manager/cluster.rb', line 147

# Submits a shell script to the cluster through g-sub.
#
# The script's base name is cut off after its first ".sh", a new job script
# is produced by +generate_new_job_script+, and g-sub is run with stdout and
# stderr log files placed in @log_dir.
#
# @param script_file [String] path of the script; its base name must contain ".sh"
# @param script_content [String] content handed to +generate_new_job_script+
# @param option [String] extra options inserted verbatim into the g-sub command line
# @return [Array(String, String, String)] job ID, stdout log path, executed command
# @raise [RuntimeError] if the script name does not contain ".sh", or if no
#   job ID can be read from the g-sub output
def submit_job(script_file, script_content, option='')
  script_name = File.basename(script_file)
  unless script_name =~ /\.sh/
    err_msg = "FGCZCluster#submit_job, ERROR: script_name is not *.sh: #{script_name}"
    warn err_msg
    raise err_msg
  end

  # Drop anything that follows the first ".sh" in the base name.
  script_name = script_name.split(/\.sh/).first + ".sh"
  new_job_script = generate_new_job_script(script_name, script_content)
  new_job_script_base = File.basename(new_job_script)
  log_file = File.join(@log_dir, new_job_script_base + "_o.log")
  err_file = File.join(@log_dir, new_job_script_base + "_e.log")
  command = "g-sub -o #{log_file} -e #{err_file} #{option} #{new_job_script}"
  output = `#{command}`

  # Without this check a failed submission surfaced as a NoMethodError on nil.
  submitted = output.match(/Your job (\d+) \(/)
  unless submitted
    err_msg = "FGCZCluster#submit_job, ERROR: no job ID in g-sub output: #{output.inspect} (command: #{command})"
    warn err_msg
    raise err_msg
  end

  [submitted[1], log_file, command]
end