Module: CodeRunner::Slurm

Included in:
Helios, Stampede
Defined in:
lib/coderunner/system_modules/slurm.rb

Overview

A module to let CodeRunner run using the SLURM queue system, used on certain HPC systems.

Instance Method Summary collapse

Instance Method Details

#batch_script ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/coderunner/system_modules/slurm.rb', line 53

# Builds the text of the SLURM batch script for this run.
#
# Reads @nprocs (a "NODESxPPN" string; when the "xPPN" part is missing,
# max_ppn cores per node are used), @wall_mins (required, in minutes, may be
# fractional) and the optional @project and @queue settings.
#
# Returns the script as a String: the shebang, the #SBATCH header lines, the
# output of code_run_environment and an informational echo line.
#
# Raises RuntimeError if more than max_ppn cores per node are requested or if
# @wall_mins is not set.
def batch_script
  nodes, ppn = @nprocs.split(/x/)
  # Apply the default before validating, so that a bare node count is not
  # reported as underusing the nodes.
  ppn ||= max_ppn
  eputs "Warning: Underuse of nodes (#{ppn} cores per node instead of #{max_ppn})" if ppn.to_i < max_ppn
  raise "Error: cores per node cannot excede #{max_ppn}" if ppn.to_i > max_ppn
  # raise "Error: project (i.e. budget) not specified" unless @project
  raise "Please specify wall minutes" unless @wall_mins
  ep @wall_mins

  # Split the (possibly fractional) minutes into HH:MM:SS.
  hours = (@wall_mins / 60).floor
  mins = @wall_mins.to_i % 60
  secs = ((@wall_mins - @wall_mins.to_i) * 60).to_i
  walltime = sprintf("%02d:%02d:%02d", hours, mins, secs)
  eputs "Allotted wall time is " + walltime

  nprocstot = nodes.to_i * ppn.to_i

  # The heredoc body must stay at column 0: it is emitted verbatim.
  <<EOF
#!/bin/bash
#SBATCH -J #{executable_name}.#{job_identifier} # jobname
#SBATCH -N #{nodes.to_i}        # number of nodes
#SBATCH -n #{nprocstot}         # number of tasks
#SBATCH -o #{executable_name}.#{job_identifier}.o%j              # strout filename (%j is jobid)
#SBATCH -e #{executable_name}.#{job_identifier}.e%j               # stderr filename (%j is jobid)
#{@project ? "#SBATCH -A #@project # project to charge" : ""}
#{@queue ? "#SBATCH -p #@queue # submission queue" : ""}
#SBATCH -t #{walltime} # walltime

#{code_run_environment}
echo "Submitting #{nodes}x#{ppn} job on #{CodeRunner::SYS} for project #@project..."



EOF
end

#batch_script_file ⇒ Object



45
46
47
# File 'lib/coderunner/system_modules/slurm.rb', line 45

# Name of the batch script file for this run:
# "<executable_name>.<job_identifier>.sh".
def batch_script_file
  sprintf('%s.%s.sh', executable_name, job_identifier)
end

#cancel_job ⇒ Object



89
90
91
92
93
94
95
96
# File 'lib/coderunner/system_modules/slurm.rb', line 89

# Cancels the job for this run.
#
# When the CODE_RUNNER_LAUNCHER environment variable is set to a non-empty
# value, a "<pid of this process>.stop" file is written into
# $HOME/.coderunner_to_launch_<launcher>/; otherwise `scancel` is run on
# @job_no.
def cancel_job
  launcher = ENV['CODE_RUNNER_LAUNCHER']
  return %x[scancel #{@job_no}] if launcher.nil? || launcher.empty?

  stop_file = ENV['HOME'] + "/.coderunner_to_launch_#{launcher}/#{$$}.stop"
  File.open(stop_file, 'w') { |marker| marker.puts "\n" }
end

#error_file ⇒ Object



98
99
100
# File 'lib/coderunner/system_modules/slurm.rb', line 98

# Name of the stderr file for this run:
# "<executable_name>.<job_identifier>.e<@job_no>".
def error_file
  sprintf('%s.%s.e%s', executable_name, job_identifier, @job_no)
end

#execute ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/coderunner/system_modules/slurm.rb', line 29

# Starts the run.
#
# Without a launcher (CODE_RUNNER_LAUNCHER unset or empty) the batch script
# followed by run_command is written to batch_script_file and handed to
# `sbatch`; nil is returned (the sbatch output is not parsed).
#
# With a launcher, a "<timestamp><pid>.start" request containing the working
# directory and run_command is written into
# $HOME/.coderunner_to_launch_<launcher>/. The method then polls once a
# second, with no timeout, until the matching ".pid" file appears, reads the
# pid from it, removes the file and returns the pid as an Integer.
def execute
  launcher = ENV['CODE_RUNNER_LAUNCHER']
  if launcher.nil? || launcher.empty?
    File.open(batch_script_file, 'w') { |script| script.puts batch_script + run_command + "\n" }
    %x[sbatch #{batch_script_file}]
    nil
  else
    stamp = "#{Time.now.to_i}#{$$}"
    base = ENV['HOME'] + "/.coderunner_to_launch_#{launcher}/#{stamp}"
    pid_file = base + '.pid'
    File.open(base + '.start', 'w') { |request| request.puts "cd #{Dir.pwd};#{run_command}" }
    sleep 1 until FileTest.exist?(pid_file)
    launched_pid = File.read(pid_file).to_i
    FileUtils.rm pid_file
    launched_pid
  end
end

#get_run_status(job_no, current_status) ⇒ Object



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/coderunner/system_modules/slurm.rb', line 106

# Works out the state of job +job_no+ from queue listing text.
#
# job_no         - the job number to look for (converted with to_s).
# current_status - the queue listing as a String, one job per line.
#
# Returns :Unknown when running under a launcher (CODE_RUNNER_LAUNCHER set
# and non-empty) or when no line mentions the job. Otherwise sets @running
# and returns :Queueing for a " PD " column, :Running for " R ", and
# :Unknown (with @running reset to false) for " C ".
#
# Raises RuntimeError if the job's line shows none of those state columns.
def get_run_status(job_no, current_status)
  launcher = ENV['CODE_RUNNER_LAUNCHER']
  return :Unknown if launcher && !launcher.empty?

  # The job number must not be part of a longer run of digits; otherwise
  # job 1234 would also pick up the line belonging to job 12345.
  job_pattern = /(?<!\d)#{Regexp.escape(job_no.to_s)}(?!\d)/
  line = current_status.split(/\n/).grep(job_pattern)[0]
  return :Unknown unless line

  @running = true
  case line
  when /\sPD\s/
    :Queueing
  when /\sR\s/
    :Running
  when /\sC\s/
    @running = false
    :Unknown
  else
    ep 'line', line
    raise 'Could not get run status'
  end
end

#max_ppn ⇒ Object



49
50
51
# File 'lib/coderunner/system_modules/slurm.rb', line 49

# Maximum number of cores per node. This default always raises a
# RuntimeError asking for the method to be defined for the system in use.
def max_ppn
  raise RuntimeError, "Please define max_ppn for your system"
end

#mpi_prog ⇒ Object



23
24
25
26
27
# File 'lib/coderunner/system_modules/slurm.rb', line 23

# The MPI launcher command for this run: "mpirun -np <total processes>".
#
# The total is nodes * cores-per-node, taken from @nprocs ("NODESxPPN").
# When the "xPPN" part is missing, max_ppn is used, matching batch_script;
# without that default the command would ask for zero processes.
def mpi_prog
  nodes, ppn = @nprocs.split(/x/)
  ppn ||= max_ppn
  nprocstot = nodes.to_i * ppn.to_i
  "mpirun -np #{nprocstot}"
end

#output_file ⇒ Object



102
103
104
# File 'lib/coderunner/system_modules/slurm.rb', line 102

# Name of the stdout file for this run:
# "<executable_name>.<job_identifier>.o<@job_no>".
def output_file
  sprintf('%s.%s.o%s', executable_name, job_identifier, @job_no)
end

#queue_status ⇒ Object



6
7
8
9
10
11
12
13
# File 'lib/coderunner/system_modules/slurm.rb', line 6

# Returns the current queue listing as a String.
#
# Under a launcher (CODE_RUNNER_LAUNCHER set and non-empty) this is the
# concatenated contents of queue_status.txt and queue_status2.txt in
# $HOME/.coderunner_to_launch_<launcher>/. Otherwise it is the output of
# `squeue` filtered by the first eight characters of $USER (presumably
# because squeue truncates the user column - confirm).
def queue_status
  launcher = ENV['CODE_RUNNER_LAUNCHER']
  if launcher.nil? || launcher.empty?
    `squeue | grep #{ENV['USER'][0..7]}`
  else
    launch_dir = "#{ENV['HOME']}/.coderunner_to_launch_#{launcher}"
    primary = `cat #{launch_dir}/queue_status.txt`
    secondary = `cat #{launch_dir}/queue_status2.txt`
    primary + secondary
  end
end

#run_command ⇒ Object



15
16
17
18
19
20
21
22
# File 'lib/coderunner/system_modules/slurm.rb', line 15

# The shell command that runs the executable.
#
# Under a launcher (CODE_RUNNER_LAUNCHER set and non-empty) it is an
# `mpiexec -np <@nprocs>` line with stdout and stderr redirected to
# output_file and error_file. Otherwise it is @preamble followed by mpi_prog,
# the executable path and the parameter string.
def run_command
  launcher = ENV['CODE_RUNNER_LAUNCHER']
  via_launcher = !launcher.nil? && !launcher.empty?
  unless via_launcher
    return "#{@preamble} #{mpi_prog}  #{executable_location}/#{executable_name} #{parameter_string}"
  end

  "mpiexec -np #{@nprocs} #{executable_location}/#{executable_name} #{parameter_string}" \
    " > #{output_file} 2> #{error_file}"
end