Module: CodeRunner::Slurm

Includes:
Launcher
Included in:
Helios, Stampede
Defined in:
lib/coderunner/system_modules/slurm.rb

Overview

A system module for the SLURM batch scheduler, which is used on certain HPC systems.

Instance Method Summary

Methods included from Launcher

#cancel_job_launcher, #error_file_launcher, #execute_launcher, #launcher_prefix, #output_file_launcher, #queue_status_launcher, #use_launcher

Instance Method Details

#batch_script ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/coderunner/system_modules/slurm.rb', line 49

# Builds the header of the SLURM batch script for this run and returns it
# as a String (#execute appends the run command before writing the file).
#
# Reads @nprocs as "<nodes>x<cores per node>"; when the cores-per-node part
# is omitted it defaults to max_ppn. Reads @wall_mins (minutes, may be
# fractional) and converts it to HH:MM:SS. @project and @queue are optional
# and produce an empty line when unset.
#
# Side effects: prints a warning via eputs when fewer than max_ppn cores per
# node are requested, and reports the allotted wall time.
#
# Raises RuntimeError when more than max_ppn cores per node are requested or
# when @wall_mins is not set.
def batch_script
	nodes, ppn = @nprocs.split(/x/)
	# Apply the default before validating: an omitted cores-per-node value
	# means "use the whole node" and must not trigger the under-use warning.
	ppn ||= max_ppn
	eputs "Warning: Underuse of nodes (#{ppn} cores per node instead of #{max_ppn})" if ppn.to_i < max_ppn
	raise "Error: cores per node cannot excede #{max_ppn}" if ppn.to_i > max_ppn
#		raise "Error: project (i.e. budget) not specified" unless @project
	raise "Please specify wall minutes" unless @wall_mins
	ep @wall_mins
	# Split the (possibly fractional) minute count into hours, minutes, seconds.
	hours = (@wall_mins / 60).floor
	mins = @wall_mins.to_i % 60
	secs = ((@wall_mins - @wall_mins.to_i) * 60).to_i
	walltime = sprintf("%02d:%02d:%02d", hours, mins, secs)
	eputs "Allotted wall time is " + walltime
	nprocstot = nodes.to_i * ppn.to_i
<<EOF
#!/bin/bash
#SBATCH -J #{executable_name}.#{job_identifier} # jobname
#SBATCH -N #{nodes.to_i}        # number of nodes
#SBATCH -n #{nprocstot}         # number of tasks
#SBATCH -o #{executable_name}.#{job_identifier}.o%j              # strout filename (%j is jobid)
#SBATCH -e #{executable_name}.#{job_identifier}.e%j               # stderr filename (%j is jobid)
#{@project ? "#SBATCH -A #@project # project to charge" : ""}
#{@queue ? "#SBATCH -p #@queue # submission queue" : ""}
#SBATCH -t #{walltime} # walltime

#{code_run_environment}
echo "Submitting #{nodes}x#{ppn} job on #{CodeRunner::SYS} for project #@project..."



EOF

end

#batch_script_file ⇒ Object



41
42
43
# File 'lib/coderunner/system_modules/slurm.rb', line 41

# Name of the batch script file for this run:
# "<executable_name>.<job_identifier>.sh".
def batch_script_file
	format("%s.%s.sh", executable_name, job_identifier)
end

#cancel_job ⇒ Object



85
86
87
# File 'lib/coderunner/system_modules/slurm.rb', line 85

# Cancels this run's job: delegates to cancel_job_launcher when the launcher
# is in use, otherwise shells out to scancel with @job_no and returns the
# command's output.
def cancel_job
	return cancel_job_launcher if use_launcher

	%x[scancel #{@job_no}]
end

#error_file ⇒ Object



89
90
91
92
# File 'lib/coderunner/system_modules/slurm.rb', line 89

# Name of the file holding the job's standard error: the launcher's error
# file when the launcher is in use, otherwise
# "<executable_name>.<job_identifier>.e<job number>".
def error_file
	if use_launcher
		error_file_launcher
	else
		"#{executable_name}.#{job_identifier}.e#{@job_no}"
	end
end

#execute ⇒ Object



31
32
33
34
35
36
37
38
39
# File 'lib/coderunner/system_modules/slurm.rb', line 31

# Starts the run. With the launcher in use, returns whatever
# execute_launcher returns. Otherwise writes batch_script followed by
# run_command to batch_script_file, submits that file with sbatch, and
# returns nil (the sbatch output is discarded).
def execute
	return execute_launcher if use_launcher

	File.open(batch_script_file, 'w') do |script|
		script.puts batch_script + run_command + "\n"
	end
	`sbatch #{batch_script_file}`
	nil
end

#get_run_status(job_no, current_status) ⇒ Object



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/coderunner/system_modules/slurm.rb', line 99

# Works out the state of job +job_no+ from a queue listing.
#
# job_no         - the job id to look for (converted with to_s).
# current_status - the queue listing text, one job per line.
#
# Returns :Queueing when the job's line shows state PD, :Running for R, and
# :Unknown when the launcher is in use, the job is not listed, or the state
# is C. Sets @running to true when the job is listed and back to false for
# state C.
#
# Raises RuntimeError when the job's line shows none of those state codes.
def get_run_status(job_no, current_status)
	return :Unknown if use_launcher

	# Match the id only where it is not part of a longer number, so that
	# looking up job 123 cannot pick the line belonging to job 1234.
	job_pattern = /(?<!\d)#{Regexp.escape(job_no.to_s)}(?!\d)/
	line = current_status.split(/\n/).grep(job_pattern)[0]
	return :Unknown unless line

	@running = true
	case line
	when /\sPD\s/
		:Queueing
	when /\sR\s/
		:Running
	when /\sC\s/
		@running = false
		:Unknown
	else
		ep 'line', line
		raise 'Could not get run status'
	end
end

#max_ppn ⇒ Object



45
46
47
# File 'lib/coderunner/system_modules/slurm.rb', line 45

# Placeholder for the maximum number of cores per node. This default always
# raises RuntimeError, asking for max_ppn to be defined for the system.
def max_ppn
	raise RuntimeError, "Please define max_ppn for your system"
end

#mpi_prog ⇒ Object



25
26
27
28
29
# File 'lib/coderunner/system_modules/slurm.rb', line 25

# Returns the MPI launch prefix "mpirun -np <total processes>".
#
# @nprocs is read as "<nodes>x<cores per node>" and the total is their
# product. When the cores-per-node part is omitted it defaults to max_ppn,
# matching the task count that batch_script requests, instead of yielding
# "-np 0".
def mpi_prog
	nodes, ppn = @nprocs.split(/x/)
	ppn ||= max_ppn
	"mpirun -np #{nodes.to_i * ppn.to_i}"
end

#output_file ⇒ Object



94
95
96
97
# File 'lib/coderunner/system_modules/slurm.rb', line 94

# Name of the file holding the job's standard output: the launcher's output
# file when the launcher is in use, otherwise
# "<executable_name>.<job_identifier>.o<job number>".
def output_file
	if use_launcher
		output_file_launcher
	else
		"#{executable_name}.#{job_identifier}.o#{@job_no}"
	end
end

#queue_status ⇒ Object



8
9
10
11
12
13
14
15
# File 'lib/coderunner/system_modules/slurm.rb', line 8

# Returns the current queue listing: the launcher's own status when the
# launcher is in use, otherwise the output of squeue restricted to $USER
# (expanded by the shell).
def queue_status
	return queue_status_launcher if use_launcher

	# Earlier variant, kept for reference:
	#%x[squeue | grep #{ENV['USER'][0..7]}]
	`squeue -u $USER`
end

#run_command ⇒ Object



17
18
19
20
21
22
23
24
# File 'lib/coderunner/system_modules/slurm.rb', line 17

# Returns the shell command line that runs the executable.
#
# Without the launcher: @preamble, the mpi_prog prefix, the executable path
# and parameter_string. With the launcher: an mpiexec invocation using
# @nprocs as the process count, with stdout and stderr redirected to the
# launcher's output and error files.
def run_command
# 		"qsub #{batch_script_file}"
	unless use_launcher
		return "#{@preamble} #{mpi_prog}  #{executable_location}/#{executable_name} #{parameter_string}"
	end

	"mpiexec -np #{@nprocs} #{executable_location}/#{executable_name} #{parameter_string} > #{output_file_launcher} 2> #{error_file_launcher}"
end