Module: Bio::Pipengine

Defined in:
lib/bio/pipengine.rb,
lib/bio/pipengine/job.rb,
lib/bio/pipengine/step.rb,
lib/bio/pipengine/sample.rb

Defined Under Namespace

Classes: Job, Sample, Step

Constant Summary collapse

@@logger_error =
Logger.new(STDERR)

Class Method Summary collapse

Class Method Details

.add_job(job, pipeline, step_name, sample) ⇒ Object

# First-run interactive setup. When ~/.torque_rm.yaml does not exist, asks the
# user for the job-submission server and the username to connect with, probes
# the server over ssh for the directory that contains `qsub`, and stores the
# answers in ~/.torque_rm.yaml. Does nothing when the file already exists.
#
# NOTE(review): this listing was reconstructed from a garbled single-line
# rendering (typographic quotes, dropped backslashes and "#{...}" braces, and a
# truncated hash literal). The :hostname key below is an assumption; confirm
# it against whatever reads ~/.torque_rm.yaml.
# NOTE(review): server and username are interpolated into a shell command
# without escaping.
def self.check_config
	config_path = "#{Dir.home}/.torque_rm.yaml"
	return if File.exist?(config_path)

	ARGV.clear # Kernel#gets reads from the files named in ARGV when it is not empty
	current_user = Etc.getlogin
	puts "\nIt seems you are running PipEngine for the first time. Please fill in the following information:"
	print "\nHostname or IP address of authorized server from where jobs will be submitted: ".light_blue
	server = gets.chomp
	print "\n"
	print "Specify the username you will be using to connect and submit jobs [#{current_user}]: ".light_blue
	username = gets.chomp
	username = current_user if username == "" # empty answer keeps the suggested default
	puts "Attempting connection to the server...".green
	# directory part of the qsub executable path on the remote server (nil when ssh printed nothing)
	path = `ssh #{username}@#{server} -t "which qsub"`.split("/qsub").first
	if path =~ %r{/\S+/\S+}
		File.open(config_path, "w") do |file|
			file.write({:hostname => server, :path => path, :user => username}.to_yaml)
		end
		puts "First time configuration completed!".green
		puts "It is strongly recommended to setup a password-less SSH connection to use PipEngine.".green
		exit
	else
		warn "Connection problems detected! Please check that you are able to connect to '#{server}' as '#{username}' via ssh.".red
	end
end #check_config



253
254
255
256
257
# File 'lib/bio/pipengine.rb', line 253

# Adds the command lines of +step_name+ to +job+ for +sample+. When the step
# declares a prerequisite, the prerequisite step is added first (recursively).
#
# job       - object receiving the steps through #add_step
# pipeline  - pipeline definition; step settings are read from pipeline["steps"]
# step_name - name of the step to add
# sample    - passed through unchanged to job.add_step
#
# Returns whatever job.add_step returns.
def self.add_job(job, pipeline, step_name, sample)
	step_settings = pipeline["steps"][step_name]
	step = Bio::Pipengine::Step.new(step_name, step_settings) # parsing step instructions
	if step.has_prerequisite?
		# the prerequisite goes in before the step that depends on it
		add_job(job, pipeline, step.pre, sample)
	end
	job.add_step(step, sample)
end

.check_and_run_multi(samples_file, pipeline, samples_list, options) ⇒ Object

Handles steps that run on multiple samples (i.e. sample-group jobs).



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/bio/pipengine.rb', line 84

# Decides whether the requested steps form a multi-samples job and, if they
# do, creates that single job.
#
# Every step named in options[:steps] is asked whether it is a multi-samples
# step (Step#is_multi?):
#   * a mix of multi and non-multi steps logs an error and exits;
#   * only non-multi steps: nothing is created and false is returned;
#   * otherwise one Sample object per entry of samples_list is built and a
#     single job is created for all of them; true is returned.
def self.check_and_run_multi(samples_file,pipeline,samples_list,options)
	multi_flags = options[:steps].map do |step_name|
		Bio::Pipengine::Step.new(step_name, pipeline["steps"][step_name]).is_multi?
	end

	if multi_flags.include?(false)
		# more than one distinct answer means the step kinds are mixed
		return false unless multi_flags.uniq.size > 1
		@@logger_error.error "\nAbort! You are trying to run both multi-samples and single sample steps in the same job".red
		exit
	end

	samples_obj = samples_list.each_key.each_with_object({}) do |sample_name, built|
		built[sample_name] = Bio::Pipengine::Sample.new(sample_name, samples_list[sample_name], options[:group])
	end
	create_job(samples_file, pipeline, samples_list, options, samples_obj)
	true
end

.check_samples(passed_samples, samples) ⇒ Object

check if sample exists



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/bio/pipengine.rb', line 145

# Verifies that every requested sample is defined in the samples file.
#
# passed_samples - sample names to look for
# samples        - parsed samples file; samples["samples"] maps either a sample
#                  name to its value or a group name to a Hash of samples
#
# Members of a group count as known names; the group name itself does not.
# The first unknown name is logged and the process exits. Returns
# passed_samples when every name is known.
def self.check_samples(passed_samples,samples)
	# The list of known names does not depend on the sample being checked, so it
	# is built once (on first use) instead of once per requested sample.
	known_names = nil
	passed_samples.each do |sample|
		known_names ||= samples["samples"].flat_map do |name, entry|
			entry.kind_of?(Hash) ? entry.keys : [name]
		end
		next if known_names.include? sample
		@@logger_error.error "Sample \"#{sample}\" does not exist in sample file!".red
		exit
	end
end

.check_steps(passed_steps, pipeline) ⇒ Object

check if step exists



163
164
165
166
167
168
169
170
# File 'lib/bio/pipengine.rb', line 163

# Verifies that every requested step is defined in the pipeline.
#
# passed_steps - step names to look for
# pipeline     - parsed pipeline definition; step names are the keys of
#                pipeline["steps"]
#
# The first unknown step is logged and the process exits. Returns
# passed_steps when every step is known.
def self.check_steps(passed_steps,pipeline)
	passed_steps.each do |step|
		next if pipeline["steps"].each_key.include?(step)
		@@logger_error.error "Step \"#{step}\" does not exist in pipeline file!".red
		exit
	end
end

.create_job(samples_file, pipeline, samples_list, options, sample) ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/bio/pipengine.rb', line 102

# Builds a Bio::Pipengine::Job for the requested steps, writes its script and,
# unless options[:dry] is set, submits it.
#
# samples_file - parsed samples file (its "resources" are added to the job)
# pipeline     - parsed pipeline (its "resources" are added to the job)
# samples_list - samples the run works on; its keys are the default sample group
# options      - command line options (:name, :steps, :tmp, :output_dir, :tag,
#                :log, :log_adapter, :multi, :dry are read here)
# sample       - a single sample object (responds to #name), or a Hash of
#                sample objects for a multi-samples job
#
# Returns the result of job.to_script on a dry run, of job.submit otherwise.
def self.create_job(samples_file,pipeline,samples_list,options,sample)
	steps = options[:steps]
	dry_run = options[:dry]
	grouped = sample.kind_of?(Hash)

	# single-sample jobs are named "<sample name>-<steps>"; multi-samples jobs have no sample prefix
	prefix = grouped ? nil : sample.name+"-"
	steps_label = steps.size > 1 ? steps.join("-") : steps.first
	# an explicit name from the command line wins over the generated one
	job_name = options[:name] || "#{prefix}#{steps_label}"

	job = Bio::Pipengine::Job.new(job_name)
	job.local = options[:tmp]
	job.custom_output = options[:output_dir]
	job.custom_name = options[:name] || nil

	# pipeline resources first, then the samples file ones, then the command line
	# tags, which can overwrite resources defined in the two files
	job.add_resources(pipeline["resources"])
	job.add_resources(samples_file["resources"])
	job.add_resources(parse_tag_option(options[:tag]))

	# logging system
	job.log = options[:log]
	job.log_adapter = options[:log_adapter]

	# sample group: the cli option when present, all available samples otherwise
	job.multi_samples = options[:multi] || samples_list.keys
	job.samples_obj = sample if grouped

	# TODO WARNING this can add the same step multiple times if there are multiple dependencies
	steps.each { |step_name| add_job(job, pipeline, step_name, sample) }

	script = job.to_script(options)
	dry_run ? script : job.submit
end

.create_samples(dir) ⇒ Object

create the samples.yml file



186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/bio/pipengine.rb', line 186

# Writes a samples.yml file in the current directory.
#
# dir - collection of directory paths to scan (iterated with #each)
#
# The file starts with a "resources" section whose output is the working
# directory printed by `pwd -L`, followed by a "samples" section with one
# line per sample name. For each scanned path:
#   * when it contains entries whose name matches /Project_/, every entry
#     inside those project folders is taken as a sample;
#   * otherwise every sub-directory of the path is taken as a sample.
# Paths found under the same sample name are joined with commas.
def self.create_samples(dir)
	File.open("samples.yml","w") do |file|
		# `pwd -L` output keeps its trailing newline, exactly as it is printed
		file.write "resources:\n\soutput: #{`pwd -L`}\n\nsamples:\n"
		samples = Hash.new {|hash,key| hash[key] = []}
		dir.each do |path|
			entries = Dir.glob(path+"/*").sort
			projects = entries.select {|entry| File.basename(entry) =~ /Project_/}
			if projects.empty?
				# Dir.exist? instead of Dir.exists?, which was removed in Ruby 3.2
				entries.each {|s| samples[File.basename(s)] << s if Dir.exist? s}
			else
				projects.each do |project_folder|
					Dir.glob(project_folder+"/*").sort.each {|s| samples[File.basename(s)] << s}
				end
			end
		end
		samples.each do |sample, paths|
			file.write "\s"+sample+":\s"+paths.join(",")+"\n"
		end
	end
end

.include(name, filename) ⇒ Object



4
5
6
# File 'lib/bio/pipengine.rb', line 4

# Returns the content of +filename+ with every line indented by two spaces.
#
# name     - not used by this method
# filename - path of the file to read
#
# File.readlines keeps each line's terminator, so the indented lines are
# concatenated as they are; joining them with "\n" again would insert an empty
# line after every line of the included file.
#
# NOTE(review): defined as a module-level method named `include`, this takes
# precedence over Module#include for the enclosing module.
def self.include(name, filename)
	File.readlines(filename).map {|line| "  "+line}.join
end

.inspect_steps(pipeline_file) ⇒ Object

load the pipeline file and show a list of available steps



173
174
175
176
177
178
179
180
181
182
183
# File 'lib/bio/pipengine.rb', line 173

# Loads the pipeline YAML file and prints the pipeline name followed by the
# list of its steps, one "name:  description" line per step.
#
# pipeline_file - path of the pipeline YAML file; the "pipeline" key holds the
#                 name and each entry under "steps" has a "desc" description
def self.inspect_steps(pipeline_file)
	pipeline = YAML.load_file(pipeline_file)
	print "\nPipeline: ".blue, "#{pipeline["pipeline"]}\n\n".green
	puts "List of available steps:".light_blue
	pipeline["steps"].each do |step_name, step|
		print "\s\s#{step_name}:\s\s".blue, "#{step["desc"]}\n".green
	end
	puts "\n"
end

.load_samples_file(file) ⇒ Object

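Loads the samples YAML file and converts every sample value (including the names and values inside sample groups) and every resource name and value to a String.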



259
260
261
262
263
264
265
266
267
268
269
270
271
272
# File 'lib/bio/pipengine.rb', line 259

# Loads the samples YAML file and makes sure its content is made of Strings.
#
# file - path of the samples YAML file; it must have a "samples" and a
#        "resources" section
#
# Under "samples", a plain value is converted with #to_s; for a group (a Hash)
# both the names and the values inside it are converted. Under "resources"
# both names and values are converted. Top-level sample/group names are left
# as loaded.
#
# Returns the loaded Hash with the converted sections.
def self.load_samples_file(file)
	samples_file = YAML.load_file(file)
	# turns every key and value of a Hash into a String
	stringify = lambda do |hash|
		hash.each_with_object({}) { |(key, value), converted| converted[key.to_s] = value.to_s }
	end

	samples = samples_file["samples"]
	samples.each do |name, entry|
		samples[name] = entry.kind_of?(Hash) ? stringify.call(entry) : entry.to_s
	end
	samples_file["resources"] = stringify.call(samples_file["resources"])
	samples_file
end

.parse_tag_option(option_tag) ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/bio/pipengine.rb', line 65

# Converts the "tag=value" strings given on the command line into a Hash.
#
# option_tag - list of "tag=value" strings, or nil/false when the option was
#              not given
#
# Each entry is split on its first "=" only, so a value may itself contain
# "=" (e.g. "opts=--min=3"). An entry without "=" or with an empty tag name is
# reported with the usage message and the process exits, instead of failing
# later while building the Hash. When a tag is repeated the last value wins.
#
# Returns a Hash of tag name => value (empty when option_tag is nil/false).
def self.parse_tag_option(option_tag)
	return {} unless option_tag

	option_tag.each_with_object({}) do |tag, tags|
		key, value = tag.split("=", 2)
		if key.nil? || key.empty? || value.nil?
			@@logger_error.error "\nAbort! Unrecognized values for tag option, please provide the tags as follows: tag1=value1 tag2=value2".red
			exit
		end
		tags[key] = value
	end
end

.run(options) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/bio/pipengine.rb', line 9

# Entry point: loads the pipeline and samples files, validates the requested
# steps and samples, works out the list of samples to process and creates the
# job(s).
#
# options - command line options; :pipeline, :samples_file, :steps, :samples
#           and :group are read here, and :multi is set here when a group is
#           given. The Hash is also passed on to check_and_run_multi and
#           create_job.
#
# Either one multi-samples job is created (by check_and_run_multi) or one job
# per selected sample.
#
# NOTE(review): when options[:group] names a group that is not in the samples
# file, samples_file["samples"][options[:group]] is nil and the calls on it
# below raise NoMethodError; there is no dedicated check for that here.
def self.run(options)

	# reading the yaml files
	# The pipeline file is first rendered as an ERB template using this method's
	# binding, then parsed as YAML.
	pipeline = YAML.load ERB.new(File.read(options[:pipeline])).result(binding)
	samples_file = load_samples_file options[:samples_file]
	
	# make sure all sample names are always Strings
	converted_samples_list = {}
	samples_file["samples"].each_key do |sample|
		if samples_file["samples"][sample].kind_of? Hash # it's a group of samples
			converted_samples_list[sample.to_s] = Hash[samples_file["samples"][sample].map{ |k, v| [k.to_s, v] }]
		else
			converted_samples_list[sample.to_s] = samples_file["samples"][sample]
		end
	end
	samples_file["samples"] = converted_samples_list # replacing original samples hash with the converted one

	# pre-running checks	
	# (both checks log an error and exit on the first unknown name)
	check_steps(options[:steps],pipeline)	
	check_samples(options[:samples],samples_file) if options[:samples]

	# list of samples the jobs will work on
	samples_list = nil
	# check if a group is specified
	if options[:group]
		# only the samples of that group, optionally restricted to the ones passed in options[:samples]
		samples_list = options[:samples] ? samples_file["samples"][options[:group]].select {|k,v| options[:samples].include? k} : samples_file["samples"][options[:group]]
		options[:multi] = samples_list.keys 
		# the group name is appended in place to the output path
		samples_file["resources"]["output"] << "/#{options[:group]}"	
	else # if not, proceed normalizing the sample list to remove groups and get a list of all samples
		full_list_samples = {}
		samples_file["samples"].each_key do |k| 
			if samples_file["samples"][k].kind_of? Hash
				full_list_samples.merge! samples_file["samples"][k]
			else
				full_list_samples[k] = samples_file["samples"][k]
			end
		end
		samples_list = options[:samples] ? full_list_samples.select {|k,v| options[:samples].include? k} : full_list_samples
	end
		
	########### START ###########

	# create output directory (jobs scripts will be saved there)
	FileUtils.mkdir_p samples_file["resources"]["output"] #unless options[:dry] #&& options[:spooler]!="pbs"

	# check if the requested steps are multi-samples
	# (when they are, check_and_run_multi has already created the job and returns true)
	run_multi = check_and_run_multi(samples_file,pipeline,samples_list,options)
	
	unless run_multi # there are no multi-samples steps, so iterate on samples and create one job per sample
		samples_list.each_key do |sample_name|
				sample = Bio::Pipengine::Sample.new(sample_name.to_s,samples_list[sample_name],options[:group])
			create_job(samples_file,pipeline,samples_list,options,sample)
		end
	end
end