Module: Bio::Pipengine

Defined in:
lib/bio/pipengine.rb,
lib/bio/pipengine/job.rb,
lib/bio/pipengine/step.rb,
lib/bio/pipengine/sample.rb

Defined Under Namespace

Classes: Job, Sample, Step

Constant Summary collapse

@@logger_error =
Logger.new(STDERR)

Class Method Summary collapse

Class Method Details

.add_job(job, pipeline, step_name, sample) ⇒ Object

(Note: the text below appears to be commented-out legacy code for a `check_config` helper that the documentation generator picked up as this method's description; it does not describe `add_job`.)

    def self.check_config
      unless File.exists?("#{Dir.home}/.torque_rm.yaml")
        ARGV.clear
        current_user = Etc.getlogin
        puts "\nIt seems you are running PipEngine for the first time. Please fill in the following information:"
        print "\nHostname or IP address of authorized server from where jobs will be submitted: ".light_blue
        server = gets.chomp
        print "\n"
        print "Specify the username you will be using to connect and submit jobs [#{current_user}]: ".light_blue
        username = gets.chomp
        username = (username == "") ? current_user : username
        puts "Attempting connection to the server...".green
        path = `ssh #{username}@#{server} -t "which qsub"`.split("/qsub").first
        unless path=~/\/\S+\/\S+/
          warn "Connection problems detected! Please check that you are able to connect to '#{server}' as '#{username}' via ssh.".red
        else
          file = File.open("#{Dir.home}/.torque_rm.yaml","w")
          file.write({:server => server, :path => path, :user => username}.to_yaml)
          file.close
          puts "First time configuration completed!".green
          puts "It is strongly recommended to setup a password-less SSH connection to use PipEngine.".green
          exit
        end
      end
    end #check_config



241
242
243
244
245
# File 'lib/bio/pipengine.rb', line 241

# Adds a step to a job, preceded by its chain of prerequisite steps.
#
# A Step is built from +pipeline["steps"][step_name]+. When that step reports
# a prerequisite, the method first recurses on +step.pre+, so prerequisites
# reach the job before the step that depends on them.
#
# @param job [Object] receiver of +add_step(step, sample)+
# @param pipeline [Hash] parsed pipeline definition with a "steps" entry
# @param step_name [String] key of the step inside +pipeline["steps"]+
# @param sample [Object] handed unchanged to +job.add_step+
# @return [Object] whatever +job.add_step+ returns for the requested step
def self.add_job(job, pipeline, step_name, sample)
  step_definition = pipeline["steps"][step_name]
  step = Bio::Pipengine::Step.new(step_name, step_definition) # parsing step instructions
  # prerequisites go in first so their command lines precede this step's
  if step.has_prerequisite?
    add_job(job, pipeline, step.pre, sample)
  end
  job.add_step(step, sample) # adding step command lines to the job
end

.check_and_run_multi(samples_file, pipeline, samples_list, options) ⇒ Object

handle steps that run on multiple samples (i.e. sample groups job)



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/bio/pipengine.rb', line 72

# Decides whether the requested steps form a multi-samples job and, if so,
# creates that job.
#
# Each name in +options[:steps]+ is turned into a Step and asked +is_multi?+.
# * at least one +false+ answer mixed with other answers: logs an error and
#   calls +exit+
# * only +false+ answers: returns +false+ without creating anything
# * no +false+ answer: builds one Sample per key of +samples_list+, passes
#   the resulting Hash to +create_job+ and returns +true+
#
# @param samples_file [Hash] forwarded to +create_job+
# @param pipeline [Hash] parsed pipeline definition with a "steps" entry
# @param samples_list [Hash] sample name => sample value
# @param options [Hash] reads +:steps+ and +:group+
# @return [Boolean] +true+ when a multi-samples job was created
def self.check_and_run_multi(samples_file,pipeline,samples_list,options)
  multi_flags = options[:steps].map do |step_name|
    Bio::Pipengine::Step.new(step_name, pipeline["steps"][step_name]).is_multi?
  end

  if multi_flags.include?(false)
    # every requested step is single-sample: nothing to do here
    return false unless multi_flags.uniq.size > 1

    @@logger_error.error "\nAbort! You are trying to run both multi-samples and single sample steps in the same job".red
    exit
  end

  samples_obj = samples_list.each_key.each_with_object({}) do |sample_name, collected|
    collected[sample_name] = Bio::Pipengine::Sample.new(sample_name, samples_list[sample_name], options[:group])
  end
  create_job(samples_file, pipeline, samples_list, options, samples_obj)
  true
end

.check_samples(passed_samples, samples) ⇒ Object

check if sample exists



133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/bio/pipengine.rb', line 133

# Verifies that every requested sample name exists in the samples file.
#
# Known names are the keys of +samples["samples"]+; when a value is itself a
# Hash (a group), the names inside that Hash are used instead of the group
# key. The first unknown name is logged and the process ends via +exit+.
#
# The list of known names does not depend on the sample being checked, so it
# is built once up front rather than once per requested sample.
#
# @param passed_samples [Array<String>] sample names requested by the user
# @param samples [Hash] parsed samples file with a "samples" entry
# @return [Array<String>] +passed_samples+ when every name is known
def self.check_samples(passed_samples,samples)
  known_names = samples["samples"].flat_map do |name, entry|
    entry.kind_of?(Hash) ? entry.keys : [name]
  end

  passed_samples.each do |sample|
    next if known_names.include? sample

    @@logger_error.error "Sample \"#{sample}\" does not exist in sample file!".red
    exit
  end
end

.check_steps(passed_steps, pipeline) ⇒ Object

check if step exists



151
152
153
154
155
156
157
158
# File 'lib/bio/pipengine.rb', line 151

# Verifies that every requested step is a key of +pipeline["steps"]+.
#
# The first step that is not found is logged through the error logger and the
# process ends via +exit+.
#
# @param passed_steps [Array<String>] step names requested by the user
# @param pipeline [Hash] parsed pipeline definition with a "steps" entry
# @return [Array<String>] +passed_steps+ when every step is defined
def self.check_steps(passed_steps,pipeline)
  passed_steps.each do |step|
    next if pipeline["steps"].keys.include?(step)

    @@logger_error.error "Step \"#{step}\" does not exist in pipeline file!".red
    exit
  end
end

.create_job(samples_file, pipeline, samples_list, options, sample) ⇒ Object



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/bio/pipengine.rb', line 90

# Builds a Job for one sample (or for a Hash of samples), fills it with the
# requested steps, writes its script and, unless this is a dry run, submits it.
#
# Job name: +options[:name]+ when given; otherwise the step names joined by
# "-", prefixed with "<sample name>-" when +sample+ is not a Hash.
#
# Resources are added in this order, later ones being added after earlier
# ones: pipeline resources, samples-file resources, command-line tags.
#
# @param samples_file [Hash] parsed samples file; its "resources" entry is used
# @param pipeline [Hash] parsed pipeline; its "resources" entry is used
# @param samples_list [Hash] its keys are the default for +job.multi_samples+
# @param options [Hash] reads +:name+, +:steps+, +:tmp+, +:output_dir+, +:tag+,
#   +:log+, +:log_adapter+, +:multi+ and +:dry+
# @param sample [Object, Hash] a single sample responding to +name+, or a
#   Hash of samples for a multi-samples job
# @return [Object] the result of +job.to_script+ on a dry run, otherwise the
#   result of +job.submit+
def self.create_job(samples_file,pipeline,samples_list,options,sample)
  # a Hash means "multi-samples job": no single sample name to prefix with
  multi_job = sample.kind_of?(Hash)
  sample_prefix = multi_job ? nil : sample.name+"-"

  # setting the job name
  job_name =
    if options[:name]
      options[:name]
    else
      step_names = options[:steps]
      steps_label = step_names.size > 1 ? step_names.join("-") : step_names.first
      "#{sample_prefix}#{steps_label}"
    end

  # creating the Job object
  job = Bio::Pipengine::Job.new(job_name)
  job.local = options[:tmp]
  job.custom_output = options[:output_dir]
  job.custom_name = options[:name] || nil

  # pipeline and samples resources first, then the command-line tags,
  # which can overwrite resources defined in the pipeline and samples files
  job.add_resources pipeline["resources"]
  job.add_resources samples_file["resources"]
  job.add_resources parse_tag_option(options[:tag])

  # setting the logging system
  job.log = options[:log]
  job.log_adapter = options[:log_adapter]

  # sample groups come from the cli option when present, otherwise all available samples
  job.multi_samples = options[:multi] || samples_list.keys
  job.samples_obj = sample if multi_job

  # TODO WARNING this can add multiple times the same step if the are multi dependencies
  options[:steps].each { |step_name| add_job(job, pipeline, step_name, sample) }

  # the script is always written; only a non-dry run submits it
  dry_run = options[:dry]
  script = job.to_script(options)
  dry_run ? script : job.submit
end

.create_samples(dir) ⇒ Object

create the samples.yml file



174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/bio/pipengine.rb', line 174

# Writes a "samples.yml" file in the current working directory.
#
# The file starts with a "resources" section whose "output" is the output of
# +pwd -L+, followed by a "samples" section with one line per sample name:
# the name and the comma-separated list of paths found for it.
#
# For each given directory:
# * if it contains entries whose name matches /Project_/, every entry inside
#   those project folders is recorded as a sample;
# * otherwise every sub-directory of the given directory is recorded.
# The sample name is the last path component of the entry.
#
# Uses +Dir.exist?+: the +Dir.exists?+ alias was removed in Ruby 3.2 and made
# the no-project branch raise NoMethodError there.
#
# @param dir [Array<String>] directories to scan
# @return [Hash] sample name => list of paths (the value of the +File.open+ block)
def self.create_samples(dir)
  File.open("samples.yml","w") do |file|
    file.write "resources:\n\soutput: #{`pwd -L`}\n\nsamples:\n"
    samples = Hash.new {|hash,key| hash[key] = []}
    dir.each do |path|
      entries = Dir.glob(path+"/*").sort
      projects = entries.select {|entry| File.basename(entry) =~ /Project_/}
      if projects.empty?
        # no project folders: only directories count as samples
        entries.each {|entry| samples[File.basename(entry)] << entry if Dir.exist? entry}
      else
        projects.each do |project_folder|
          Dir.glob(project_folder+"/*").sort.each {|entry| samples[File.basename(entry)] << entry}
        end
      end
    end
    samples.each_key do |sample|
      file.write "\s"+sample+":\s"+samples[sample].join(",")+"\n"
    end
  end
end

.include(name, filename) ⇒ Object



4
5
6
# File 'lib/bio/pipengine.rb', line 4

# Returns the content of a file with every line indented by two spaces.
#
# Lines are read with their own line terminator and then joined with an
# additional "\n", so consecutive lines of the file end up separated by an
# empty line in the result.
#
# @param name [Object] not used by this method
# @param filename [String] path of the file to read
# @return [String] the indented content ("" for an empty file)
def self.include(name, filename)
  indented_lines = File.readlines(filename).map { |line| "  #{line}" }
  indented_lines.join("\n")
end

.inspect_steps(pipeline_file) ⇒ Object

load the pipeline file and show a list of available steps



161
162
163
164
165
166
167
168
169
170
171
# File 'lib/bio/pipengine.rb', line 161

# Loads a pipeline YAML file and prints its name and the list of its steps.
#
# Output goes to standard output: the value of the "pipeline" key, then one
# line per key of "steps" with that step's "desc" value. The colour methods
# (+blue+, +green+, +light_blue+) are called on the printed strings.
#
# @param pipeline_file [String] path of the pipeline YAML file
# @return [nil]
def self.inspect_steps(pipeline_file)
  pipeline = YAML.load_file pipeline_file

  # header: pipeline name
  print "\nPipeline: ".blue
  print "#{pipeline["pipeline"]}\n\n".green

  # one "name: description" line per step
  puts "List of available steps:".light_blue
  pipeline["steps"].each do |step_name, step_definition|
    print "\s\s#{step_name}:\s\s".blue
    print "#{step_definition["desc"]}\n".green
  end
  puts "\n"
end

.load_samples_file(file) ⇒ Object

load the samples YAML file and convert sample values and resources to strings



247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/bio/pipengine.rb', line 247

# Loads the samples YAML file and normalises its content to strings.
#
# * "samples": a Hash value (a group) gets both its keys and its values
#   converted with +to_s+; any other value is converted with +to_s+. The
#   top-level keys of "samples" are left as they were parsed.
# * "resources": keys and values are converted with +to_s+.
#
# @param file [String] path of the samples YAML file
# @return [Hash] the parsed and normalised samples file
def self.load_samples_file(file)
  # turns every key and value of a Hash into a String
  stringify = lambda do |hash|
    hash.each_with_object({}) { |(key, value), converted| converted[key.to_s] = value.to_s }
  end

  samples_file = YAML.load_file file
  samples = samples_file["samples"]
  samples.each do |name, entry|
    # only existing keys are reassigned, so the Hash can be updated while iterating
    samples[name] = entry.kind_of?(Hash) ? stringify.call(entry) : entry.to_s
  end
  # make sure everything in Resources is converted to string as well
  samples_file["resources"] = stringify.call(samples_file["resources"])
  samples_file
end

.parse_tag_option(option_tag) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/bio/pipengine.rb', line 53

# Turns the command-line tag option ("tag=value" strings) into a Hash.
#
# Each tag is split on its FIRST "=" only, so a value may itself contain "="
# (e.g. "opts=--threads=4"). A tag with no "=" or with an empty name is
# reported through the error logger and the process ends via +exit+.
# Previously the tag was split on every "=" and only an empty split result
# was rejected, so tags such as "a", "a=" or "a=b=c" raised ArgumentError
# from Hash[] instead of producing the abort message.
#
# When the same name is given more than once, the last value wins.
#
# @param option_tag [Array<String>, nil, false] tags as given on the command line
# @return [Hash{String => String}] tag name => value; empty when +option_tag+
#   is nil or false
def self.parse_tag_option(option_tag)
  return {} unless option_tag

  option_tag.each_with_object({}) do |tag, tags|
    # limit 2: everything after the first "=" belongs to the value
    key, value = tag.split("=", 2)
    if key.nil? || key.empty? || value.nil?
      @@logger_error.error "\nAbort! Unrecognized values for tag option, please provide the tags as follows: tag1=value1 tag2=value2".red
      exit
    end
    tags[key] = value
  end
end

.run(options) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/bio/pipengine.rb', line 9

# Entry point: reads the pipeline and samples files, selects the samples to
# work on and creates the jobs.
#
# Flow:
# 1. the pipeline file is rendered through ERB with this method's binding
#    (so the template can see +options+) and then parsed as YAML;
# 2. requested steps, and requested samples if any, are checked;
# 3. with +options[:group]+ the samples of that group are used, +options[:multi]+
#    is set to their names and "/<group>" is appended to the "output" resource;
#    without it, groups are flattened into a single list of samples;
#    in both cases +options[:samples]+, when given, filters the list;
# 4. the output directory is created unless +options[:dry]+ is set;
# 5. either one multi-samples job is created by +check_and_run_multi+, or one
#    job per sample is created.
#
# @param options [Hash] reads +:pipeline+, +:samples_file+, +:steps+,
#   +:samples+, +:group+ and +:dry+; +:multi+ is written when a group is given
# @return [Hash, nil] the samples list when one job per sample was created,
#   nil when a multi-samples job was created
def self.run(options)

  # reading the yaml files
  pipeline = YAML.load ERB.new(File.read(options[:pipeline])).result(binding)
  samples_file = load_samples_file options[:samples_file]

  # pre-running checks
  check_steps(options[:steps],pipeline)
  check_samples(options[:samples],samples_file) if options[:samples]

  # list of samples the jobs will work on
  samples_list = nil
  if options[:group]
    # a group is specified: restrict the list to that group
    samples_list = samples_file["samples"][options[:group]]
    if options[:samples]
      samples_list = samples_list.select { |name, _value| options[:samples].include? name }
    end
    options[:multi] = samples_list.keys
    samples_file["resources"]["output"] << "/#{options[:group]}"
  else
    # no group: flatten the groups to get a single list of all samples
    full_list_samples = samples_file["samples"].each_with_object({}) do |(name, entry), flattened|
      if entry.kind_of? Hash
        flattened.merge! entry
      else
        flattened[name] = entry
      end
    end
    samples_list = full_list_samples
    if options[:samples]
      samples_list = full_list_samples.select { |name, _value| options[:samples].include? name }
    end
  end

  ########### START ###########

  # create output directory (jobs scripts will be saved there)
  FileUtils.mkdir_p samples_file["resources"]["output"] unless options[:dry] #&& options[:spooler]!="pbs"

  # check if the requested steps are multi-samples
  run_multi = check_and_run_multi(samples_file,pipeline,samples_list,options)
  return if run_multi

  # there are no multi-samples steps, so iterate on samples and create one job per sample
  samples_list.each_key do |sample_name|
    sample = Bio::Pipengine::Sample.new(sample_name.to_s,samples_list[sample_name],options[:group])
    create_job(samples_file,pipeline,samples_list,options,sample)
  end
end