Class: ROCker

Inherits:
Object
  • Object
show all
Defined in:
lib/rocker/step/plot.rb,
lib/rocker.rb,
lib/rocker/step/build.rb,
lib/rocker/step/filter.rb,
lib/rocker/step/search.rb,
lib/rocker/step/compile.rb

Overview

Author:

  • Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>

  • Luis (Coto) Orellana

Constant Summary collapse

@@VERSION =
[ Class ]
"1.1.12"
@@CITATION =
[
"Orellana, Rodriguez-R, & Konstantinidis. Under review.",
"Detecting and quantifying functional genes in short-read",
"metagenomic datasets: method development and application",
"to the nitrogen cycle genes."]
@@DATE =
"2016-05-17"
@@DEFAULTS =
{
   # General
   q: false, r: "R", nucl: false, debug: false, thr: 2, search: :blast,
   # External software
   searchbins: "",
   searchcmd: {
	 blast: '%1$s%2$s -query "%3$s" -db "%4$s" -out "%5$s" ' +
  '-num_threads %6$d -outfmt 6 -max_target_seqs 1',
	 diamond: '%1$sdiamond %2$s -q "%3$s" -d "%4$s" -a "%5$s.daa" -p %6$d' +
  ' -k 1 --min-score 20 --sensitive && %1$sdiamond view -a "%5$s"' +
  ' -o "%5$s"'},
   makedbcmd: {
	 blast: '%1$smakeblastdb -dbtype %2$s -in "%3$s" -out "%4$s"',
	 diamond: '%1$sdiamond makedb --in "%3$s" -d "%4$s"'}
}
@@EBIREST =
[ Class ]
"http://www.ebi.ac.uk/Tools"
@@HAS_BUILD_GEMS =
nil

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts) ⇒ ROCker

Returns a new instance of ROCker.



42
43
44
45
46
# File 'lib/rocker.rb', line 42

def initialize(opts)
   @o = ROCker.defaults
   opts.each{ |k,v| @o[k] = v }
   RInterface.R_BIN = opts[:r] unless opts[:r].nil?
end

Instance Attribute Details

#oObject (readonly)

[ Instance ]


41
42
43
# File 'lib/rocker.rb', line 41

def o
  @o
end

Class Method Details

.CITATION(j = " ") ⇒ Object



38
# File 'lib/rocker.rb', line 38

def self.CITATION(j=" ") @@CITATION.join(j) ; end

.DATEObject



37
# File 'lib/rocker.rb', line 37

def self.DATE; @@DATE ; end

.default(k) ⇒ Object



35
# File 'lib/rocker.rb', line 35

def self.default(k) @@DEFAULTS[k] ; end

.defaultsObject



34
# File 'lib/rocker.rb', line 34

def self.defaults() @@DEFAULTS ; end

.ebirestObject



27
# File 'lib/rocker/step/build.rb', line 27

def self.ebirest() @@EBIREST ; end

.has_build_gems?Boolean

Returns:

  • (Boolean)


28
29
30
31
32
33
34
35
36
37
38
# File 'lib/rocker/step/build.rb', line 28

def self.has_build_gems?
   return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
   @@HAS_BUILD_GEMS = TRUE
   begin
	 require "rubygems"
	 require "restclient"
   rescue LoadError
	 @@HAS_BUILD_GEMS = FALSE
   end
   @@HAS_BUILD_GEMS
end

.VERSIONObject



36
# File 'lib/rocker.rb', line 36

def self.VERSION; @@VERSION ; end

Instance Method Details

#bash(cmd, err_msg = nil) ⇒ Object



59
60
61
62
63
64
# File 'lib/rocker.rb', line 59

def bash(cmd, err_msg=nil)
   o = `#{cmd} 2>&1 && echo '{'`
   raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless
	 o[-2]=="{"
   true
end

#blast2table(blast_f, table_f, aln, minscore) ⇒ Object

[ Utilities ]


49
50
51
52
53
54
55
56
57
58
# File 'lib/rocker.rb', line 49

def blast2table(blast_f, table_f, aln, minscore)
   ifh = File.open(blast_f, "r")
   ofh = File.open(table_f, "w")
   while ln = ifh.gets
	 bh = BlastHit.new(ln, aln)
	 ofh.print bh.to_s if bh.bits >= minscore
   end
   ifh.close
   ofh.close
end

#build!Object

[ Build ]


108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
# File 'lib/rocker/step/build.rb', line 108

def build!
   # Check requirements
   puts "Testing environment." unless @o[:q]
   {  searchcmd: :search, makedbcmd: :search,
	 alignercmd: :aligner, alignerbin: :aligner,
	 simulatorcmd: :simulator, simulatorbin: :simulator
   }.each_pair { |k,v| @o[k] = @o[k][@o[v]] if @o[k].is_a? Hash }
   @o[:nosearch]=true if @o[:nosimulate]
   raise "Unsatisfied requirements, please see the help message (-h)." unless
	 ROCker.has_build_gems?
   protein_set = {}
   protein_set[:+] = ProteinSet.new(self,@o[:positive],@o[:posfile],@o[:aln])
   protein_set[:-] = ProteinSet.new(self,@o[:negative],@o[:negfile])
   raise "-p, -P, or -a are mandatory." if protein_set[:+].empty?
   raise "-o/--baseout is mandatory." if @o[:baseout].nil?
   if protein_set[:+].size==1 and not @o[:noaln]
	 warn "\nWARNING: Positive set contains only one sequence, turning " +
  "off alignment.\n\n"
	 @o[:noaln] = true
   end
   unless @o[:nosimulate]
	 self.bash("#{@o[:simulatorbin]} --version",
  "--simulator-bin must be executable. Is Grinder installed?") if
  @o[:simulator]==:grinder
   end
   unless @o[:noaln]
	 self.bash("#{@o[:alignerbin]} -version",
  "--aligner-bin must be executable. Is Muscle installed?") if
  @o[:aligner]==:muscle
	 self.bash("#{@o[:alignerbin]} --version",
  "--aligner-bin must be executable. Is ClustalOmega installed?") if
  @o[:aligner]==:clustalo
   end
   unless @o[:nosearch]
	 self.bash("#{@o[:searchbins]}makeblastdb -version",
  "--search-bins must contain executables. Is BLAST+ installed?") if
  @o[:search]==:blast
	 self.bash("#{@o[:searchbins]}diamond --help",
  "--search-bins must contain executables. Is DIAMOND installed?") if
  @o[:search]==:diamond
   end

   # Download genes
   puts "Downloading gene data." unless @o[:q]
   ref_file = @o[:baseout] + ".ref.fasta"
   if not protein_set[:+].aln.nil?
	 puts "  * reusing aligned sequences as positive set." unless @o[:q]
	 protein_set[:+].get_from_aln(ref_file, aln)
	 @o[:noaln] = true
   elsif @o[:reuse] and File.size? ref_file
	 puts "  * reusing positive set: #{ref_file}." unless @o[:q]
   else
	 puts "  * downloading #{protein_set[:+].size} sequence(s) in " +
  "positive set." unless @o[:q]
	 $stderr.puts "   # #{protein_set[:+].ids}" if @o[:debug]
	 protein_set[:+].download(ref_file)
   end
   [:+, :-].each do |set|
      unless protein_set[set].empty?
  puts "  * linking genomes from #{protein_set[set].size} " +
     "[#{set.to_s}] sequence(s)." unless @o[:q]
  $stderr.puts "   # #{protein_set[set].ids}" if @o[:debug]
  protein_set[set].get_genomes!
	 end
   end
   raise "No genomes associated with the positive set." if
	 protein_set[:+].genomes.empty?
   genome_set = {:+ => GenomeSet.new(self, protein_set[:+].genomes),
	 :- => GenomeSet.new(self, protein_set[:-].genomes)}
   
   # Locate genes
   puts "Analyzing genome data." unless @o[:q]
   coords_file = @o[:baseout] + ".src.coords"
   if @o[:reuse] and File.size? coords_file
	 puts "  * reusing coordinates: #{coords_file}." unless @o[:q]
	 c = JSON.parse File.read(coords_file), {symbolize_names:true}
	 positive_coords = c[:positive_coords]
	 negative_coords = c[:negative_coords]
	 genome_set[:+].taxa = c[:taxa_pos]
	 genome_set[:-].taxa = c[:taxa_neg]
   else
	 all_coords = {}
	 [:+, :-].each do |set_type|
  all_coords[set_type] = {}
  next if genome_set[set_type].empty?
  thrs = [@o[:thr], genome_set[set_type].size].min
  puts "  * downloading and parsing #{genome_set[set_type].size} " +
     "GFF3 document(s) in #{thrs} threads." unless @o[:q]
  $stderr.puts "   # Looking for translations: " +
     "#{protein_set[set_type].tranids_dump}" if @o[:debug]
  $stderr.puts "   # Looking into: #{genome_set[set_type].ids}" if
     @o[:debug]
  # Launch threads
  thr_obj = []
  (0 .. (thrs-1)).each do |thr_i|
     ids_to_parse = []
     (0 .. (genome_set[set_type].size-1)).each do |i|
 ids_to_parse << protein_set[set_type].genomes[i] if
    (i % thrs) == thr_i
     end
     json_file = @o[:baseout] + ".src.coords." + thr_i.to_s + ".tmp"
     thr_obj << json_file
     fork do
 get_coords_from_gff3(ids_to_parse, protein_set[set_type],
    thr_i, json_file)
     end
  end
  # Combine results
  Process.waitall
  thr_obj.each do |t|
     raise "Thread failed without error trace: #{t}" unless
 File.exist? t
     o = JSON.parse(File.read(t), {symbolize_names:true})
     o[:coords].each_pair do |k,v|
 all_coords[set_type][ k ] ||= []
 all_coords[set_type][ k ] += v
     end
     File.unlink t
  end
	 end # [:+, :-].each
	 positive_coords = all_coords[:+]
	 negative_coords = all_coords[:-]
	 # Select one genome per taxon
	 unless @o[:pertaxon].nil?
  puts "  Selecting genomes by #{@o[:pertaxon]}." unless @o[:q]
  [:+,:-].each{ |set| genome_set[set].choose_genomes! @o[:pertaxon] }
	 end
	 # Save coordinates and taxa
	 ofh = File.open(coords_file, "w")
	 ofh.print JSON.pretty_generate({
  positive_coords:positive_coords,
  negative_coords:negative_coords,
  taxa_pos:genome_set[:+].taxa,
  taxa_neg:genome_set[:-].taxa})
	 ofh.close
   end # if @o[:reuse] and File.size? coords_file ... else
   unless @o[:pertaxon].nil?
	 puts "  Using " +
  [:+,:-].map{ |set| genome_set[set].size }.reduce(:+).to_s +
  " genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
   end
   found = protein_set[:+].in_coords(positive_coords)
   raise "Cannot find the genomic location of any provided sequence." if
	 found.nil?
   missing = protein_set[:+].ids - found
   warn "\nWARNING: Cannot find genomic location of #{missing.size} " +
	 "sequence(s) #{missing.join(",")}.\n\n" unless missing.empty?
   
   # Download genomes
   genome_set[:all] = GenomeSet.new(self,
	 genome_set[ :+ ].ids + genome_set[ :- ].ids)
   genomes_file = @o[:baseout] + ".src.fasta"
   if @o[:reuse] and File.size? genomes_file
	 puts "  * reusing existing file: #{genomes_file}." unless @o[:q]
   else
	 puts "  * downloading " + genome_set[:all].size.to_s +
  " genome(s) in FastA." unless @o[:q]
	 $stderr.puts "   # #{genome_set[:all].ids}" if @o[:debug]
	 genome_set[:all].download genomes_file
   end
   
   # Generate metagenome
   unless @o[:nosimulate]
	 puts "Generating in silico metagenome" unless @o[:q]
	 if @o[:reuse] and File.size? @o[:baseout] + ".mg.fasta"
  puts "  * reusing existing file: #{@o[:baseout]}.mg.fasta." unless
     @o[:q]
	 else
  all_src = File.readlines("#{@o[:baseout]}.src.fasta"
     ).select{ |l| l =~ /^>/ }.size
  thrs = [@o[:thr], all_src].min
  thr_obj = []
  seqs_per_thr = (all_src.to_f/thrs).ceil
  thrs = (all_src.to_f/seqs_per_thr).ceil
  puts "  * simulating metagenomes and tagging positive reads in " +
     thrs.to_s + " threads." unless @o[:q]
  $stderr.puts "   # #{positive_coords}" if @o[:debug]
  (0 .. (thrs-1)).each do |thr_i|
     output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
     thr_obj << output
     fork do
 seqs_a = thr_i*seqs_per_thr + 1
 seqs_b = [seqs_a + seqs_per_thr - 1, all_src].min
 # Create sub-fasta
 ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}","w")
 ifh = File.open("#{@o[:baseout]}.src.fasta","r")
 seq_i = 0
 while l = ifh.gets
    seq_i+=1 if l =~ /^>/
     break if seq_i > seqs_b
    ofh.print l if seq_i >= seqs_a
 end
 ifh.close
 ofh.close

 # Run simulator (except if the temporal file is already
 # there and can be reused)
               ofile = "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}"
 bash sprintf(@o[:simulatorcmd], @o[:simulatorbin],
    "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}",
    @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen],
    File.basename(ofile), File.dirname(ofile)) unless
@o[:reuse] and
File.size? @o[:baseout] +
".mg.tmp.#{thr_i.to_s}-reads.fa"

 # Tag positive and negative reads
 puts "  * tagging reads [thread #{thr_i}]." unless
    @o[:q]
 ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i}-reads.fa",
    "r")
 ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i}", "w")
 while l = ifh.gets
    if l =~ /^>/
rd = %r{
   ^>(?<id>\d+)\s
   reference=[A-Za-z]+\|
   (?<genome_id>[A-Za-z0-9_]+)\|.*\s
   position=(?<comp>complement\()?(?<from>\d+)\.\.
   (?<to>\d+)\)?\s
}x.match(l)
raise "Cannot parse simulated read's defline, are " +
   "you using Grinder?: #{l}" if rd.nil?
positive = false
positive_coords[rd[:genome_id].to_sym] ||= []
positive_coords[rd[:genome_id].to_sym].each do |gn|
   left  = rd[:to].to_i - gn[:from]
   right = gn[:to] - rd[:from].to_i
   if (left*right >= 0) and
	 ([left, right].min >= @o[:minovl])
      positive = true
      break
   end
end
negative = false
negative_coords[rd[:genome_id].to_sym] ||= []
negative_coords[rd[:genome_id].to_sym].each do |gn|
   left  = rd[:to].to_i - gn[:from]
   right = gn[:to] - rd[:from].to_i
   if (left*right >= 0) and
	 ([left, right].min >= @o[:minovl])
      negative = true
      break
   end
end
l = ">#{thr_i.to_s}_#{rd[:id]}" +
   "#{positive ? "@%" : (negative ? "@$" : "")} " +
   "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}" +
   "#{(rd[:comp]=="complement(") ? "-" : "+"}\n"
    end
    ofh.print l
 end
 ofh.close
 ifh.close
     end # fork
  end # (1 .. thrs).each
  Process.waitall
  # Concatenate results
  ofh = File.open(@o[:baseout] + ".mg.fasta", "w")
  thr_obj.each do |t|
     raise "Thread failed without error trace: #{t}" unless
 File.exist? t
     ifh = File.open(t, "r")
     while l = ifh.gets
        ofh.print l
     end
     ifh.close
     File.unlink t
  end
  ofh.close
      end
   end # unless @o[:nosimulate]
   
   # Align references
   unless @o[:noaln]
	 puts "Aligning reference set." unless @o[:q]
	 if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.aln"
  puts "  * reusing existing file: #{@o[:baseout]}.ref.aln." unless
     @o[:q]
	 else
  bash(sprintf(@o[:alignercmd],
     @o[:alignerbin], "#{@o[:baseout]}.ref.fasta",
     "#{@o[:baseout]}.ref.aln", @o[:thr]))
  puts "  +--\n  | IMPORTANT NOTE: Manually checking the alignment " +
     "before\n  | the 'compile' step is *strongly* encouraged.\n  " +
     "+--\n" unless @o[:q]
	 end
   end
   
   # Run similarity search
   unless @o[:nosearch]
	 puts "Running similarity search." unless @o[:q]
	 if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.blast"
  puts "  * reusing existing file: #{@o[:baseout]}.ref.blast." unless
     @o[:q]
	 else
  puts "  * preparing database." unless @o[:q]
  bash(sprintf(@o[:makedbcmd],
     @o[:searchbins], "prot", "#{@o[:baseout]}.ref.fasta",
     "#{@o[:baseout]}.ref"))
  puts "  * running similarity search." unless @o[:q]
  bash(sprintf(@o[:searchcmd],
     @o[:searchbins], "blastx", "#{@o[:baseout]}.mg.fasta",
     "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr]))
	 end
   end
   
   # Clean
   unless @o[:noclean]
	 puts "Cleaning." unless @o[:q]
	 sff  = %w{.src.xml .src.fasta}
	 sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
	 sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
	 sff.each { |sf| File.unlink @o[:baseout] + sf if
  File.exist? @o[:baseout] + sf }
   end
end

#compile!Object

[ Compile ]


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/rocker/step/compile.rb', line 13

def compile!
   raise "-a/--alignment is mandatory." if @o[:aln].nil?
   raise "-a/--alignment must exist." unless File.exist? @o[:aln]
   if @o[:table].nil?
	 raise "-T/--table is mandatory unless -b is provided." if
  @o[:blast].nil? or not File.exist? @o[:blast]
	 @o[:table] = "#{@o[:blast]}.table"
   else
	 @o[:reuse] = true
   end
   raise "-b/--blast is mandatory unless -t exists." if
	 @o[:blast].nil? and not File.exist? @o[:table]
   raise "-k/--rocker is mandatory." if @o[:rocker].nil?

   puts "Testing environment." unless @o[:q]
   bash("echo '' | #{@o[:r]} --vanilla",
	 "-r/--path-to-r must be executable. Is R installed?")
   bash("echo \"library('pROC')\" | #{@o[:r]} --vanilla",
	 "Please install the 'pROC' library for R first.")

   puts "Reading files." unless @o[:q]
   puts "  * loading alignment: #{@o[:aln]}." unless @o[:q]
   aln = Alignment.new
   aln.read_fasta @o[:aln]
   
   if @o[:reuse] and File.exist? @o[:table]
	 puts "  * reusing existing file: #{@o[:table]}." unless @o[:q]
   else
	 puts "  * generating table: #{@o[:table]}." unless @o[:q]
	 blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
   end

   puts "Analyzing data." unless @o[:q]
   puts "  * computing windows." unless @o[:q]
   data = ROCData.new(@o[:table], aln, @o[:win])
   data.nucl = @o[:nucl]
   if @o[:refine]
	 puts "  * refining windows." unless @o[:q]
	 warn "Insufficient hits to refine results." unless
  data.refine! @o[:table]
   end
   puts "  * saving ROCker file: #{@o[:rocker]}." unless @o[:q]
   data.save @o[:rocker]
end

#ebiFetch(db, ids, format, outfile = nil) ⇒ Object



54
55
56
57
58
# File 'lib/rocker/step/build.rb', line 54

def ebiFetch(db, ids, format, outfile=nil)
   url = "#{ROCker.ebirest}/dbfetch/dbfetch/" +
	 "#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
   self.restcall url, outfile
end

#filter!(data = nil) ⇒ Object

[ Filter ]


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/rocker/step/filter.rb', line 13

def filter!(data=nil)
   raise "-k/--rocker is mandatory." if @o[:rocker].nil?
   raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
   raise "-o/--out-blast is mandatory." if @o[:oblast].nil?
   
   # Read ROCker file
   if data.nil?
	 puts "Loading ROCker file: #{@o[:rocker]}." unless @o[:q]
	 data = ROCData.new @o[:rocker]
   end

   # Filter similarity search
   puts "Filtering similarity search: #{@o[:qblast]}." unless @o[:q]
   ih = File.open(@o[:qblast], "r")
   oh = File.open(@o[:oblast], "w")
   while ln = ih.gets
	 bh = BlastHit.new(ln, data.aln)
	 oh.print ln if not(bh.sfrom.nil?) and bh.bits >= data.win_at_col(bh.midpoint).thr
   end
   ih.close
   oh.close
end

#get_coords_from_gff3(genome_ids, pset, thread_id, json_file) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/rocker/step/build.rb', line 59

def get_coords_from_gff3(genome_ids, pset, thread_id, json_file)
   coords = {}
   i = 0
   genome_ids.each do |genome_id|
	 print "  * scanning #{(i+=1).ordinalize} genome out of " +
  "#{genome_ids.size} in first thread.  \r" if
  thread_id==0 and not @o[:q]
	 genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
	 if @o[:reuse] and File.size? genome_file
  ifh = File.open(genome_file, "r")
  doc = ifh.readlines.grep(/^[^#]/)
  ifh.close
	 else
  genome_file=nil unless @o[:noclean]
  doc = ebiFetch(:embl, [genome_id], :gff3,
     genome_file).split("\n").grep(/^[^#]/)
  if doc.first =~ /ERROR 12 No entries found/
     doc = ebiFetch(:emblconexp, [genome_id], :gff3,
 genome_file).split("\n").grep(/^[^#]/)
  end
	 end
	 doc.each do |ln|
  next if ln =~ /^#/
  r = ln.chomp.split /\t/
  next if r.size < 9
  prots = r[8].split(/;/).grep(
     /^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
  p = prots.compact.select{ |id| pset.ids.include? id }.first
  trans = r[8].split(/;/).grep(
     /^protein_id=/){ |pid| pid.split(/=/)[1] }
  t = trans.select{ |id| pset.tranids.include? id }.first
  next if p.nil? and t.nil?
  coords[ r[0].to_sym ] ||= []
  coords[ r[0].to_sym ] << {
     prot_id:	p,
     tran_id:	t,
     from:	r[3].to_i,
     to:	r[4].to_i,
     strand:	r[6]
  }
	 end
   end
   print "\n" if thread_id==0 and not @o[:q]
   ofh = File.open(json_file, "w")
   ofh.print({coords:coords}.to_json)
   ofh.close
end

#plot!Object

[ Search ]


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/rocker/step/plot.rb', line 16

def plot!
   raise "-k/--rocker is mandatory." if o[:rocker].nil?
   if @o[:table].nil?
	 raise "-t/--table is mandatory unless -b is provided." if
  @o[:blast].nil?
	 @o[:table] = "#{@o[:blast]}.table"
   end
   raise "-b/--blast is mandatory unless -t exists." if
	 @o[:blast].nil? and not File.exist? @o[:table]

   puts "Testing environment." unless @o[:q]
   bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be " +
	 "executable. Is R installed?"

   # Source files
   puts "Reading files." unless @o[:q]
   puts "  * loding ROCker file: #{@o[:rocker]}." unless @o[:q]
   data = ROCData.new @o[:rocker]
   if File.exist? @o[:table]
	 puts "  * reusing existing file: #{@o[:table]}." unless @o[:q]
   else
	 puts "  * generating table: #{@o[:table]}." unless @o[:q]
	 blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
   end

   # Matches (middle panel)
   puts "Plotting matches." unless @o[:q]
   extra = @o[:gformat]=="pdf" ? "" : ", units='in', res=300"
   @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
   data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, " +
	 "#{@o[:height]}#{extra});"
   data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
   some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
   data.rrun "par(mar=c(0,4,0,0.5)+.1);"
   data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), " +
	 "ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', " +
	 "ylab='Bit score', xaxs='i', xaxt='n');"
   data.rrun "noise <- runif(ncol(x),-.2,.2)"
   data.rrun "hit.col <- ifelse(x$V5==1, " +
	 "rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), " +
	 "rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"}))"
   data.rrun "hit.col[ x$V5==-1 ] <- " +
	 "rgb(0.722,0.722,0,#{@o[:transparency] ? ".2" : "1"})" if
	 @o[:tag_negatives]
   data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, lty=1, col=hit.col, " +
	 "length=0);"
   data.rrun "points(x$V6, x$V4+noise, col=hit.col, pch=19, cex=1/4);"
   
   # Windows (middle panel)
   puts "Plotting windows." unless @o[:q]
   if some_thr
	 data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
	 data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], " +
  "y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
   end
   data.rrun "legend('bottomright', legend=c('Match span'," +
	 "'Match mid-point','Reference (+)'," +
	 "#{"'Reference (-)'," if @o[:tag_negatives]}'Non-reference'), " +
	 "lwd=c(1,NA,1,1,1), pch=c(NA,19,19,19,19), ncol=5, bty='n', " +
	 "col=c('black','black','darkblue'," +
	 "#{"rgb(.722,.722,0)," if @o[:tag_negatives]}'darkred'))"

   # Alignment (top panel)
   puts "Plotting alignment." unless @o[:q]
   data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
   data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}), " +
	 "ylim=c(1,#{data.aln.seqs.size}), xlab='', ylab='Alignment', " +
	 "xaxs='i', xaxt='n', yaxs='i', yaxt='n', bty='n');"
   i = 0
   data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
   data.aln.seqs.values.each do |s|
      color = (s.aln.split(//).map do |c|
  c=="-" ? "'grey80'" :
     (@o[:sbj].include?(s.id) ? "'red'" :
     (@o[:color] ? "clr[#{c.ord-64}]" :
     "'black'"))
	 end.join(","))
	 data.rrun "rect((1:#{data.aln.cols-1})-0.5, " +
  "rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, " +
  "rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
	 i += 1
   end

   # Statistics (bottom panel)
   puts "Plotting statistics." unless @o[:q]
   data.rrun "par(mar=c(5,4,0,0.5)+.1);"
   data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}), " +
	 "ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") :
  @o[:ylim]}), xlab='Alignment position (amino acids)', " +
	 "ylab='Precision',xaxs='i');"
   if some_thr
	 sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
	 sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
	 ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
	 unless @o[:q]
  puts "  * sensitivity: #{sn}%"
  puts "  * specificity: #{sp}%"
  puts "  * accuracy: #{ac}%"
	 end
	 data.rrun "pos <- (w$V1+w$V2)/2"
	 if @o[:impact]
  data.rrun "lines(pos[!is.na(w$specificity)], " +
     "(w$specificity[!is.na(w$specificity)]-#{sp})*" +
 "w$tp[!is.na(w$specificity)]/sum(w$tp), " +
     "col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
  data.rrun "lines(pos[!is.na(w$sensitivity)], " +
     "(w$sensitivity[!is.na(w$sensitivity)]-#{sn})*" +
 "w$tn[!is.na(w$sensitivity)]/sum(w$tn), " +
     "col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
  data.rrun "lines(pos[!is.na(w$accuracy)], " +
     "(w$accuracy[!is.na(w$accuracy)]-#{ac})*" +
 "(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), " +
     "col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
	 else
  data.rrun "lines(pos[!is.na(w$specificity)], " +
     "w$specificity[!is.na(w$specificity)], col='darkred', " +
     "lwd=2, t='o', cex=1/3, pch=19);"
  data.rrun "lines(pos[!is.na(w$sensitivity)], " +
     "w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', " +
     "lwd=2, t='o', cex=1/3, pch=19);"
  data.rrun "lines(pos[!is.na(w$accuracy)], " +
     "w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, " +
     "t='o', cex=1/3, pch=19);"
	 end
   end
   data.rrun "legend('bottomright', " +
	 "legend=c('Specificity','Sensitivity','Accuracy'), lwd=2, " +
	 "col=c('darkred','darkgreen','darkblue'), ncol=3, bty='n')"
   data.rrun "dev.off();"
end

#restcall(url, outfile = nil) ⇒ Object

[ Utilities ]


41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/rocker/step/build.rb', line 41

def restcall(url, outfile=nil)
   $stderr.puts "   # Calling: #{url}" if @o[:debug]
   response = RestClient::Request.execute(method: :get, url: url,
	 timeout: 600)
   raise "Unable to reach EBI REST client, error code " +
	 response.code.to_s + "." unless response.code == 200
   unless outfile.nil?
	 ohf = File.open(outfile, "w")
	 ohf.print response.to_s
	 ohf.close
   end
   response.to_s
end

#search!Object

[ Search ]


15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/rocker/step/search.rb', line 15

def search!
   raise "-k/--rocker is mandatory." if @o[:rocker].nil?
   raise "-i/--query is mandatory." if @o[:query].nil?

   # Check requirements
   puts "Testing environment." unless @o[:q]
   @o[:searchcmd] = @o[:searchcmd][@o[:search]] if @o[:searchcmd].is_a? Hash
   @o[:makedbcmd] = @o[:makedbcmd][@o[:search]] if @o[:makedbcmd].is_a? Hash
   self.bash "#{@o[:searchbins]}makeblastdb -version", "--search-bins must contain executables. Is BLAST+ installed?" if @o[:search]==:blast
   self.bash "#{@o[:searchbins]}diamond --help", "--search-bins must contain executables. Is DIAMOND installed?" if @o[:search]==:diamond

   # Run similarity search
   Dir.mktmpdir do |dir|
	 @o[:qblast] ||= "#{dir}/blast"
	 puts "Loading ROCker file: #{@o[:rocker]}." unless @o[:q]
	 data = ROCData.new @o[:rocker]
	 puts "Running similarity search." unless @o[:q]
	 puts "  * preparing database." unless @o[:q]
	 ofh = File.new("#{dir}/ref.fasta", "w")
	 ofh.print data.aln.to_seq_s
	 ofh.close
	 bash sprintf(@o[:makedbcmd], @o[:searchbins], 'prot', "#{dir}/ref.fasta", "#{dir}/ref")
	 puts "  * running similarity search." unless @o[:q]
	 bash sprintf(@o[:searchcmd], @o[:searchbins], 'blastx', @o[:query], "#{dir}/ref", @o[:qblast], @o[:thr])
	 self.filter! data
   end
end