Class: CRB_Blast::CRB_Blast

Inherits:
Object
  • Object
show all
Includes:
Which
Defined in:
lib/crb-blast/crb-blast.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(query, target, output = nil) ⇒ CRB_Blast

Returns a new instance of CRB_Blast.

Raises:

  • (IOError)


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/crb-blast/crb-blast.rb', line 28

def initialize query, target, output=nil
  raise IOError.new("File not found #{query}") if !File.exist?(query)
  raise IOError.new("File not found #{target}") if !File.exist?(target)
  @query = File.expand_path(query)
  @target = File.expand_path(target)
  if output.nil?
    #@working_dir = File.expand_path(File.dirname(query)) # no trailing /
    @working_dir = "."
  else
    @working_dir = File.expand_path(output)
    mkcmd = "mkdir #{@working_dir}"
    if !Dir.exist?(@working_dir)
      puts mkcmd
      mkdir = Cmd.new(mkcmd)
      mkdir.run
      if !mkdir.status.success?
        raise RuntimeError.new("Unable to create output directory")
      end
    end
  end
  @makedb_path = which('makeblastdb')
  raise 'makeblastdb was not in the PATH' if @makedb_path.empty?
  @blastn_path = which('blastn')
  raise 'blastn was not in the PATH' if @blastn_path.empty?
  @tblastn_path = which('tblastn')
  raise 'tblastn was not in the PATH' if @tblastn_path.empty?
  @blastx_path = which('blastx')
  raise 'blastx was not in the PATH' if @blastx_path.empty?
  @blastp_path = which('blastp')
  raise 'blastp was not in the PATH' if @blastp_path.empty?
  @makedb_path = @makedb_path.first
  @blastn_path = @blastn_path.first
  @tblastn_path = @tblastn_path.first
  @blastx_path = @blastx_path.first
  @blastp_path = @blastp_path.first
end

Instance Attribute Details

#missedObject

Returns the value of attribute missed.



24
25
26
# File 'lib/crb-blast/crb-blast.rb', line 24

def missed
  @missed
end

#query_is_protObject

Returns the value of attribute query_is_prot.



25
26
27
# File 'lib/crb-blast/crb-blast.rb', line 25

def query_is_prot
  @query_is_prot
end

#query_nameObject

Returns the value of attribute query_name.



23
24
25
# File 'lib/crb-blast/crb-blast.rb', line 23

def query_name
  @query_name
end

#query_resultsObject

Returns the value of attribute query_results.



26
27
28
# File 'lib/crb-blast/crb-blast.rb', line 26

def query_results
  @query_results
end

#reciprocalsObject

Returns the value of attribute reciprocals.



23
24
25
# File 'lib/crb-blast/crb-blast.rb', line 23

def reciprocals
  @reciprocals
end

#target_is_protObject

Returns the value of attribute target_is_prot.



25
26
27
# File 'lib/crb-blast/crb-blast.rb', line 25

def target_is_prot
  @target_is_prot
end

#target_nameObject

Returns the value of attribute target_name.



23
24
25
# File 'lib/crb-blast/crb-blast.rb', line 23

def target_name
  @target_name
end

#target_resultsObject

Returns the value of attribute target_results.



26
27
28
# File 'lib/crb-blast/crb-blast.rb', line 26

def target_results
  @target_results
end

#working_dirObject

Returns the value of attribute working_dir.



26
27
28
# File 'lib/crb-blast/crb-blast.rb', line 26

def working_dir
  @working_dir
end

Instance Method Details

#clear_memoryObject



472
473
474
475
476
477
# File 'lib/crb-blast/crb-blast.rb', line 472

def clear_memory
  # running lots of jobs at the same time was keeping a lot of stuff in
  # memory that you might not want so this empties out those big hashes.
  @query_results = nil
  @target_results = nil
end

#find_reciprocalsObject

fills @reciprocals with strict reciprocal hits from the blast results



368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
# File 'lib/crb-blast/crb-blast.rb', line 368

def find_reciprocals
  if File.exist?("#{@working_dir}/reciprocal_hits.txt")
    # puts "reciprocal output already exists"
  else
    @reciprocals = Hash.new
    @missed = Hash.new
    @evalues = []
    @longest = 0
    hits = 0
    @query_results.each_pair do |query_id, list_of_hits|
      list_of_hits.each_with_index do |target_hit, query_index|
        if @target_results.has_key?(target_hit.target)
          list_of_hits_2 = @target_results[target_hit.target]
          list_of_hits_2.each_with_index do |query_hit2, target_index|
            if query_index == 0 && target_index == 0 &&
               query_id == query_hit2.target
              e = target_hit.evalue.to_f
              e = 1e-200 if e==0
              e = -Math.log10(e)
              if !@reciprocals.key?(query_id)
                @reciprocals[query_id] = []
              end
              @reciprocals[query_id] << target_hit
              hits += 1
              @longest = target_hit.alnlen  if target_hit.alnlen > @longest
              @evalues << {:e => e, :length => target_hit.alnlen}
            elsif query_id == query_hit2.target
              if !@missed.key?(query_id)
                @missed[query_id] = []
              end
              @missed[query_id] << target_hit
            end
          end
        end
      end
    end
  end
  return hits
end

#find_secondariesObject

Learns the evalue cutoff based on the length of the sequence Finds hits that have a lower evalue than this cutoff



410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
# File 'lib/crb-blast/crb-blast.rb', line 410

def find_secondaries

  if File.exist?("#{@working_dir}/reciprocal_hits.txt")
    # puts "reciprocal output already exists"
  else
    length_hash = Hash.new
    fitting = Hash.new
    @evalues.each do |h|
      length_hash[h[:length]] = [] if !length_hash.key?(h[:length])
      length_hash[h[:length]] << h
    end

    (10..@longest).each do |centre|
      e = 0
      count = 0
      s = centre*0.1
      s = s.to_i
      s = 5 if s < 5
      (-s..s).each do |side|
        if length_hash.has_key?(centre+side)
          length_hash[centre+side].each do |point|
            e += point[:e]
            count += 1
          end
        end
      end
      if count>0
        mean = e/count
        fitting[centre] = mean
      end
    end
    hits = 0
    @missed.each_pair do |id, list|
      list.each do |hit|
        l = hit.alnlen.to_i
        e = hit.evalue
        e = 1e-200 if e==0
        e = -Math.log10(e)
        if fitting.has_key?(l)
          if e >= fitting[l]
            if !@reciprocals.key?(id)
              @reciprocals[id] = []
              found=false
              @reciprocals[id].each do |existing_hit|
                if existing_hit.query == hit.query &&
                  existing_hit.target == hit.target
                 found=true
                end
              end
              if !found
                @reciprocals[id] << hit
                hits += 1
              end
            end
          end
        end
      end
    end
  end
  return hits
end

#has_reciprocal?(contig) ⇒ Boolean

Returns:

  • (Boolean)


509
510
511
512
# File 'lib/crb-blast/crb-blast.rb', line 509

def has_reciprocal? contig
  return true if @reciprocals.has_key?(contig)
  return false
end

#load_outputsObject

Load the two BLAST output files and store the hits in a hash



331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
# File 'lib/crb-blast/crb-blast.rb', line 331

def load_outputs
  if File.exist?("#{@working_dir}/reciprocal_hits.txt")
    # puts "reciprocal output already exists"
  else
    @query_results = Hash.new
    @target_results = Hash.new
    q_count=0
    t_count=0
    if !File.exists?("#{@output1}")
      raise RuntimeError.new("can't find #{@output1}")
    end
    if !File.exists?("#{@output2}")
      raise RuntimeError.new("can't find #{@output2}")
    end
    if File.exists?("#{@output1}") and File.exists?("#{@output2}")
      File.open("#{@output1}").each_line do |line|
        cols = line.chomp.split("\t")
        hit = Hit.new(cols)
        @query_results[hit.query] = [] if !@query_results.has_key?(hit.query)
        @query_results[hit.query] << hit
        q_count += 1
      end
      File.open("#{@output2}").each_line do |line|
        cols = line.chomp.split("\t")
        hit = Hit.new(cols)
        @target_results[hit.query] = [] if !@target_results.has_key?(hit.query)
        @target_results[hit.query] << hit
        t_count += 1
      end
    else
      raise "need to run blast first"
    end
  end
  [q_count, t_count]
end

#makedbObject

makes a blast database from the query and the target



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/crb-blast/crb-blast.rb', line 68

def makedb
  # only scan the first few hundred entries
  n = 100
  # check if the query is a nucl or prot seq
  query_file = Bio::FastaFormat.open(@query)
  count_p=0
  count=0
  query_file.take(n).each do |entry|
    count_p += 1 if entry.isProt?
    count += 1
  end
  if count_p > count*0.9
    @query_is_prot = true
  else
    @query_is_prot = false
  end

  # check if the target is a nucl or prot seq
  target_file = Bio::FastaFormat.open(@target)
  count_p=0
  count=0
  target_file.take(n).each do |entry|
    count_p += 1 if entry.isProt?
    count += 1
  end
  if count_p > count*0.9
    @target_is_prot = true
  else
    @target_is_prot = false
  end
  # construct the output database names
  @query_name = File.basename(@query).split('.')[0..-2].join('.')
  @target_name = File.basename(@target).split('.')[0..-2].join('.')

  # check if the databases already exist in @working_dir
  make_query_db_cmd = "#{@makedb_path} -in #{@query}"
  make_query_db_cmd << " -dbtype nucl " if !@query_is_prot
  make_query_db_cmd << " -dbtype prot " if @query_is_prot
  make_query_db_cmd << " -title #{query_name} "
  make_query_db_cmd << " -out #{@working_dir}/#{query_name}"
  db_query = "#{query_name}.nsq" if !@query_is_prot
  db_query = "#{query_name}.psq" if @query_is_prot
  if !File.exists?("#{@working_dir}/#{db_query}")
    make_db = Cmd.new(make_query_db_cmd)
    make_db.run
    if !make_db.status.success?
      raise RuntimeError.new("BLAST Error creating database")
    end
  end

  make_target_db_cmd = "#{@makedb_path} -in #{@target}"
  make_target_db_cmd << " -dbtype nucl " if !@target_is_prot
  make_target_db_cmd << " -dbtype prot " if @target_is_prot
  make_target_db_cmd << " -title #{target_name} "
  make_target_db_cmd << " -out #{@working_dir}/#{target_name}"

  db_target = "#{target_name}.nsq" if !@target_is_prot
  db_target = "#{target_name}.psq" if @target_is_prot
  if !File.exists?("#{@working_dir}/#{db_target}")
    make_db = Cmd.new(make_target_db_cmd)
    make_db.run
    if !make_db.status.success?
      raise RuntimeError.new("BLAST Error creating database")
    end
  end
  @databases = true
  [@query_name, @target_name]
end

#run(evalue = 1e-5, threads = 1, split = true) ⇒ Object



479
480
481
482
483
484
485
# File 'lib/crb-blast/crb-blast.rb', line 479

def run evalue=1e-5, threads=1, split=true
  makedb
  run_blast evalue, threads, split
  load_outputs
  find_reciprocals
  find_secondaries
end

#run_blast(evalue, threads, split) ⇒ Object

Construct BLAST output file name and run blast with multiple chunks or with multiple threads

Parameters:

  • evalue (Float)

    The evalue cutoff to use with BLAST

  • threads (Integer)

    The number of threads to run

  • split (Boolean)

    If the fasta files should be split into chunks



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/crb-blast/crb-blast.rb', line 143

def run_blast(evalue, threads, split)
  if @databases
    @output1 = "#{@working_dir}/#{query_name}_into_#{target_name}.1.blast"
    @output2 = "#{@working_dir}/#{target_name}_into_#{query_name}.2.blast"
    if @query_is_prot
      if @target_is_prot
        bin1 = "#{@blastp_path} "
        bin2 = "#{@blastp_path} "
      else
        bin1 = "#{@tblastn_path} "
        bin2 = "#{@blastx_path} "
      end
    else
      if @target_is_prot
        bin1 = "#{@blastx_path} "
        bin2 = "#{@tblastn_path} "
      else
        bin1 = "#{@blastn_path} "
        bin2 = "#{@blastn_path} "
      end
    end
    if split and threads > 1
      run_blast_with_splitting evalue, threads, bin1, bin2
    else
      run_blast_with_threads evalue, threads, bin1, bin2
    end
    return true
  else
    return false
  end
end

#run_blast_with_splitting(evalue, threads, bin1, bin2) ⇒ Object

Run BLAST by splitting the input into multiple chunks and using 1 thread for each chunk

Parameters:

  • evalue (Float)

    The evalue cutoff to use with BLAST

  • threads (Integer)

    The number of threads to run

  • bin1 (String)
  • bin2 (String)


218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# File 'lib/crb-blast/crb-blast.rb', line 218

def run_blast_with_splitting evalue, threads, bin1, bin2
  # puts "running blast by splitting input into #{threads} pieces"
  blasts=[]
  files = split_input(@query, threads)
  files.threach(threads) do |thread|
    cmd1 = "#{bin1} -query #{thread} -db #{@working_dir}/#{@target_name} "
    cmd1 << " -out #{thread}.blast -evalue #{evalue} "
    cmd1 << " -outfmt \"6 std qlen slen\" "
    cmd1 << " -max_target_seqs 50 "
    cmd1 << " -num_threads 1"
    if !File.exists?("#{thread}.blast")
      blast1 = Cmd.new(cmd1)
      blast1.run
      if !blast1.status.success?
        raise RuntimeError.new("BLAST Error:\n#{blast1.stderr}")
      end
    end
    blasts << "#{thread}.blast"
  end
  cat_cmd = "cat "
  cat_cmd << blasts.join(" ")
  cat_cmd << " > #{@output1}"
  catting = Cmd.new(cat_cmd)
  catting.run
  if !catting.status.success?
    raise RuntimeError.new("Problem catting files:\n#{catting.stderr}")
  end
  files.each do |file|
    File.delete(file) if File.exist?(file)
  end
  blasts.each do |b|
    File.delete(b) # delete intermediate blast output files
  end

  blasts=[]
  files = split_input(@target, threads)
  files.threach(threads) do |thread|
    cmd2 = "#{bin2} -query #{thread} -db #{@working_dir}/#{@query_name} "
    cmd2 << " -out #{thread}.blast -evalue #{evalue} "
    cmd2 << " -outfmt \"6 std qlen slen\" "
    cmd2 << " -max_target_seqs 50 "
    cmd2 << " -num_threads 1"
    if !File.exists?("#{thread}.blast")
      blast2 = Cmd.new(cmd2)
      blast2.run
      if !blast2.status.success?
        raise RuntimeError.new("BLAST Error:\n#{blast2.stderr}")
      end
    end
    blasts << "#{thread}.blast"
  end
  cat_cmd = "cat "
  cat_cmd << blasts.join(" ")
  cat_cmd << " > #{@output2}"
  catting = Cmd.new(cat_cmd)
  catting.run
  if !catting.status.success?
    raise RuntimeError.new("Problem catting files:\n#{catting.stderr}")
  end
  files.each do |file|
    File.delete(file) if File.exist?(file)
  end
  blasts.each do |b|
    File.delete(b) # delete intermediate blast output files
  end

end

#run_blast_with_threads(evalue, threads, bin1, bin2) ⇒ Object

Run BLAST using its own multithreading

Parameters:

  • evalue (Float)

    The evalue cutoff to use with BLAST

  • threads (Integer)

    The number of threads to run

  • bin1 (String)
  • bin2 (String)


181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/crb-blast/crb-blast.rb', line 181

def run_blast_with_threads evalue, threads, bin1, bin2
  # puts "running blast with #{threads} threads"
  cmd1 = "#{bin1} -query #{@query} -db #{@working_dir}/#{@target_name} "
  cmd1 << " -out #{@output1} -evalue #{evalue} "
  cmd1 << " -outfmt \"6 std qlen slen\" "
  cmd1 << " -max_target_seqs 50 "
  cmd1 << " -num_threads #{threads}"

  cmd2 = "#{bin2} -query #{@target} -db #{@working_dir}/#{@query_name} "
  cmd2 << " -out #{@output2} -evalue #{evalue} "
  cmd2 << " -outfmt \"6 std qlen slen\" "
  cmd2 << " -max_target_seqs 50 "
  cmd2 << " -num_threads #{threads}"
  if !File.exist?("#{@output1}")
    blast1 = Cmd.new(cmd1)
    blast1.run
    if !blast1.status.success?
      raise RuntimeError.new("BLAST Error:\n#{blast1.stderr}")
    end
  end

  if !File.exist?("#{@output2}")
    blast2 = Cmd.new(cmd2)
    blast2.run
    if !blast2.status.success?
      raise RuntimeError.new("BLAST Error:\n#{blast2.stderr}")
    end
  end
end

#sizeObject



487
488
489
490
491
492
493
494
495
# File 'lib/crb-blast/crb-blast.rb', line 487

def size
  hits=0
  @reciprocals.each_pair do |key, list|
    list.each do |hit|
      hits += 1
    end
  end
  hits
end

#split_input(filename, pieces) ⇒ Object

Split a fasta file in pieces

Parameters:

  • filename (String)
  • pieces (Integer)


290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# File 'lib/crb-blast/crb-blast.rb', line 290

def split_input filename, pieces
  input = {}
  name = nil
  seq=""
  File.open(filename).each_line do |line|
    if line =~ /^>(.*)$/
      if name
        input[name]=seq
        seq=""
      end
      name = $1
    else
      seq << line.chomp
    end
  end
  input[name]=seq
  # construct list of output file handles
  outputs=[]
  output_files=[]
  pieces.times do |n|
    outfile = "#{filename}_chunk_#{n}.fasta"
    outfile = File.expand_path(outfile)
    outputs[n] = File.open("#{outfile}", "w")
    output_files[n] = "#{outfile}"
  end
  # write sequences
  count=0
  input.each_pair do |name, seq|
    outputs[count].write(">#{name}\n")
    outputs[count].write("#{seq}\n")
    count += 1
    count %= pieces
  end
  outputs.each do |out|
    out.close
  end
  output_files
end

#write_outputObject



497
498
499
500
501
502
503
504
505
506
507
# File 'lib/crb-blast/crb-blast.rb', line 497

def write_output
  s=""
  unless @reciprocals.nil?
    @reciprocals.each_pair do |query_id, hits|
      hits.each do |hit|
        s << "#{hit}\n"
      end
    end
    File.open("#{@working_dir}/reciprocal_hits.txt", "w") {|f| f.write s }
  end
end