Class: BigSimon::Runners

Inherits:
Object
  • Object
show all
Defined in:
lib/big_simon/runners.rb

Class Method Summary collapse

Class Method Details

.heatmaps(exe, indir, outdir) ⇒ Object



403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
# File 'lib/big_simon/runners.rb', line 403

def self.heatmaps exe, indir, outdir
  FileUtils.mkdir_p outdir

  fnames = Dir.glob("#{indir}/scores*.txt").map do |in_fname|
    extname  = File.extname in_fname
    basename = File.basename in_fname, extname

    out_fname = File.join outdir, "#{basename}.heatmap.pdf"

    [File.absolute_path(in_fname), File.absolute_path(out_fname)]
  end


  rcode_str = BigSimon::Utils.rcode fnames

  Object::File.open(File.join(outdir, "RCODE.r"), "w") do |f|
    f.puts rcode_str
    f.fsync # ensure no data is buffered


    cmd = "#{exe} #{f.path}"
    Process.run_and_time_it! "Drawing heatmaps", cmd
  end

  out_fnames = fnames.map(&:last)
end

.homology(vir_dir, host_dir, outdir, threads, all_seq_lengths) ⇒ Object

TODO:

assert that fname thing matches sequence ID name.

Note:

I will make the specified outdir if it doesn’t exist.

Note:

Assumes that the files end with *.fa

Note:

Assumes that the file names match the IDs. This SHOULD be taken care of by the big_simon program.

For scoring homology-ness, I just sum the bitscore for all significant hits for all genomes.



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# File 'lib/big_simon/runners.rb', line 223

def self.homology vir_dir, host_dir, outdir, threads, all_seq_lengths
  FileUtils.mkdir_p outdir

  host_orfs          = File.join outdir, "host_orfs.homology"
  host_orfs_blast_db = host_orfs + ".blast_db.homology"

  # Call ORFs on Hosts
  cmd = "cat #{host_dir}/*.fa | #{BigSimon::PRODIGAL} " \
  "-d #{host_orfs} " \
  "> /dev/null"

  Process.run_and_time_it! "Predicting host ORFs", cmd

  # Make blast db's for the host genes.
  cmd = "#{BigSimon::MAKEBLASTDB} " \
  "-in #{host_orfs} " \
  "-out #{host_orfs_blast_db} " \
  "-dbtype nucl"

  Process.run_and_time_it! "Making host blast db", cmd

  vir_genome_fnames = Dir.glob(vir_dir + "/*.fa")

  blast_info = Parallel.map(vir_genome_fnames, in_processes: threads) do |vir_genome_fname|
    vir_orfs      = File.join outdir, File.basename(vir_genome_fname) + ".vir_orfs.homology"
    blast_results = File.join outdir, File.basename(vir_genome_fname) + ".blast_results.homology"

    # this will be used as a viral ID.
    vir_simple_fname              = File.basename vir_genome_fname, ".fa"
    blast_table                   = {}
    blast_table[vir_simple_fname] = Hash.new 0

    # Call ORFs on the virus.
    cmd = "#{BigSimon::PRODIGAL} " \
    "-d #{vir_orfs} -p meta -i #{vir_genome_fname} " \
    "> /dev/null"

    Process.run_and_time_it! "Predicting ORFs for #{File.basename vir_genome_fname}", cmd

    # Blast the ORFs against genomes.
    cmd = "#{BigSimon::BLASTN} -query #{vir_orfs} -db #{host_orfs_blast_db} -outfmt 6 -evalue 0.01 -word_size 11 -out #{blast_results}"
    Process.run_and_time_it! "Blasting ORFs for #{File.basename vir_genome_fname}", cmd

    # Remove ORFs file
    FileUtils.rm vir_orfs if File.exist? vir_orfs

    Rya::AbortIf.logger.info { "Parsing #{blast_results}" }
    # Parse the blast.

    File.open(blast_results, "rt").each_line do |line|
      ary = line.chomp.split "\t"

      # The .sub() is to remove the annotation that prodigal gives.
      vir_id  = ary[0].sub(/_[0-9]+$/, "")
      host_id = ary[1].sub(/_[0-9]+$/, "")
      score   = ary[11].to_f

      Rya::AbortIf.assert blast_table.has_key?(vir_id), "blast_table: got #{vir_id} should have been #{vir_simple_fname}"

      Rya::AbortIf.assert all_seq_lengths[vir_id]
      Rya::AbortIf.assert all_seq_lengths[host_id]

      combined_seq_length = all_seq_lengths[vir_id] + all_seq_lengths[host_id]
      score = score / combined_seq_length.to_f * 1000


      blast_table[vir_id][host_id] += score
    end

    # Remove blast file
    # FileUtils.rm_r blast_results if File.exist? blast_results

    # Again, we're assuming the input is .fa, which the big_simon program SHOULD ensure.  TODO check these things with assertions.
    simple_vir_name = File.basename vir_genome_fname.sub(/.fa$/, "")

    [simple_vir_name, blast_table]
  end

  collated_blast_table = {}
  host_simple_names    = Dir.glob(host_dir + "/*.fa").map { |fname| File.basename(fname, ".fa") }

  Rya::AbortIf.assert host_simple_names.length == host_simple_names.uniq.length, "host simple names are not unique"

  Rya::AbortIf.logger.info { "Collating blast results" }

  # Get max score
  max_score = -1
  blast_info.each do |_, blast_table|
    blast_table.each do |vir_id, host_scores|
      this_max  = host_scores.values.max || -1 # sometimes there are no hits at all

      max_score = this_max if this_max > max_score
    end
  end
  Rya::AbortIf.assert max_score > -1, "didn't get any scores"


  klass = Class.new.extend Rya::CoreExtensions::Math
  blast_info.each do |simple_vir_name, blast_table|
    blast_table.each do |vir_id, host_scores|
      collated_blast_table[vir_id] = []

      host_simple_names.each do |host_id|


        combined_seq_length = all_seq_lengths[vir_id] + all_seq_lengths[host_id]
        scaled_score = klass.scale host_scores[host_id].to_f, 0, max_score, 1, 0

        host_table = { host: host_id, score: host_scores[host_id], scaled_score: scaled_score }
        collated_blast_table[vir_id] << host_table
      end
    end
  end

  pp collated_blast_table

  collated_blast_table
end

.mummer(exe, vir_dir, host_dir, outdir, threads, all_seq_lengths) ⇒ Object

Note:

To match the other things, you’d like them to be key’d on the file name.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/big_simon/runners.rb', line 9

def self.mummer exe, vir_dir, host_dir, outdir, threads, all_seq_lengths
  klass = Class.new.extend Rya::CoreExtensions::Math
  FileUtils.mkdir_p outdir

  mummer_outfname = File.join outdir, "mummer_out.txt"

  virus_fnames = Dir.glob(vir_dir + "/*")
  host_fnames  = Dir.glob(host_dir + "/*")

  hit_table = {}

  Tempfile.open do |vir_f|
    Tempfile.open do |host_f|
      virus_fnames.each do |fname|
        Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"

        Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
          # id needs to be the file name
          new_id = File.basename fname.sub(/.fa$/, "")

          hit_table[new_id] = {}

          vir_f.puts ">#{new_id}\n#{rec.seq}"

          vir_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
        end
      end

      host_fnames.each do |fname|
        Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"

        Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
          new_id = File.basename fname.sub(/.fa$/, "")

          # Add this host to each virus in the hit_table
          hit_table.each do |virus, host_table|
            host_table[new_id] = 0 # set it to defualt score of 0
          end

          host_f.puts ">#{new_id}\n#{rec.seq}"
          host_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
        end
      end

      vir_f.fsync
      host_f.fsync

      # -k 3 index every third position in reference (broken now, bug in mummer)
      # -n -k 3 -threads 3
      # -n match only A C T G
      cmd = "mummer -n -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
      Process.run_and_time_it! "MUMMER", cmd
    end
  end

  virus             = nil
  overall_max_score = 0
  File.open(mummer_outfname, "rt").each_line.with_index do |line, idx|
    line.chomp!

    unless line.empty?
      if line.start_with? ">"
        virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip

        # It can be duplicated as there are forward and reverse for each sequence (in case they're contigs.)

        Rya::AbortIf.assert hit_table.has_key?(virus)
        # unless hit_table.has_key? virus
        #   hit_table[virus] = {}
        # end
      else
        ary = line.strip.split " "

        host  = ary[0].sub(/___reverse$/, "").strip

        Rya::AbortIf.assert hit_table[virus].has_key?(host)

        Rya::AbortIf.assert all_seq_lengths[virus]
        Rya::AbortIf.assert all_seq_lengths[host]

        combined_seq_length = all_seq_lengths[virus] + all_seq_lengths[host]

        score = ary[3].to_i / combined_seq_length * 1000


        # unless hit_table[virus].has_key? host
        #   hit_table[virus][host] = -1
        # end

        # We only want the longest hit.
        hit_table[virus][host] = score if score > hit_table[virus][host]

        # Track the overall max for scaling.
        overall_max_score      = score if score > overall_max_score
      end
    end
  end

  results_table = {}

  min, max, from, to = 0, overall_max_score, 1, 0

  hit_table.each do |virus, host_table|
    results_table[virus] = []

    host_table.each do |host, score|
      scaled_score = klass.scale score, min, max, from, to

      results_table[virus] << { host: host, score: score, scaled_score: scaled_score }
    end
  end

  results_table
end

.mummer2(exe, vir_dir, host_dir, outdir, threads) ⇒ Object

TODO:

Also do the reverse of each genome in case it’s a contig.

This one’s a bit different as it parses as well and returns original names.



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/big_simon/runners.rb', line 126

def self.mummer2 exe, vir_dir, host_dir, outdir, threads
  klass = Class.new.extend Rya::CoreExtensions::Math
  FileUtils.mkdir_p outdir

  # TODO put these all in one file then do it?

  results = {}

  # Takes names in files and puts them to the file names
  name_map = {}

  Dir.glob(vir_dir + "/*").each do |vir_fname|
    this_virus_scores = []
    virus             = nil

    Dir.glob(host_dir + "/*").each do |host_fname|
      vir_base  = File.basename vir_fname
      host_base = File.basename host_fname
      outfname  = File.join outdir, "#{vir_base}___#{host_base}.mummer"

      # -l is min length of a match TODO pull this into a const
      # -F to force 4 columns
      cmd = "#{exe} -F " \
            "-maxmatch " \
            "-l 15 " \
            "#{host_fname} " \
            "#{vir_fname} " \
            "> #{outfname}"

      Process.run_and_time_it! "Calculating matches", cmd

      # Note there should only be one '>' per file here.
      host  = nil
      score = 0
      File.open(outfname, "rt").each_line.with_index do |line, idx|
        if idx.zero?
          this_virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip

          Rya::AbortIf::abort_unless(this_virus == virus, "OOPS") if virus

          virus ||= this_virus
        else
          ary = line.chomp.strip.split(" ")
          Rya::AbortIf.abort_unless ary.count == 4, "Problem parsing #{outfname} (mummer output)"

          host  = ary[0].sub(/___reverse$/, "").strip
          len   = ary[3].to_i

          score = len if len > score
        end
      end

      this_virus_scores << score

      unless results.has_key? virus
        results[virus] = []
      end

      results[virus] << { host: host, score: score, scaled_score: nil }

      FileUtils.rm outfname
    end

    # This was the original scaling, i.e. per virus
    # min = 0 # this_virus_scores.min # Technically, this should range from 0 to 15.  Any data missing from this table would give a zero.  TODO we don't actually account for this though.
    # max = this_virus_scores.max
    # from = 1
    # to = 0
    #
    # results[virus].each do |host_table|
    #   host_table[:scaled_score] = klass.scale host_table[:score], min, max, from, to
    # end
  end

  all_scores = []
  results.each do |virus, host_tables|
    all_scores << host_tables.map { |table| table[:score] }
  end

  all_scores.flatten!
  max = all_scores.max

  results.each do |virus, host_tables|
    host_tables.each do |host_table|
      host_table[:scaled_score] = klass.scale host_table[:score], 0, max, 1, 0
    end
  end

  results
end

.vir_host_matcher(exe, vir_dir, host_dir, outdir) ⇒ Object



342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
# File 'lib/big_simon/runners.rb', line 342

def self.vir_host_matcher exe, vir_dir, host_dir, outdir
  FileUtils.mkdir_p outdir

  cmd = "python #{exe} " \
  "-v #{vir_dir} " \
  "-b #{host_dir} " \
  "-o #{outdir} " \
  "-d 1" # only compute d2star dissimilarity

  Process.run_and_time_it! "Computing d2star dissimilarity", cmd

  tmp_dir = File.join outdir, "tmp"
  FileUtils.rm_r tmp_dir if Dir.exist? tmp_dir

  bad_files = %w[d2star_k6_main.html hostTaxa.txt_new.txt]
  bad_files.each do |fname|
    path = File.join outdir, fname

    FileUtils.rm path if File.exist? path
  end

  outf     = File.join outdir, "d2star_k6.csv"
  new_outf = File.join outdir, "vir_host_matcher.txt"
  FileUtils.mv outf, new_outf

  new_outf
end

.wish(exe, vir_dir, host_dir, outdir, threads) ⇒ Object

Runs the WIsH program

Raises:

  • (AbortIf::Exit)

    if commands fail



373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
# File 'lib/big_simon/runners.rb', line 373

def self.wish exe, vir_dir, host_dir, outdir, threads
  model_dir = File.join outdir, "model"

  FileUtils.mkdir_p model_dir

  build_model = "#{exe} " \
  "-t #{threads} " \
  "-c build " \
  "-g #{host_dir} " \
  "-m #{model_dir}"

  predict = "#{exe} " \
  "-t #{threads} " \
  "-c predict " \
  "-g #{vir_dir} " \
  "-m #{model_dir} " \
  "-r #{outdir}"

  Process.run_and_time_it! "Building model", build_model
  Process.run_and_time_it! "Predicting host", predict

  FileUtils.rm_r model_dir if Dir.exist? model_dir

  outf     = File.join outdir, "llikelihood.matrix"
  new_outf = File.join outdir, "wish.txt"
  FileUtils.mv outf, new_outf

  new_outf
end