Class: MiGA::Cli::Action::Doctor

Inherits:
MiGA::Cli::Action show all
Includes:
Base
Defined in:
lib/miga/cli/action/doctor.rb,
lib/miga/cli/action/doctor/base.rb

Defined Under Namespace

Modules: Base

Constant Summary collapse

# Supported doctor operations, in the order #perform runs them.
# Each key is the operation symbol: it is used as the +cli+ option key and to
# build the +check_<key>+ method name dispatched by #perform.
# Each value is a two-item Array: the task name accepted by the --ignore and
# --only options, followed by the description shown in the option help.
@@OPERATIONS =
{
  status: ['status', 'Update metadata status of all datasets'],
  db: ['databases', 'Check integrity of database files'],
  bidir: ['bidirectional', 'Check distances are bidirectional'],
  dist: ['distances', 'Check distance summary tables'],
  files: ['files', 'Check for outdated files'],
  cds: ['cds', 'Check for gzipped genes and proteins'],
  ess: ['essential-genes', 'Check for outdated essential genes'],
  mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
  start: ['start', 'Check for lingering .start files'],
  tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
}

Constants included from MiGA

MiGA::CITATION, VERSION, VERSION_DATE, VERSION_NAME

Instance Attribute Summary

Attributes inherited from MiGA::Cli::Action

#cli

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Base

#check_dist_eval, #check_dist_fix, #check_dist_recompute, #check_sqlite3_database, #each_database_file, #read_bidirectional, #save_bidirectional, #saved_targets

Methods inherited from MiGA::Cli::Action

#complete, #empty_action, #initialize, #launch, load, #name

Methods inherited from MiGA

CITATION, CITATION_ARRAY, DEBUG, DEBUG_OFF, DEBUG_ON, DEBUG_TRACE_OFF, DEBUG_TRACE_ON, FULL_VERSION, LONG_VERSION, VERSION, VERSION_DATE, #advance, debug?, debug_trace?, initialized?, #like_io?, #num_suffix, rc_path, #result_files_exist?, #say

Methods included from MiGA::Common::Path

#root_path, #script_path

Methods included from MiGA::Common::Format

#clean_fasta_file, #seqs_length, #tabulate

Methods included from MiGA::Common::Net

#download_file_ftp, #known_hosts, #remote_connection

Methods included from MiGA::Common::SystemCall

#run_cmd_opts

Constructor Details

This class inherits a constructor from MiGA::Cli::Action

Class Method Details

.OPERATIONS ⇒ Object

All supported operations



58
59
60
# File 'lib/miga/cli/action/doctor.rb', line 58

##
# All supported operations: returns the +@@OPERATIONS+ class variable
def OPERATIONS
  @@OPERATIONS
end

Instance Method Details

#check_bidir(cli) ⇒ Object

Perform bidirectional operation with MiGA::Cli cli



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/miga/cli/action/doctor.rb', line 117

##
# Perform bidirectional operation with MiGA::Cli +cli+
#
# Works in three phases:
# 1. Every worker started by MiGA::Parallel.process dumps the non-empty rows
#    returned by +read_bidirectional+ for its share of the reference datasets
#    into its own file, inside a temporary folder in the project path.
# 2. The dumps are merged into one Hash per metric (+:aai+ and +:ani+). A pair
#    found in both directions cancels out, so only pairs seen in a single
#    direction remain.
# 3. +save_bidirectional+ is called for every reference dataset with the
#    merged Hash.
#
# Raises RuntimeError if a dump is incomplete or names an unknown metric. In
# that case the temporary folder is left in place.
def check_bidir(cli)
  cli.say 'Checking if reference distances are bidirectional'
  project = cli.load_project
  ref_ds = project.each_dataset.select(&:ref?)
  n = ref_ds.size

  # Read data first (threaded)
  tmp = File.join(project.path, 'doctor-bidirectional.tmp')
  FileUtils.mkdir_p(tmp)
  MiGA::Parallel.process(cli[:threads]) do |thr|
    # Block form guarantees the handle is closed even if a read fails
    File.open(File.join(tmp, "#{thr}.json"), 'w') do |fh|
      %i[aai ani].each do |metric|
        fh.puts "# #{metric}"
        ref_ds.each_with_index do |ds, idx|
          # Datasets are assigned to workers round-robin by index
          next unless idx % cli[:threads] == thr

          cli.advance('Reading:', idx + 1, n, false) if thr == 0
          row = read_bidirectional(ds, metric)
          fh.puts "#{ds.name} #{JSON.fast_generate(row)}" unless row.empty?
        end
      end
      # Sentinel used by the merge step to detect truncated dumps
      fh.puts '# end'
      fh.flush # necessary for large threaded runs
    end
    if thr == 0
      cli.advance('Reading:', n, n, false)
      cli.say
    end
  end

  # Merge pieces per thread
  dist = { aai: {}, ani: {} }
  cli[:threads].times do |i|
    cli.advance('Merging:', i + 1, cli[:threads], false)
    file = File.join(tmp, "#{i}.json")
    File.open(file, 'r') do |fh|
      metric = nil
      fh.each do |ln|
        # Lines are either "# <metric>" headers or "<dataset> <json row>"
        qry, row = ln.chomp.split(' ', 2)
        if qry == '#'
          metric = row.to_sym
        else
          raise "Unrecognized metric: #{metric}" unless dist[metric]

          JSON.parse(row).each do |sbj, val|
            dist[metric][qry] ||= {}
            if dist[metric][sbj]&.include?(qry)
              dist[metric][sbj].delete(qry) # Already bidirectional
            else
              dist[metric][qry][sbj] = val
            end
          end
        end
      end
      raise "Incomplete thread dump: #{file}" unless metric == :end
    end
  end
  cli.say
  FileUtils.rm_rf(tmp)

  # Write missing values (threaded)
  MiGA::Parallel.distribute(ref_ds, cli[:threads]) do |ds, idx, thr|
    cli.advance('Datasets:', idx + 1, n, false) if thr == 0
    save_bidirectional(ds, dist)
  end
  cli.say
end

#check_cds(cli) ⇒ Object

Perform cds operation with MiGA::Cli cli



228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# File 'lib/miga/cli/action/doctor.rb', line 228

##
# Perform cds operation with MiGA::Cli +cli+
#
# For every dataset with a +:cds+ result, gzips each of the genes, proteins,
# gff3, gff2, and tab files whose registered path does not contain ".gz".
# When at least one file was compressed, the +:cds+ result is registered again
# and the +:stats+ result (if any) is removed.
def check_cds(cli)
  cli.say 'Looking for unzipped genes or proteins'
  n = cli.load_project.dataset_names.size
  k = 0
  cli.load_project.each_dataset do |d|
    cli.advance('Datasets:', k += 1, n, false)
    res = d.result(:cds)
    next unless res

    changed = false
    %i[genes proteins gff3 gff2 tab].each do |f|
      file = res.file_path(f)
      next unless file
      next if file =~ /\.gz/ # Already compressed

      cli.say "  > Gzipping #{d.name} #{f}   "
      run_cmd(['gzip', '-9', file])
      changed = true
    end
    next unless changed

    # File names changed on disk, so the result must be registered again
    d.add_result(:cds, true, force: true)
    sr = d.result(:stats)
    sr.remove! if sr
  end
  cli.say
end

#check_db(cli) ⇒ Object

Perform databases operation with MiGA::Cli cli



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/miga/cli/action/doctor.rb', line 85

##
# Perform databases operation with MiGA::Cli +cli+
#
# Forks one child process per requested thread. Every child walks all the
# datasets but only inspects those whose running count modulo the number of
# threads matches its own index. Database files reported as malformed by
# +check_sqlite3_database+ are deleted, together with the files at the
# +:done+ path and the default path of the owning result, when they exist.
def check_db(cli)
  cli.say 'Checking integrity of databases'
  project = cli.load_project
  total = project.dataset_names.size
  cli[:threads].times do |worker|
    Process.fork do
      seen = 0
      project.each_dataset do |dataset|
        seen += 1
        # Only the first child reports progress
        cli.advance('Datasets:', seen, total, false) if worker.zero?
        next if seen % cli[:threads] != worker

        each_database_file(dataset) do |db_file, metric, result, _rank|
          check_sqlite3_database(db_file, metric) do
            cli.say(
              "  > Removing malformed database from #{dataset.name}:#{result}   "
            )
            File.unlink(db_file)
            registered = dataset.result(result)
            next unless registered

            stale = [registered.path(:done), registered.path]
            stale.each { |path| File.unlink(path) if File.exist?(path) }
          end
        end
      end
    end
  end
  Process.waitall
  cli.say
end

#check_dist(cli) ⇒ Object

Perform distances operation with MiGA::Cli cli



188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/miga/cli/action/doctor.rb', line 188

##
# Perform distances operation with MiGA::Cli +cli+
#
# For each of the +ani+ and +aai+ metrics, looks up the project result named
# "<metric>_distances" and, when present, evaluates it with
# +check_dist_eval+, then passes the outcome to +check_dist_fix+ and
# +check_dist_recompute+.
def check_dist(cli)
  project = cli.load_project
  %i[ani aai].each do |dist|
    res = project.result("#{dist}_distances")
    next if res.nil?

    cli.say "Checking #{dist} table for consistent datasets"
    notok, fix = check_dist_eval(cli, project, res)
    check_dist_fix(cli, project, fix)
    check_dist_recompute(cli, res, notok)
  end
end

#check_ess(cli) ⇒ Object

Perform essential-genes operation with MiGA::Cli cli



253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# File 'lib/miga/cli/action/doctor.rb', line 253

##
# Perform essential-genes operation with MiGA::Cli +cli+
#
# For every dataset with an +:essential_genes+ result:
# - If the result has no +:collection+ file, or +outdated_fastaai_ess+ flags
#   it, the result is removed together with the +:stats+ result (if any).
# - Otherwise, if the collection folder still holds loose +.faa+ files, they
#   are packed into +proteins.tar.gz+ and deleted.
def check_ess(cli)
  cli.say 'Looking for outdated essential genes'
  cli.load_project.each_dataset do |d|
    res = d.result(:essential_genes)
    next if res.nil?

    dir = res.file_path(:collection)
    if dir.nil? || outdated_fastaai_ess(res)
      cli.say "  > Removing #{d.name}:essential_genes"
      res.remove!
      d.result(:stats)&.remove!
      next
    end
    next if Dir["#{dir}/*.faa"].empty?

    cli.say "  > Fixing #{d.name}"
    # The folder path is shell-escaped because it is interpolated into a
    # command line
    run_cmd <<~CMD
      cd #{dir.shellescape} && tar -zcf proteins.tar.gz *.faa && rm *.faa
    CMD
  end
end

#check_files(cli) ⇒ Object

Perform files operation with MiGA::Cli cli



203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/miga/cli/action/doctor.rb', line 203

##
# Perform files operation with MiGA::Cli +cli+
#
# Walks every result of every dataset and, as soon as one of the files of a
# result is missing from disk, registers that result again and removes the
# +:stats+ result of the dataset (if any).
def check_files(cli)
  cli.say 'Looking for outdated files in results'
  total = cli.load_project.dataset_names.size
  seen = 0
  cli.load_project.each_dataset do |dataset|
    seen += 1
    cli.advance('Datasets:', seen, total, false)
    dataset.each_result do |key, result|
      missing = false
      result.each_file do |_f_sym, _f_rel, abs_path|
        next if File.exist?(abs_path)

        # One missing file is enough, no need to check the rest
        missing = true
        break
      end
      if missing
        cli.say "  > Registering again #{dataset.name}:#{key}   "
        dataset.add_result(key, true, force: true)
        stats = dataset.result(:stats)
        stats.remove! if stats
      end
    end
  end
  cli.say
end

#check_mts(cli) ⇒ Object

Perform mytaxa-scan operation with MiGA::Cli cli



285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
# File 'lib/miga/cli/action/doctor.rb', line 285

##
# Perform mytaxa-scan operation with MiGA::Cli +cli+
#
# For every dataset with a +:mytaxa_scan+ result:
# - If the result registers a +:regions+ path and that folder exists, the
#   "<dataset>.reg" folder is archived as "<dataset>.reg.tar.gz" and removed.
# - Any registered blast, mytaxain, wintax, gene_ids, or region_ids file that
#   exists on disk is deleted.
# - If any of those entries was registered, the result is registered again.
def check_mts(cli)
  cli.say 'Looking for unarchived MyTaxa Scan runs'
  cli.load_project.each_dataset do |d|
    res = d.result(:mytaxa_scan)
    next if res.nil?

    dir = res.file_path(:regions)
    fix = false
    unless dir.nil?
      if Dir.exist? dir
        # The command runs from the parent folder of the regions folder
        run_cmd <<~CMD
          cd #{dir.shellescape}/.. \
              && tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
              && rm -r '#{d.name}.reg'
        CMD
      end
      fix = true
    end
    %i[blast mytaxain wintax gene_ids region_ids].each do |ext|
      file = res.file_path(ext)
      unless file.nil?
        FileUtils.rm(file) if File.exist? file
        fix = true
      end
    end
    if fix
      cli.say "  > Fixing #{d.name}"
      d.add_result(:mytaxa_scan, true, force: true)
    end
  end
end

#check_start(cli) ⇒ Object

Perform start operation with MiGA::Cli cli



319
320
321
322
323
324
325
326
327
328
329
# File 'lib/miga/cli/action/doctor.rb', line 319

##
# Perform start operation with MiGA::Cli +cli+
#
# Saves again every result, of every dataset, for which the file at its
# +:start+ path exists.
def check_start(cli)
  cli.say 'Looking for legacy .start files lingering'
  cli.load_project.each_dataset do |dataset|
    dataset.each_result do |key, result|
      next unless File.exist?(result.path(:start))

      cli.say "  > Registering again #{dataset.name}:#{key}"
      result.save
    end
  end
end

#check_status(cli) ⇒ Object

Perform status operation with MiGA::Cli cli



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/miga/cli/action/doctor.rb', line 65

##
# Perform status operation with MiGA::Cli +cli+
#
# Forks one child process per requested thread. Every child loads the project
# and walks all its datasets, recalculating the status only of those whose
# running count modulo the number of threads matches the index of the child.
def check_status(cli)
  cli.say 'Updating metadata status'
  total = cli.load_project.dataset_names.size
  cli[:threads].times do |worker|
    Process.fork do
      seen = 0
      cli.load_project.each_dataset do |dataset|
        seen += 1
        # Only the first child reports progress
        cli.advance('Datasets:', seen, total, false) if worker.zero?
        next unless seen % cli[:threads] == worker

        dataset.recalculate_status
      end
    end
  end
  Process.waitall
  cli.say
end

#check_tax(cli) ⇒ Object

Perform taxonomy operation with MiGA::Cli cli



333
334
335
336
337
338
# File 'lib/miga/cli/action/doctor.rb', line 333

##
# Perform taxonomy operation with MiGA::Cli +cli+
#
# Not yet implemented: the body only holds the notes below, so calling it
# does nothing and returns nil.
def check_tax(cli)
  # cli.say 'o Checking for taxonomy/distances consistency'
  # TODO: Find 95%ANI clusters with entries from different species
  # TODO: Find different 95%ANI clusters with genomes from the same species
  # TODO: Find AAI values too high or too low for each LCA rank
end

#outdated_fastaai_ess(res) ⇒ Object

Check if the essential genes result res has an outdated FastAAI index



277
278
279
280
281
# File 'lib/miga/cli/action/doctor.rb', line 277

##
# Check if the essential genes result +res+ has an outdated FastAAI index
#
# Returns true only when +res+ registers a +:fastaai_index+ file but no
# +:fastaai_index_2+ file, and false otherwise.
def outdated_fastaai_ess(res)
  has_first_index = !res.file_path(:fastaai_index).nil?
  has_second_index = !res.file_path(:fastaai_index_2).nil?
  has_first_index && !has_second_index
end

#parse_cli ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/miga/cli/action/doctor.rb', line 9

##
# Define the defaults and command-line options of the action
#
# By default all the operations in +@@OPERATIONS+ are enabled and one thread
# is used. Task names given to --ignore and --only are matched
# case-insensitively against the names in +@@OPERATIONS+.
#
# Raises RuntimeError if --only receives an unrecognized task. Unrecognized
# tasks passed to --ignore are reported with a warning and skipped.
def parse_cli
  # Assigned in one step so no default is lost whether +defaults=+ merges
  # or replaces -- NOTE(review): its implementation is not visible here
  enabled = @@OPERATIONS.keys.map { |key| [key, true] }.to_h
  cli.defaults = { threads: 1 }.merge(enabled)
  cli.parse do |opt|
    # Task name (as typed by the user) => operation key
    task_keys = @@OPERATIONS.map { |key, (task, _desc)| [task, key] }.to_h
    cli.opt_object(opt, [:project])
    opt.on(
      '--ignore TASK1,TASK2', Array,
      'Do not perform the task(s) listed. Available tasks are:',
      *@@OPERATIONS.values.map { |task, desc| "~ #{task}: #{desc}" }
    ) do |tasks|
      tasks.each do |task|
        key = task_keys[task.downcase]
        if key.nil?
          # Avoid writing a nil option key for mistyped task names
          warn "Unrecognized task in --ignore: #{task}"
        else
          cli[key] = false
        end
      end
    end
    opt.on(
      '--only TASK',
      'Perform only the specified task (see --ignore)'
    ) do |task|
      key = task_keys[task.downcase]
      raise "Unrecognized task: #{task}" if key.nil?

      @@OPERATIONS.each_key { |k| cli[k] = false }
      cli[key] = true
    end
    opt.on(
      '-t', '--threads INT', Integer,
      "Concurrent threads to use. By default: #{cli[:threads]}"
    ) { |v| cli[:threads] = v }
  end
end

#perform ⇒ Object



35
36
37
38
39
40
# File 'lib/miga/cli/action/doctor.rb', line 35

##
# Load the project and run, in the order of +@@OPERATIONS+, the
# +check_<operation>+ method of every operation enabled in +cli+
def perform
  cli.load_project
  @@OPERATIONS.each_key do |operation|
    next unless cli[operation]

    send("check_#{operation}", cli)
  end
end

#run_cmd(cmd, opts = {}) ⇒ Object

Run command cmd with options opts



342
343
344
345
346
# File 'lib/miga/cli/action/doctor.rb', line 342

##
# Run command +cmd+ with options +opts+
#
# Delegates to MiGA::MiGA.run_cmd with +return: :output+, +err2out: true+,
# and +raise: false+ unless overridden in +opts+. Any non-empty output,
# without its trailing newline, is sent to +warn+.
def run_cmd(cmd, opts = {})
  defaults = { return: :output, err2out: true, raise: false }
  output = MiGA::MiGA.run_cmd(cmd, defaults.merge(opts)).chomp
  warn(output) unless output.empty?
end