Module: Aai

Includes:: Utils

Defined in:: lib/aai/utils.rb,
lib/aai/version.rb,
lib/aai/core_extensions.rb,
lib/aai.rb

Defined Under Namespace

Constant Summary collapse

VERSION =

"0.5.4"

COPYRIGHT =

"2017 Ryan Moore"

CONTACT =

"[email protected]"

WEBSITE =

"https://github.com/mooreryan/aai"

LICENSE =

"MIT"

VERSION_BANNER =

"  # Version: #{VERSION}
# Copyright #{COPYRIGHT}
# Contact: #{CONTACT}
# Website: #{WEBSITE}
# License: #{LICENSE}"

BLAST_DB_SEPARATOR =

"...."

BLAST_DB_SUFFIX =

"#{BLAST_DB_SEPARATOR}aai"

PIDENT_CUTOFF =

EVALUE_CUTOFF =

1e-3

LENGTH_CUTOFF = actually is 70 percent

Instance Method Summary collapse

#aai_strings(one_way_aai, two_way_aai) ⇒ Object

Returns an array (enumerable) of aai strings ready to print.
#blast_permutations!(fastas, blast_dbs, cpus = 4) ⇒ Object
#get_best_hits(fnames, seq_lengths) ⇒ Object
#make_blastdbs!(fnames, cpus = 4) ⇒ Array<String>

Make blast dbs given an array of filenames.
#one_way_aai(best_hits) ⇒ Object
#process_input_seqs!(fnames) ⇒ Object

Returns a hash table with sequence lengths and writes new fasta files with clean headers for blast.
#two_way_aai(best_hits) ⇒ Object

Methods included from Utils

#check_command, #check_files, #clean_str, #command?, #one_way_combinations, #two_ary_permutations

Instance Method Details

#aai_strings(one_way_aai, two_way_aai) ⇒ `Object`

Returns an array (enumerable) of aai strings ready to print

# File 'lib/aai.rb', line 274

def aai_strings one_way_aai, two_way_aai
  aai_strings = {}
  keys = (one_way_aai.keys + two_way_aai.keys).
         map { |key| key.sort }.uniq

  keys.each do |key|
    a_to_b_aai = one_way_aai[key] || "NA"
    b_to_a_aai = one_way_aai[key.reverse] || "NA"
    two_way = two_way_aai[key] || "NA"

    aai_strings[key] = [a_to_b_aai,
                        b_to_a_aai,
                        two_way]
  end

  aai_strings.map do |genome_pair, aais|
    [genome_pair, aais].flatten.join "\t"
  end
end

#blast_permutations!(fastas, blast_dbs, cpus = 4) ⇒ `Object`

# File 'lib/aai.rb', line 24

def blast_permutations! fastas, blast_dbs, cpus=4
  file_permutations = one_way_combinations fastas, blast_dbs, true
  file_permutations = file_permutations.select do |f1, f2|
    genome_from_fname(f1) != genome_from_fname(f2)
  end

  first_files = file_permutations.map(&:first)
  second_files = file_permutations.map(&:last)

  first_genomes = first_files.map do |fname|
    ary = fname.split(".")
    ary.take(ary.length - 1).join
  end

  second_genomes = second_files.map do |fname|
    ary = fname.split(BLAST_DB_SEPARATOR).take(1)
    AbortIf.abort_unless ary.length == 1,
                 "Bad file name for #{fname}"

    ary = ary.first.split(".")

    File.basename ary.take(ary.length - 1).join
  end

  outf_names = first_genomes.zip(second_genomes).map do |f1, f2|
    "#{f1}____#{f2}.aai_blastp"
  end

  args = first_files.length.times.map do |idx|
    [first_files[idx], second_files[idx], outf_names[idx]]
  end

  Parallel.each(args, in_processes: cpus) do |infiles|
    query = infiles[0]
    db    = infiles[1]
    out   = infiles[2]

    cmd = "diamond blastp --threads 1 --outfmt 6 " +
          "--query #{query} --db #{db} --out #{out} " +
          "--evalue #{EVALUE_CUTOFF}"

    Process.run_and_time_it! "Diamond blast", cmd
  end

  outf_names
end

#get_best_hits(fnames, seq_lengths) ⇒ `Object`

# File 'lib/aai.rb', line 122

def get_best_hits fnames, seq_lengths
  best_hits = {}
  num_fnames = fnames.count
  log_every = num_fnames > 10 ? num_fnames / 10 : 1
  fnames.each_with_index do |fname, idx| # blast files
    if (idx % log_every).zero?
      AbortIf.logger.debug { "Working on blastp file " +
                             "##{idx} of #{num_fnames}"}
    end

    File.open(fname, "rt").each_line do |line|
      ary = line.chomp.split "\t"

      query  = ary[0]
      target = ary[1]
      pident = ary[2].to_f
      length = ary[3].to_f
      evalue = ary[10].to_f

      query_genome = query.split("____").first
      query_seq    = query.split("____").last

      target_genome = target.split("____").first
      target_seq    = target.split("____").last

      seq_length_key = "#{query_genome}____#{query_seq}"

      AbortIf.abort_unless seq_lengths.has_key?(seq_length_key),
                           "#{seq_length_key} is missing from " +
                           "seq_lengths"

      query_length = seq_lengths[seq_length_key].to_f
      length_percent = length / query_length * 100

      hit_info = {
        query_name: query_seq,
        target_name: target_seq,
        query_genome: query_genome,
        target_genome: target_genome,
        pident: pident,
        length: length_percent,
        evalue: evalue
      }

      # check if it is a best hit candidate
      if pident >= PIDENT_CUTOFF &&
         evalue <= EVALUE_CUTOFF &&
         length_percent >= LENGTH_CUTOFF

        if best_hits.has_key? query_genome
          if best_hits[query_genome].has_key? query_seq
            if best_hits[query_genome][query_seq].
                has_key?(target_genome)
              # check if we should replace the best hit?
              current_best_hit =
                best_hits[query_genome][query_seq][target_genome]

              if pident >= current_best_hit[:pident]
                best_hits[query_genome][query_seq][target_genome] =
                  hit_info
              end
            else
              best_hits[query_genome][query_seq][target_genome] =
                hit_info
            end
          else
            best_hits[query_genome][query_seq] = {
              target_genome => hit_info
            }
          end
        else
          best_hits[query_genome] = {
            query_seq => {
              target_genome => hit_info
            }
          }
        end
      else
        # pass
      end
    end
  end

  best_hits
end

#make_blastdbs!(fnames, cpus = 4) ⇒ `Array<String>`

Make blast dbs given an array of filenames.

Parameters:

fnames (Array<String>) —

an array of filenames
cpus (Integer) (defaults to: 4) —

number of cpus to use

Returns:

(Array<String>) —

blast db basenames

# File 'lib/aai.rb', line 77

def make_blastdbs! fnames, cpus=4
  suffix = BLAST_DB_SUFFIX
  outfiles = fnames.map { |fname| fname + suffix }

  Parallel.each(fnames, in_processes: cpus) do |fname|
    cmd = "diamond makedb --threads 1 --in #{fname} " +
          "--db #{fname}#{BLAST_DB_SUFFIX}"

    Process.run_and_time_it! "Make db", cmd
  end

  outfiles
end

#one_way_aai(best_hits) ⇒ `Object`

# File 'lib/aai.rb', line 208

def one_way_aai best_hits
  one_way_best_hits(best_hits).map do |genome_pair, best_hits|
    [genome_pair, best_hits.map { |hit| hit[:pident] }.reduce(:+) /
                  best_hits.length.to_f]
  end.to_h
end

#process_input_seqs!(fnames) ⇒ `Object`

Returns a hash table with sequence lengths and writes new fasta files with clean headers for blast.

The sequences are annotated with the genome that they came from.

Parameters:

fnames (Array<String>) —

an array of fasta file names

# File 'lib/aai.rb', line 97

def process_input_seqs! fnames
  seq_lengths = {}
  clean_fnames = []

  fnames.each do |fname|
    clean_fname = fname + "_aai_clean"
    clean_fnames << clean_fname
    File.open(clean_fname, "w") do |f|
      Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
        unless bad_seq? rec.seq
          header =
            annotate_header clean_header(rec.header),
                            File.basename(fname)

          seq_lengths[header] = rec.seq.length

          f.puts ">#{header}\n#{rec.seq}"
        end
      end
    end
  end

  [seq_lengths, clean_fnames]
end

#two_way_aai(best_hits) ⇒ `Object`

# File 'lib/aai.rb', line 215

def two_way_aai best_hits
  # the pair key is the [g1, g2].sort

  two_way_aai = {}

  one_way_hits = one_way_best_hits best_hits
  genome_pair_keys = one_way_hits.keys.map { |pair| pair.sort }.uniq

  num_genome_pairs = genome_pair_keys.count
  log_every = num_genome_pairs > 10 ? num_genome_pairs / 10 : 1
  genome_pair_keys.each_with_index do |pair_key, idx|
    if (idx % log_every).zero?
      AbortIf.logger.debug { "Working on genome pair ##{idx} of " +
                             "#{num_genome_pairs}" }
    end

    if one_way_hits.has_key?(pair_key) &&
       one_way_hits.has_key?(pair_key.reverse)

      forward_hits = one_way_hits[pair_key]
      reverse_hits = one_way_hits[pair_key.reverse]

      combinations = one_way_combinations forward_hits, reverse_hits

      two_way_hits = combinations.select do |h1, h2|
        two_way_hit? h1, h2
      end

      two_way_hit_info = two_way_hits.map do |h1, h2|
        { genome_pair: [h1[:query_genome],
                        h1[:target_genome]].sort,
          pident: (h1[:pident] + h2[:pident]) / 2.0 }
      end

      two_way_hit_info.each do |hit|
        if two_way_aai.has_key? hit[:genome_pair]
          two_way_aai[hit[:genome_pair]] << hit[:pident]
        else
          two_way_aai[hit[:genome_pair]] = [hit[:pident]]
        end
      end
    elsif !one_way_hits.has_key?(pair_key)
      AbortIf.logger.warn { "No pair info for #{pair_key}. " +
                            "No two way hits possible " +
                            "for #{pair_key}." }
    elsif !one_way_hits.has_key?(pair_key.reverse)
      AbortIf.logger.warn { "No pair info for #{pair_key.reverse}. " +
                            "No two way hits possible " +
                            "for #{pair_key}." }
    end
  end

  # outside of genome_pair_keys.each
  two_way_aai.map do |genome_pair, pidents|
    [genome_pair, pidents.reduce(:+) / pidents.length.to_f]
  end.to_h
end

Module: Aai

Defined Under Namespace

Constant Summary collapse

Instance Method Summary collapse

Methods included from Utils

Instance Method Details

#aai_strings(one_way_aai, two_way_aai) ⇒ Object

#blast_permutations!(fastas, blast_dbs, cpus = 4) ⇒ Object

#get_best_hits(fnames, seq_lengths) ⇒ Object

#make_blastdbs!(fnames, cpus = 4) ⇒ Array<String>

#one_way_aai(best_hits) ⇒ Object

#process_input_seqs!(fnames) ⇒ Object