Class: Bio::FinishM::InputGenome

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/assembly/input_genome.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Logging

#log

Constructor Details

#initialize(genome_fasta, hangover_length, options = {}) ⇒ InputGenome

Given a FASTA file, set up a genome for wandering or gapfilling.

Options: :starting_probe_number: number probes starting from this number (default 1)



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/assembly/input_genome.rb', line 24

# Given a FASTA file, set up a genome for wandering or gapfilling.
#
# The file is broken into scaffolds with Bio::FinishM::ScaffoldBreaker, contigs
# that are too short are removed by remove_overly_short_contigs!, scaffolds
# left without any contigs are discarded (with a warning), and finally the
# numbered probes are generated for whatever remains.
#
# genome_fasta::    the FASTA file to read; remembered as @filename
# hangover_length:: passed on to remove_overly_short_contigs! and
#                   generate_numbered_probes
# options::         :starting_probe_number - number probes starting from this
#                   number (default 1)
def initialize(genome_fasta, hangover_length, options = {})
  first_probe_number = options[:starting_probe_number] || 1

  @filename = genome_fasta
  broken_scaffolds = Bio::FinishM::ScaffoldBreaker.new.break_scaffolds(genome_fasta)
  @scaffolds = remove_overly_short_contigs!(broken_scaffolds, hangover_length)

  # Remove scaffolds that have no contigs left once the short ones are gone.
  scaffold_count_before = @scaffolds.length
  @scaffolds.reject! { |scaff| scaff.contigs.empty? }
  num_dropped_scaffolds = scaffold_count_before - @scaffolds.length
  if num_dropped_scaffolds > 0
    log.warn "Removed #{num_dropped_scaffolds} scaffolds entirely as they were too short (or made up of all short contigs)"
  end

  generate_numbered_probes(hangover_length, first_probe_number)
end

Instance Attribute Details

#filename ⇒ Object

Returns the value of attribute filename.



2
3
4
# File 'lib/assembly/input_genome.rb', line 2

# Reader for the filename attribute: returns the current value of @filename.
# InputGenome#initialize assigns its genome_fasta argument to @filename.
def filename
  @filename
end

#numbered_probes ⇒ Object

Returns the value of attribute numbered_probes.



2
3
4
# File 'lib/assembly/input_genome.rb', line 2

# Reader for the numbered_probes attribute: returns the current value of
# @numbered_probes. generate_numbered_probes fills it as a nested array
# indexed by scaffold index, then contig index, holding [start_probe, end_probe]
# pairs; entries for skipped contigs are left nil.
def numbered_probes
  @numbered_probes
end

#scaffolds ⇒ Object

Returns the value of attribute scaffolds.



2
3
4
# File 'lib/assembly/input_genome.rb', line 2

# Reader for the scaffolds attribute: returns the current value of @scaffolds,
# i.e. the scaffolds kept by InputGenome#initialize after short contigs and
# empty scaffolds were removed.
def scaffolds
  @scaffolds
end

Class Method Details

.parse_genome_fasta_files(fasta_files, hangover_length, options = {}) ⇒ Object

Return an array of InputGenome objects, one per FASTA file given, in the same order as the input files.



6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/assembly/input_genome.rb', line 6

# Parse each FASTA file into a Bio::FinishM::InputGenome and return them as an
# array, in input order.
#
# Probe numbers are made unique across genomes: the first genome starts at
# probe 1 and each later genome starts where the previous one finished
# (previous start + previous genome's number_of_probes).
#
# fasta_files::     collection of FASTA files, each handed to InputGenome.new
# hangover_length:: passed unchanged to every InputGenome.new call
# options::         accepted for interface compatibility; not read by this method
def self.parse_genome_fasta_files(fasta_files, hangover_length, options = {})
  next_probe_number = 1
  fasta_files.map do |fasta_path|
    parsed = Bio::FinishM::InputGenome.new(
      fasta_path, hangover_length, :starting_probe_number => next_probe_number
    )
    next_probe_number += parsed.number_of_probes
    parsed
  end
end

Instance Method Details

#each_gap_probe_pair(scaffold_index) ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
# File 'lib/assembly/input_genome.rb', line 113

# Yield the two probes that flank each gap of one scaffold.
#
# For every pair of consecutive contigs that have probes (nil entries, left
# for skipped contigs, are ignored), yields the :end probe of the earlier
# contig followed by the :start probe of the later contig.
#
# scaffold_index:: 0-based index into @numbered_probes
#
# Returns the scaffold's probe-pair array when a block is given, or an
# Enumerator over the [end_probe, start_probe] pairs when called without one.
def each_gap_probe_pair(scaffold_index)
  return enum_for(:each_gap_probe_pair, scaffold_index) unless block_given?

  probe_pairs = @numbered_probes[scaffold_index]
  # compact drops contigs without probes so that gaps are bridged between the
  # nearest contigs that do have them.
  probe_pairs.compact.each_cons(2) do |upstream_pair, downstream_pair|
    yield upstream_pair[1], downstream_pair[0]
  end
  probe_pairs
end

#each_numbered_probe ⇒ Object



107
108
109
110
111
# File 'lib/assembly/input_genome.rb', line 107

# Yield every entry of @numbered_probes in order, flattened across scaffolds
# and contigs. Entries that are nil (contigs for which no probes were made)
# are yielded too, so callers must be prepared to skip them.
#
# Returns an Enumerator when called without a block.
def each_numbered_probe
  return enum_for(:each_numbered_probe) unless block_given?

  @numbered_probes.flatten.each { |numbered_probe| yield numbered_probe }
end

#each_scaffold_end_numbered_probe ⇒ Object



125
126
127
128
129
130
131
# File 'lib/assembly/input_genome.rb', line 125

# For every scaffold in @numbered_probes, yield its two outermost probes: the
# :start probe of the first contig, then the :end probe of the last contig.
#
# Returns an Enumerator when called without a block.
def each_scaffold_end_numbered_probe
  return enum_for(:each_scaffold_end_numbered_probe) unless block_given?

  @numbered_probes.each do |contig_probe_pairs|
    yield contig_probe_pairs.first[0] # start probe of the first contig
    yield contig_probe_pairs.last[1]  # end probe of the last contig
  end
end

#first_probe(scaffold_index) ⇒ Object



133
134
135
# File 'lib/assembly/input_genome.rb', line 133

# Return the :start probe of the first contig of the given scaffold.
#
# scaffold_index:: 0-based index into @numbered_probes
def first_probe(scaffold_index)
  probe_pairs = @numbered_probes[scaffold_index]
  leading_pair = probe_pairs.first
  leading_pair.first
end

#gap_length(scaffold_index, gap_index) ⇒ Object

The length of the gap between contig i and contig i+1 from the specified scaffold (both numbers are 0-based indices)



166
167
168
# File 'lib/assembly/input_genome.rb', line 166

# The length of the gap between contig i and contig i+1 of the specified
# scaffold (both arguments are 0-based indices).
#
# scaffold_index:: index into @scaffolds
# gap_index::      index into that scaffold's gaps
def gap_length(scaffold_index, gap_index)
  scaffold = @scaffolds[scaffold_index]
  gap = scaffold.gaps[gap_index]
  gap.length
end

#generate_numbered_probes(overhang, starting_probe_number) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/assembly/input_genome.rb', line 61

# Build the numbered probes for every contig of every scaffold in @scaffolds.
#
# Each contig whose sequence is at least 2*overhang long receives two
# consecutively numbered NumberedProbe objects:
# * a :start probe whose sequence is the reverse complement of the first
#   +overhang+ bases of the contig, and
# * an :end probe whose sequence is the last +overhang+ bases as they are.
#
# Results are stored in
# * @numbered_probes[scaffold_index][contig_index] = [start_probe, end_probe]
#   (the slot of a skipped contig is left unset), and
# * @probe_number_to_scaffold_and_contig_and_side[number] = [scaffold, contig, side].
#
# overhang::              probe length in bases
# starting_probe_number:: number given to the first probe generated
def generate_numbered_probes(overhang, starting_probe_number)
  @numbered_probes = []
  @probe_number_to_scaffold_and_contig_and_side = {}

  next_number = starting_probe_number
  skipped_contigs = 0
  @scaffolds.each_with_index do |scaffold, scaffold_index|
    scaffold.contigs.each_with_index do |contig, contig_index|
      # A contig needs room for two non-overlapping probes.
      if contig.sequence.length < 2*overhang
        log.warn "Not attempting to make connections from overly short contig: it is the #{contig_index+1}th contig in scaffold `#{scaffold.name}' from the genome in `#{@filename}')"
        skipped_contigs += 1
        next
      end

      bases = contig.sequence

      # Probe at the contig start, reverse complemented.
      start_probe = NumberedProbe.new
      start_probe.contig = contig
      start_probe.number = next_number
      next_number += 1
      start_probe.side = :start
      leading_bases = Bio::Sequence::NA.new(bases[0...overhang])
      start_probe.sequence = leading_bases.reverse_complement.to_s

      # Probe at the contig end, in the contig's own orientation.
      end_probe = NumberedProbe.new
      end_probe.contig = contig
      end_probe.number = next_number
      next_number += 1
      end_probe.side = :end
      end_probe.sequence = bases[(bases.length-overhang)...bases.length].to_s

      scaffold_probes = (@numbered_probes[scaffold_index] ||= [])
      scaffold_probes[contig_index] = [start_probe, end_probe]

      @probe_number_to_scaffold_and_contig_and_side[start_probe.number] = [scaffold, contig, :start]
      @probe_number_to_scaffold_and_contig_and_side[end_probe.number] = [scaffold, contig, :end]
    end
  end

  log.debug "Generated #{next_number-starting_probe_number} probes for #{@filename}" if log.debug?
  log.warn "Skipping #{skipped_contigs} overly short contigs" if skipped_contigs > 0 && log.warn?
end

#human_scaffold_end_name(probe_index) ⇒ Object



170
171
172
173
# File 'lib/assembly/input_genome.rb', line 170

# Human-readable description of where a probe sits, e.g.
# "scaffold 'scaffold1' contig '2' start".
#
# probe_index:: probe number, as used for the keys of
#               @probe_number_to_scaffold_and_contig_and_side
#
# The contig number reported is the contig's 1-based position within
# scaffold.contigs, the same numbering generate_numbered_probes uses in its
# log messages.
# NOTE(review): this previously called genome.contig_number(contig), but no
# `genome` is defined in this class, so the call could only raise NameError.
# Confirm that the position within the scaffold is the numbering intended.
def human_scaffold_end_name(probe_index)
  scaffold, contig, side = @probe_number_to_scaffold_and_contig_and_side[probe_index]
  # Match by identity so that two contigs with equal content are not confused.
  contig_number = scaffold.contigs.index { |candidate| candidate.equal?(contig) } + 1
  "scaffold '#{scaffold.name}' contig '#{contig_number}' #{side}"
end

#last_probe(scaffold_index) ⇒ Object



137
138
139
# File 'lib/assembly/input_genome.rb', line 137

# Return the :end probe of the last contig of the given scaffold.
#
# scaffold_index:: 0-based index into @numbered_probes
def last_probe(scaffold_index)
  probe_pairs = @numbered_probes[scaffold_index]
  trailing_pair = probe_pairs.last
  trailing_pair[1]
end

#number_of_probes ⇒ Object



103
104
105
# File 'lib/assembly/input_genome.rb', line 103

# The number of probes held in @numbered_probes.
#
# nil placeholders (slots of contigs for which no probes were generated) are
# not counted, so the result equals the number of probe numbers actually
# handed out by generate_numbered_probes.
def number_of_probes
  @numbered_probes.flatten.compact.length
end

#probe_at_start_of_scaffold?(probe_index) ⇒ Boolean

Return true if the given probe number is a probe at the beginning of the scaffold, or false if it is at the end. Raises if the probe number is unknown.

Returns:

  • (Boolean)


143
144
145
146
147
148
149
150
151
152
# File 'lib/assembly/input_genome.rb', line 143

# Report which side a probe was generated on.
#
# Looks the probe number up in @probe_number_to_scaffold_and_contig_and_side
# and returns true when the recorded side is :start, false when it is :end.
#
# probe_index:: probe number to look up
#
# Raises RuntimeError when the probe number is not known (or has no
# recognised side recorded).
def probe_at_start_of_scaffold?(probe_index)
  _scaffold, _contig, side = @probe_number_to_scaffold_and_contig_and_side[probe_index]
  case side
  when :start then true
  when :end then false
  else
    raise "Unknown probe number #{probe_index.inspect}: unable to tell whether it is a start or an end probe"
  end
end

#probe_sequences ⇒ Object



154
155
156
157
158
159
160
161
162
# File 'lib/assembly/input_genome.rb', line 154

# Return an array with the sequence of every probe, in the order
# each_numbered_probe yields them. nil entries (contigs without probes) are
# skipped.
def probe_sequences
  collected = []
  each_numbered_probe do |numbered_probe|
    collected << numbered_probe.sequence unless numbered_probe.nil?
  end
  collected
end

#remove_overly_short_contigs!(scaffolds, hangover_length) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/assembly/input_genome.rb', line 46

# Delete, in place, every contig shorter than 2*hangover_length from each
# scaffold's contig list. A single warning giving the total is logged when
# anything was removed.
#
# scaffolds::       scaffolds whose contigs lists are pruned
# hangover_length:: contigs need a length of at least twice this to be kept
#
# Returns the same scaffolds collection that was passed in. Scaffolds that end
# up with no contigs are left in place for the caller to deal with.
def remove_overly_short_contigs!(scaffolds, hangover_length)
  total_removed = scaffolds.inject(0) do |running_total, scaffold|
    contigs = scaffold.contigs
    count_before = contigs.length
    contigs.reject! { |contig| contig.length < 2*hangover_length }
    running_total + (count_before - contigs.length)
  end

  if total_removed > 0
    log.warn "Removed #{total_removed} contigs from within scaffolds as they were too short"
  end
  scaffolds
end