Class: Bio::FinishM::InputGenome
- Inherits:
-
Object
- Object
- Bio::FinishM::InputGenome
- Includes:
- Logging
- Defined in:
- lib/assembly/input_genome.rb
Instance Attribute Summary collapse
-
#filename ⇒ Object
Returns the value of attribute filename.
-
#numbered_probes ⇒ Object
Returns the value of attribute numbered_probes.
-
#scaffolds ⇒ Object
Returns the value of attribute scaffolds.
Class Method Summary collapse
-
.parse_genome_fasta_files(fasta_files, hangover_length, options = {}) ⇒ Object
Return an array of parsed fasta files.
Instance Method Summary collapse
- #each_gap_probe_pair(scaffold_index) ⇒ Object
- #each_numbered_probe ⇒ Object
- #each_scaffold_end_numbered_probe ⇒ Object
- #first_probe(scaffold_index) ⇒ Object
-
#gap_length(scaffold_index, gap_index) ⇒ Object
The length of the gap between contig i and contig i+1 from the specified scaffold (both numbers are 0-based indices).
- #generate_numbered_probes(overhang, starting_probe_number) ⇒ Object
- #human_scaffold_end_name(probe_index) ⇒ Object
-
#initialize(genome_fasta, hangover_length, options = {}) ⇒ InputGenome
constructor
Given a fasta file, set up a genome for wandering or gapfilling.
- #last_probe(scaffold_index) ⇒ Object
- #number_of_probes ⇒ Object
-
#probe_at_start_of_scaffold?(probe_index) ⇒ Boolean
Return true if probe number given is the probe at the beginning of the scaffold or false if it is at the end.
- #probe_sequences ⇒ Object
- #remove_overly_short_contigs!(scaffolds, hangover_length) ⇒ Object
Methods included from Logging
Constructor Details
#initialize(genome_fasta, hangover_length, options = {}) ⇒ InputGenome
Given a fasta file, set up a genome for wandering or gapfilling.
Options: :starting_probe_number: number probes starting from this number (default 1)
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/assembly/input_genome.rb', line 24
#
# Given a fasta file, set up a genome for wandering or gapfilling.
#
# genome_fasta:: the fasta file; stored as @filename and passed to
#                Bio::FinishM::ScaffoldBreaker#break_scaffolds
# hangover_length:: probe length; contigs shorter than twice this length are
#                   removed before probes are generated
# options:: hash of optional settings
#           :starting_probe_number: number probes starting from this number
#           (default 1)
def initialize(genome_fasta, hangover_length, options = {})
  starting_probe_number = options[:starting_probe_number] || 1

  @filename = genome_fasta
  scaffolds = Bio::FinishM::ScaffoldBreaker.new.break_scaffolds(genome_fasta)
  @scaffolds = remove_overly_short_contigs!(scaffolds, hangover_length)

  # Remove scaffolds that have no contigs left once the short ones are gone
  num_scaffolds_before = @scaffolds.length
  @scaffolds.reject! { |scaff| scaff.contigs.empty? }
  num_too_short_scaffolds = num_scaffolds_before - @scaffolds.length
  if num_too_short_scaffolds > 0
    log.warn "Removed #{num_too_short_scaffolds} scaffolds entirely as they were too short (or made up of all short contigs)"
  end

  generate_numbered_probes(hangover_length, starting_probe_number)
end
Instance Attribute Details
#filename ⇒ Object
Returns the value of attribute filename.
2 3 4 |
# File 'lib/assembly/input_genome.rb', line 2
#
# Reader for @filename (the constructor stores its genome_fasta argument here).
def filename
  @filename
end
#numbered_probes ⇒ Object
Returns the value of attribute numbered_probes.
2 3 4 |
# File 'lib/assembly/input_genome.rb', line 2
#
# Reader for @numbered_probes. As built by generate_numbered_probes this is a
# nested array indexed by scaffold index and then contig index, each entry
# being a [start_probe, end_probe] pair.
def numbered_probes
  @numbered_probes
end
#scaffolds ⇒ Object
Returns the value of attribute scaffolds.
2 3 4 |
# File 'lib/assembly/input_genome.rb', line 2
#
# Reader for @scaffolds (the scaffolds kept by the constructor after overly
# short contigs and emptied scaffolds have been removed).
def scaffolds
  @scaffolds
end
Class Method Details
.parse_genome_fasta_files(fasta_files, hangover_length, options = {}) ⇒ Object
Return an array of parsed fasta files
6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/assembly/input_genome.rb', line 6
#
# Return an array of parsed fasta files, one InputGenome per entry of
# fasta_files and in the same order.
#
# Probe numbering is continuous across genomes: the first genome starts at 1
# and each later genome starts after the previous genome's number_of_probes.
#
# fasta_files:: collection of fasta files, each passed to InputGenome.new
# hangover_length:: passed through to InputGenome.new unchanged
# options:: accepted for interface compatibility; not read by this method
def self.parse_genome_fasta_files(fasta_files, hangover_length, options = {})
  current_probe_number = 1
  fasta_files.map do |genome_fasta|
    genome = Bio::FinishM::InputGenome.new(
      genome_fasta,
      hangover_length,
      :starting_probe_number => current_probe_number
    )
    current_probe_number += genome.number_of_probes
    genome
  end
end
Instance Method Details
#each_gap_probe_pair(scaffold_index) ⇒ Object
113 114 115 116 117 118 119 120 121 122 123 |
# File 'lib/assembly/input_genome.rb', line 113
#
# For the given scaffold, yield the two probes that flank each gap: the end
# probe of one contig followed by the start probe of the next contig that has
# probes. nil entries (contigs without probes) are skipped over.
#
# Returns the scaffold's array of probe pairs.
def each_gap_probe_pair(scaffold_index)
  probe_pairs = @numbered_probes[scaffold_index]
  probe_pairs.compact.each_cons(2) do |upstream, downstream|
    yield upstream[1], downstream[0]
  end
  probe_pairs
end
#each_numbered_probe ⇒ Object
107 108 109 110 111 |
# File 'lib/assembly/input_genome.rb', line 107
#
# Yield every entry of the flattened @numbered_probes structure in order.
# Entries may be nil where a contig had no probes generated.
def each_numbered_probe
  all_probes = @numbered_probes.flatten
  all_probes.each { |numbered_probe| yield numbered_probe }
end
#each_scaffold_end_numbered_probe ⇒ Object
125 126 127 128 129 130 131 |
# File 'lib/assembly/input_genome.rb', line 125
#
# For each scaffold in turn, yield its first probe (start probe of the first
# contig) and then its last probe (end probe of the last contig).
def each_scaffold_end_numbered_probe
  @numbered_probes.each do |probe_pairs|
    leading_pair = probe_pairs.first
    trailing_pair = probe_pairs.last
    yield leading_pair[0]
    yield trailing_pair[1]
  end
end
#first_probe(scaffold_index) ⇒ Object
133 134 135 |
# File 'lib/assembly/input_genome.rb', line 133
#
# The start probe of the first contig of the given scaffold.
def first_probe(scaffold_index)
  leading_pair = @numbered_probes[scaffold_index].first
  leading_pair[0]
end
#gap_length(scaffold_index, gap_index) ⇒ Object
The length of the gap between contig i and contig i+1 from the specified scaffold (both numbers are 0-based indices)
166 167 168 |
# File 'lib/assembly/input_genome.rb', line 166
#
# The length of the gap between contig i and contig i+1 of the specified
# scaffold, where i is gap_index (both arguments are 0-based indices).
def gap_length(scaffold_index, gap_index)
  scaffold = @scaffolds[scaffold_index]
  gap = scaffold.gaps[gap_index]
  gap.length
end
#generate_numbered_probes(overhang, starting_probe_number) ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
# File 'lib/assembly/input_genome.rb', line 61
#
# Build the probes for every contig of every scaffold in @scaffolds.
#
# Each contig of at least 2*overhang bases gets two consecutively numbered
# probes: a :start probe (reverse complement of the first `overhang` bases)
# and an :end probe (the last `overhang` bases as they are). Shorter contigs
# are skipped with a warning and leave no entry at their contig index.
#
# Populates:
# * @numbered_probes[scaffold_index][contig_index] = [start_probe, end_probe]
# * @probe_number_to_scaffold_and_contig_and_side[number] = [scaffold, contig, side]
#
# overhang:: number of bases taken from each contig end
# starting_probe_number:: number given to the first probe generated
def generate_numbered_probes(overhang, starting_probe_number)
  @numbered_probes = []
  @probe_number_to_scaffold_and_contig_and_side = {}
  current_probe_number = starting_probe_number
  overly_short_sequence_count = 0

  @scaffolds.each_with_index do |scaffold, scaffold_index|
    scaffold.contigs.each_with_index do |contig, contig_index|
      sequence = contig.sequence

      if sequence.length < 2 * overhang
        log.warn "Not attempting to make connections from overly short contig: it is contig number #{contig_index + 1} in scaffold `#{scaffold.name}' from the genome in `#{@filename}'"
        overly_short_sequence_count += 1
        next
      end

      # The start probe is reverse complemented; the end probe is not.
      start_probe = NumberedProbe.new
      start_probe.contig = contig
      start_probe.number = current_probe_number
      start_probe.side = :start
      leading_bases = Bio::Sequence::NA.new(sequence[0...overhang])
      start_probe.sequence = leading_bases.reverse_complement.to_s

      end_probe = NumberedProbe.new
      end_probe.contig = contig
      end_probe.number = current_probe_number + 1
      end_probe.side = :end
      end_probe.sequence = sequence[(sequence.length - overhang)...sequence.length].to_s

      current_probe_number += 2

      @numbered_probes[scaffold_index] ||= []
      @numbered_probes[scaffold_index][contig_index] = [start_probe, end_probe]
      @probe_number_to_scaffold_and_contig_and_side[start_probe.number] = [scaffold, contig, :start]
      @probe_number_to_scaffold_and_contig_and_side[end_probe.number] = [scaffold, contig, :end]
    end
  end

  log.debug "Generated #{current_probe_number - starting_probe_number} probes for #{@filename}" if log.debug?
  if overly_short_sequence_count > 0
    log.warn "Skipping #{overly_short_sequence_count} overly short contigs" if log.warn?
  end
end
#human_scaffold_end_name(probe_index) ⇒ Object
170 171 172 173 |
# File 'lib/assembly/input_genome.rb', line 170
#
# Human readable description of the contig end that the given probe number
# belongs to, e.g. "scaffold 'name' contig '2' start".
#
# probe_index:: a probe number recorded by generate_numbered_probes
#
# NOTE(review): this previously called genome.contig_number(contig), but no
# `genome` is defined in the visible class, so that call could not resolve.
# The contig's 1-based position within its scaffold is reported instead -
# confirm this matches the numbering that was intended.
def human_scaffold_end_name(probe_index)
  scaffold, contig, side = @probe_number_to_scaffold_and_contig_and_side[probe_index]
  # Match on identity so that two contigs with equal content are told apart.
  contig_number = scaffold.contigs.index { |candidate| candidate.equal?(contig) } + 1
  "scaffold '#{scaffold.name}' contig '#{contig_number}' #{side}"
end
#last_probe(scaffold_index) ⇒ Object
137 138 139 |
# File 'lib/assembly/input_genome.rb', line 137
#
# The end probe of the last contig of the given scaffold.
def last_probe(scaffold_index)
  trailing_pair = @numbered_probes[scaffold_index].last
  trailing_pair[1]
end
#number_of_probes ⇒ Object
103 104 105 |
# File 'lib/assembly/input_genome.rb', line 103
#
# The number of probes held in @numbered_probes.
#
# nil placeholders (left where a contig had no probes generated) are not
# probes and are excluded from the count.
def number_of_probes
  @numbered_probes.flatten.compact.length
end
#probe_at_start_of_scaffold?(probe_index) ⇒ Boolean
Return true if probe number given is the probe at the beginning of the scaffold or false if it is at the end. Raises if the probe number is unknown.
143 144 145 146 147 148 149 150 151 152 |
# File 'lib/assembly/input_genome.rb', line 143
#
# Return true if the side recorded for the given probe number is :start, or
# false if it is :end.
#
# probe_index:: a probe number recorded by generate_numbered_probes
#
# Raises RuntimeError if no side is recorded for the probe number.
def probe_at_start_of_scaffold?(probe_index)
  _scaffold, _contig, side = @probe_number_to_scaffold_and_contig_and_side[probe_index]
  case side
  when :start then true
  when :end then false
  else
    raise "Unknown probe number #{probe_index.inspect}: no :start or :end side is recorded for it"
  end
end
#probe_sequences ⇒ Object
154 155 156 157 158 159 160 161 162 |
# File 'lib/assembly/input_genome.rb', line 154
#
# The sequence of every probe, in flattened @numbered_probes order, as a new
# array. nil placeholders are skipped.
def probe_sequences
  @numbered_probes.flatten.compact.map { |probe| probe.sequence }
end
#remove_overly_short_contigs!(scaffolds, hangover_length) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/assembly/input_genome.rb', line 46
#
# Delete, in place, every contig shorter than 2*hangover_length from each of
# the given scaffolds, logging a warning with the total if any were removed.
#
# Returns the same scaffolds collection that was passed in.
def remove_overly_short_contigs!(scaffolds, hangover_length)
  removed_total = scaffolds.inject(0) do |tally, scaffold|
    count_before = scaffold.contigs.length
    scaffold.contigs.delete_if { |contig| contig.length < 2*hangover_length }
    tally + (count_before - scaffold.contigs.length)
  end

  unless removed_total <= 0
    log.warn "Removed #{removed_total} contigs from within scaffolds as they were too short"
  end

  scaffolds
end