Class: Bio::FinishM::ScaffoldBreaker

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/assembly/scaffold_breaker.rb

Defined Under Namespace

Classes: Gap, Scaffold, UnscaffoldedContig

Instance Method Summary collapse

Methods included from Logging

#log

Instance Method Details

#break_scaffolds(contigs_filename) ⇒ Object

Given a path to a scaffold fasta file, read in the scaffolds, and break them apart into constituent contigs. Then return an array of Scaffold objects containing the contig information therein.



96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/assembly/scaffold_breaker.rb', line 96

# Given a path to a scaffold fasta file, read in the scaffolds, and break them
# apart into constituent contigs, i.e. the maximal runs of non-N characters.
#
# Characters other than A, T, G, C or N (case-insensitive) are replaced in
# place with 'N', and a warning is logged, before the scaffold is split.
#
# @param contigs_filename [String] path to the scaffold fasta file
# @return [Array<Scaffold>] one Scaffold per fasta entry, each holding its
#   UnscaffoldedContig objects with 1-based, inclusive scaffold coordinates
# @raise [RuntimeError] if an entry has an empty sequence, or consists
#   entirely of N characters once unexpected characters have been replaced
def break_scaffolds(contigs_filename)
  scaffolds = []
  Bio::FlatFile.foreach(Bio::FastaFormat, contigs_filename) do |seq|
    scaffold = Scaffold.new
    scaffold.name = seq.definition
    scaffold.contigs ||= []
    sequence = seq.seq

    # An empty sequence has no characters to report or split, so fail with a
    # clear message instead of tripping over a nil match further down.
    if sequence.empty?
      raise "Found a scaffold with an empty sequence (perhaps your input is mangled?): #{scaffold.name}"
    end

    # The sequence is non-empty here, so it is valid exactly when it holds no
    # character outside ATGCN; the first offender doubles as the example.
    unexpected = sequence.match(/[^ATGCN]/i)
    if unexpected
      log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{unexpected[0]}. Replacing them with Ns"
      sequence.gsub!(/[^ATGCN]/i, 'N')
    end

    # \A and \z anchor the whole string (^ and $ would only anchor one line).
    # NOTE: despite the "ignoring this" wording, this aborts the whole run.
    if sequence =~ /\AN+\z/i
      raise "Found a scaffold that contains all N characters, ignoring this (perhaps your input is mangled?): #{scaffold.name}"
    end

    # Each maximal run of non-N characters is one contig
    sequence.scan(/[^N]+/i) do
      run = Regexp.last_match
      contig = UnscaffoldedContig.new
      contig.scaffold = scaffold
      contig.scaffold_position_start = run.begin(0) + 1 # Convert to 1-based indices in line with bioruby
      contig.scaffold_position_end = run.end(0)
      contig.sequence = run.to_s
      scaffold.contigs.push contig
    end
    scaffolds.push scaffold
  end

  # Seed the fold with 0 so that an input without any scaffolds reports 0
  # contigs rather than nil.
  contig_count = scaffolds.map { |s| s.contigs.length }.reduce(0, :+)
  log.info "Detected #{scaffolds.length} scaffolds, containing #{contig_count} different contigs"
  scaffolds
end