Class: Bio::FinishM::GapFiller
- Inherits:
-
Object
- Object
- Bio::FinishM::GapFiller
- Includes:
- Logging
- Defined in:
- lib/finishm/gapfiller.rb
Instance Method Summary collapse
- #add_options(optparse_object, options) ⇒ Object
-
#gapfill(finishm_graph, probe_index1, probe_index2, options) ⇒ Object
Given a finishm graph, gapfill from the first probe to the second.
- #run(options, argv) ⇒ Object
- #validate_options(options, argv) ⇒ Object
Methods included from Logging
Instance Method Details
#add_options(optparse_object, options) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/finishm/gapfiller.rb', line 6 def (optparse_object, ) optparse_object. = "\nUsage: finishm gapfill --contigs <contigs_file> --fastq-gz <reads..> --output-fasta <output.fa> Takes a set of reads and a contig that contains gap characters. Then it tries to fill in these N characters. It is possible that there is multiple ways to close the gap - in that case each can be reported. example: finishm gapfill --contigs to_gapfill.fasta --fastq-gz reads.1.fq.gz,reads.2.fq.gz --output-fasta output.fasta \n" .merge!({ :contig_end_length => 200, :graph_search_leash_length => 20000, }) optparse_object.separator "\nRequired arguments:\n\n" optparse_object.on("--contigs FILE", "fasta file of single contig containing Ns that are to be closed [required]") do |arg| [:contigs_file] = arg end optparse_object.on("--output-fasta PATH", "Output the gap-filled sequence to this file [required]") do |arg| [:overall_fasta_file] = arg end optparse_object.separator "\nThere must be some definition of of how to do the assembly, or else a path to a previous assembly directory:\n\n" Bio::FinishM::ReadInput.new.(optparse_object, ) Bio::FinishM::GraphGenerator.new. optparse_object, optparse_object.separator "\nGraph search options:\n\n" optparse_object.on("--overhang NUM", Integer, "Start assembling this many base pairs back from the gap [default: #{options[:contig_end_length] }]") do |arg| [:contig_end_length] = arg end optparse_object.on("--leash-length NUM", Integer, "Don't explore too far in the graph, only this many base pairs and not (much) more [default: #{options[:graph_search_leash_length] }]") do |arg| [:graph_search_leash_length] = arg end optparse_object.on("--recoherence-kmer NUM", Integer, "Use a kmer longer than the original velvet one, to help remove bubbles and circular paths [default: none]") do |arg| [:recoherence_kmer] = arg end optparse_object.separator "\nVisualisation options (of all joins):\n\n" optparse_object.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg| [:output_graph_png] = arg end optparse_object.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg| [:output_graph_svg] = arg end optparse_object.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg| [:output_graph_dot] = arg end end |
#gapfill(finishm_graph, probe_index1, probe_index2, options) ⇒ Object
Given a finishm graph, gapfill from the first probe to the second. Return a Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection object
282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 |
# File 'lib/finishm/gapfiller.rb', line 282 def gapfill(finishm_graph, probe_index1, probe_index2, ) start_onode = finishm_graph.velvet_oriented_node(probe_index1) end_onode_inward = finishm_graph.velvet_oriented_node(probe_index2) unless start_onode and end_onode_inward raise "Unable to retrieve both probes from the graph for gap #{gap_number} (#{gap.coords}), fail" end # The probe from finishm_graph points in the wrong direction for path finding end_onode = Bio::Velvet::Graph::OrientedNodeTrail::OrientedNode.new end_onode.node = end_onode_inward.node end_onode.first_side = end_onode_inward.starts_at_start? ? Bio::Velvet::Graph::OrientedNodeTrail::END_IS_FIRST : Bio::Velvet::Graph::OrientedNodeTrail::START_IS_FIRST adjusted_leash_length = finishm_graph.adjusted_leash_length(probe_index1, [:graph_search_leash_length]) log.debug "Using adjusted leash length #{adjusted_leash_length }" if log.debug? cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new trails = cartographer.find_trails_between_nodes( finishm_graph.graph, start_onode, end_onode, adjusted_leash_length, { :recoherence_kmer => [:recoherence_kmer], :sequences => finishm_graph.velvet_sequences, :max_explore_nodes => [:max_explore_nodes], :max_gapfill_paths => [:max_gapfill_paths], } ) if trails.circular_paths_detected log.warn "Circular path detected here, not attempting to gapfill" end # Convert the trails into OrientedNodePaths trails = trails.collect do |trail| path = Bio::Velvet::Graph::OrientedNodeTrail.new path.trail = trail path end acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new acon.start_probe_noded_read = finishm_graph.probe_node_reads[probe_index1] acon.end_probe_noded_read = finishm_graph.probe_node_reads[probe_index2] acon.start_probe_contig_offset = [:contig_end_length] acon.end_probe_contig_offset = [:contig_end_length] acon.paths = trails return acon end |
#run(options, argv) ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 |
# File 'lib/finishm/gapfiller.rb', line 76 def run(, argv) # Read in all the contigs sequences and work out where the gaps are genome = Bio::FinishM::InputGenome.new( [:contigs_file], [:contig_end_length], ) scaffolds = Bio::FinishM::ScaffoldBreaker.new.break_scaffolds([:contigs_file]) gaps = [] output_fasta_file = File.open([:overall_fasta_file],'w') num_without_gaps = 0 scaffolds.each do |scaffold| sgaps = scaffold.gaps if sgaps.empty? num_without_gaps += 1 output_fasta_file.puts ">#{scaffold.name }" output_fasta_file.puts scaffold.sequence else gaps.push scaffold.gaps end end gaps.flatten! log.info "Detected #{gaps.length} gap(s) from #{scaffolds.length} different contig(s). #{num_without_gaps } contig(s) were gap-free." # Create probe sequences probe_sequences = [] gaps.each do |gap| sequence = gap.scaffold.sequence if gap.start < [:contig_end_length] or gap.stop > sequence.length - [:contig_end_length] log.warn "Found a gap that was too close to the end of a contig, skipping it: #{gap.coords}" next end log.debug "Processing gap number #{gap.number}, #{gap.coords}" first_coords = [ gap.start-[:contig_end_length]-1, gap.start-1, ] second_coords = [ gap.stop, (gap.stop+[:contig_end_length]), ] log.debug "Coordinates of the probes are #{first_coords} and #{second_coords}" second = sequence[second_coords[0]..second_coords[1]] probes = [ sequence[first_coords[0]...first_coords[1]], Bio::Sequence::NA.new(second).reverse_complement.to_s, ] #TODO: this could probably be handled better.. e.g. if the amount of sequence is too small, just throw it out and make one big gap if probes[0].match(/N/i) or probes[1].match(/N/i) log.warn "Noticed gap that was too close together, skipping: #{gap.coords}" next end probe_sequences.push probes[0] probe_sequences.push probes[1] end log.debug "Generated #{probe_sequences.length} probes e.g. #{probe_sequences[0] }" # Generate the graph with the probe sequences in it. read_input = Bio::FinishM::ReadInput.new read_input. # Own the tmpdir, if one is to be used - need to re-read the LastGraph later on see.. assembly_directory = [:output_assembly_path] assembly_directory ||= [:previous_assembly] using_tmp_assembly_directory = false if assembly_directory.nil? using_tmp_assembly_directory = true assembly_directory = Dir.mktmpdir [:output_assembly_path] = assembly_directory end # Do the actual graph building and/or initial reading [:parse_sequences] = true finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph(probe_sequences, read_input, ) # Output optional graphics. if [:output_graph_png] or [:output_graph_svg] or [:output_graph_dot] viser = Bio::Assembly::ABVisualiser.new # TODO: make these visualise more than one join somehow gv = viser.graphviz(finishm_graph.graph, { :start_node_id => finishm_graph.probe_nodes[0].node_id, :end_node_id => finishm_graph.probe_nodes[1].node_id}) if [:output_graph_png] log.info "Converting assembly to a graphviz PNG" gv.output :png => [:output_graph_png], :use => :neato end if [:output_graph_svg] log.info "Converting assembly to a graphviz SVG" gv.output :svg => [:output_graph_svg], :use => :neato end if [:output_graph_dot] log.info "Converting assembly to a graphviz DOT" gv.output :dot => [:output_graph_dot] end end # Clean up the tmdir, if one was used. if using_tmp_assembly_directory log.debug "Removing tmpdir that held the assembly `#{assembly_directory}'.." FileUtils.remove_entry assembly_directory end # Do the gap-filling and print out the results printer = Bio::AssemblyGraphAlgorithms::ContigPrinter.new num_total_trails = 0 num_singly_filled = 0 num_unbridgable = 0 output_trails_file = nil output_trails_file = File.open([:overall_trail_output_fasta_file],'w') unless [:overall_trail_output_fasta_file].nil? # Print the fasta output for the scaffold print_scaffold = lambda do |last_scaffold, gapfilled_sequence| output_fasta_file.puts ">#{last_scaffold.name }" #gapfilled_sequence += last_scaffold.contigs[last_scaffold.contigs.length-1].sequence #add last contig output_fasta_file.puts gapfilled_sequence end # Lambda to add a gap the the String representing the scaffold #TODO: if the trail is not filled then the wrong sequence is currently printed. BUG??? filler = lambda do |anchored_connection, following_contig, gapfilled_sequence, gap| gapfilled = nil if anchored_connection.paths.length == 1 # If there is only 1 trail, then output scaffolding information num_singly_filled += 1 gapfilled = printer.one_connection_between_two_contigs( finishm_graph.graph, gapfilled_sequence, anchored_connection, following_contig.sequence ) else # Otherwise don't make any assumptions num_unbridgable += 1 if anchored_connection.paths.empty? # TODO: even the there is multiple trails, better info can still be output here gapfilled = gapfilled_sequence + 'N'*gap.length + following_contig.sequence end gapfilled #return this string end log.info "Searching for trails between the nodes within the assembly graph" log.info "Using contig overhang length #{options[:contig_end_length] } and leash length #{options[:graph_search_leash_length] }" gapfilled_sequence = '' last_scaffold = nil (0...(probe_sequences.length / 2)).collect{|i| i*2}.each do |start_probe_index| gap_number = start_probe_index / 2 gap = gaps[gap_number] log.info "Now working through gap number #{gap_number+1}: #{gap.coords}" probe_index1 = start_probe_index probe_index2 = start_probe_index+1 connection = gapfill(finishm_graph, probe_index1, probe_index2, ) log.info "Found #{connection.paths.length} trails for #{gap.coords}" unless output_trails_file.nil? # print the sequences of the trails if asked for: trails.each_with_index do |trail, i| #TODO: need to output this as something more sensible e.g. VCF format output_trails_file.puts ">#{gap.coords}_trail#{i+1}" output_trails_file.puts trail.sequence end end num_total_trails += connection.paths.length # Output the updated sequence. Fill in the sequence if there is only 1 trail if gap.scaffold == last_scaffold # We are still building the current scaffold #gapfilled_sequence += gap.scaffold.contigs[gap.number].sequence log.debug "Before adding next chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}" if log.debug? gapfilled_sequence = filler.call connection, gap.scaffold.contigs[gap.number+1], gapfilled_sequence, gap log.debug "After adding next chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}" if log.debug? else # We are onto a new scaffold. Print the previous one (unless this the first one) unless last_scaffold.nil? # print the gapfilled (or not) scaffold. print_scaffold.call(last_scaffold, gapfilled_sequence) end #reset last_scaffold = gap.scaffold #add the current gap (and the contig before it) log.debug "Before adding first chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}" gapfilled_sequence = gap.scaffold.contigs[gap.number].sequence log.debug "After adding first chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}" gapfilled_sequence = filler.call connection, gap.scaffold.contigs[gap.number+1], gapfilled_sequence, gap log.debug "After adding first gap sequence and next contig, gapfilled sequence length is #{gapfilled_sequence.length}" end end print_scaffold.call(last_scaffold, gapfilled_sequence) # print the last scaffold log.info "#{num_unbridgable } gaps had no suitable bridging paths in the graph within the leash, and found #{num_total_trails} trails in total." log.info "Filled #{num_singly_filled } out of #{gaps.length } gaps." output_trails_file.close unless output_trails_file.nil? output_fasta_file.close end |
#validate_options(options, argv) ⇒ Object
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/finishm/gapfiller.rb', line 56 def (, argv) #TODO: give a better description of the error that has occurred #TODO: require reads options if argv.length != 0 return "Dangling argument(s) found e.g. #{argv[0] }" else [ :contigs_file, :overall_fasta_file ].each do |sym| if [sym].nil? return "No option found to specify #{sym}" end end #if return nil from here, options all were parsed successfully return Bio::FinishM::ReadInput.new.(, []) end end |