Class: Bio::AssemblyGraphAlgorithms::SingleEndedAssembler

Inherits:
Object
  • Object
show all
Includes:
FinishM::Logging
Defined in:
lib/assembly/single_ended_assembler.rb

Direct Known Subclasses

BubblyAssembler, PairedEndAssembler

Defined Under Namespace

Classes: MaxDistancedOrientedNode

Constant Summary collapse

DEFAULT_MAX_TIP_LENGTH =
200
DEFAULT_MIN_CONTIG_SIZE =
500
DEFAULT_MIN_CONFIRMING_RECOHERENCE_READS =
2
ASSEMBLY_OPTIONS =
[
:max_tip_length,
:recoherence_kmer,
:min_confirming_recoherence_kmer_reads,
:sequences,
:leash_length,
:min_contig_size,
:max_coverage_at_fork,
]

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from FinishM::Logging

#log

Constructor Details

#initialize(graph, assembly_options = {}) ⇒ SingleEndedAssembler

Create a new assembler given a velvet graph and velvet Sequences object

Assembly options: :max_tip_length: if a path is shorter than this in bp, then it will be clipped from the path. Default 100 :recoherence_kmer: attempt to separate paths by going back to the reads with this larger kmer (requires :seqeunces) :sequences: the sequences of the actual reads, probably a Bio::Velvet::Underground::BinarySequenceStore object :leash_length: don’t continue assembly from nodes farther than this distance (in bp) away :min_coverage_of_start_nodes: only start exploring from nodes with this much coverage :min_contig_size: don’t bother returning contigs shorter than this (default 500bp) :progressbar_io: given an IO object e.g. $stdout, write progress information



35
36
37
38
39
40
41
# File 'lib/assembly/single_ended_assembler.rb', line 35

def initialize(graph, assembly_options={})
  @graph = graph
  @assembly_options = assembly_options
  @assembly_options[:max_tip_length] ||= DEFAULT_MAX_TIP_LENGTH
  @assembly_options[:min_contig_size] ||= DEFAULT_MIN_CONTIG_SIZE
  @assembly_options[:min_confirming_recoherence_kmer_reads] ||= DEFAULT_MIN_CONFIRMING_RECOHERENCE_READS
end

Instance Attribute Details

#assembly_optionsObject

Returns the value of attribute assembly_options.



23
24
25
# File 'lib/assembly/single_ended_assembler.rb', line 23

def assembly_options
  @assembly_options
end

#graphObject

Returns the value of attribute graph.



12
13
14
# File 'lib/assembly/single_ended_assembler.rb', line 12

def graph
  @graph
end

Instance Method Details

#assembleObject

Assemble everything in the graph into OrientedNodeTrail objects. Yields an OrientedNodeTrail if a block is given, otherwise returns an array of found paths. Options for assembly are specified in assembly_options



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/assembly/single_ended_assembler.rb', line 47

def assemble
  paths = []

  # Gather a list of nodes to try starting from
  starting_nodes = gather_starting_nodes
  log.info "Found #{starting_nodes.length} nodes to attempt assembly from"

  seen_nodes = Set.new
  progress = setup_progressbar starting_nodes.length

  # For each starting node, start the assembly process
  dummy_trail = Bio::Velvet::Graph::OrientedNodeTrail.new
  starting_nodes.each do |start_node|
    log.debug "Trying to assemble from #{start_node.node_id}" if log.debug?

    # If we've already covered this node, don't try it again
    if seen_nodes.include?([start_node.node_id, Bio::Velvet::Graph::OrientedNodeTrail::START_IS_FIRST]) or
      seen_nodes.include?([start_node.node_id, Bio::Velvet::Graph::OrientedNodeTrail::END_IS_FIRST])
      log.debug "Already seen this node, not inspecting further" if log.debug?
      next
    end

    # first attempt to go forward as far as possible, then reverse the path
    # and continue until cannot go farther
    reversed_path_forward = find_beginning_trail_from_node(start_node, seen_nodes)
    if reversed_path_forward.nil?
      log.debug "Could not find forward path from this node, giving up" if log.debug?
      next
    end
    # Have we already seen this path before?
    #TODO: add in recoherence logic here
    if seen_last_in_path?(reversed_path_forward, seen_nodes)
      log.debug "Already seen the last node of the reversed path forward: #{reversed_path_forward.trail[-1].to_shorthand}, giving up" if log.debug?
      next
    end
    # Assemble ahead again
    log.debug "reversed_path_forward: #{reversed_path_forward.to_shorthand}" if log.debug?
    path, just_visited_onodes = assemble_from(reversed_path_forward)

    # Remove nodes that have already been seen to prevent duplication
    log.debug "Before removing already seen nodes the second time, path was #{path.length} nodes long" if log.debug?
    remove_seen_nodes_from_end_of_path(path, seen_nodes)
    log.debug "After removing already seen nodes the second time, path was #{path.length} nodes long" if log.debug?

    # Add the now seen nodes to the list
    just_visited_onodes.each do |onode_settable|
      seen_nodes << onode_settable
    end

    # Record which nodes have already been visited, so they aren't visited again
    seen_nodes.merge just_visited_onodes
    unless progress.nil?
      if @assembly_options[:min_coverage_of_start_nodes]
        # TODO: this could be better by progress += (starting_nodes_just_visited.length)
        progress.increment
      else
        progress.progress += just_visited_onodes.length
      end
    end

    if path.length_in_bp < @assembly_options[:min_contig_size]
      log.debug "Path length (#{path.length_in_bp}) less than min_contig_size (#{@assembly_options[:min_contig_size] }), not recording it" if log.debug?
      next
    end
    log.debug "Found a seemingly legitimate path #{path.to_shorthand}" if log.debug?
    if block_given?
      yield path
    else
      paths.push path
    end
  end
  progress.finish unless progress.nil?

  return paths
end

#assemble_from(initial_path, visited_onodes = Set.new) ⇒ Object

Assemble considering reads all reads as single ended. Options: :max_tip_length: if a path is shorter than this in bp, then it will be clipped from the path. Default 100 :recoherence_kmer: attempt to separate paths by going back to the reads with this larger kmer :leash_length: don’t continue assembly from nodes farther than this distance (in bp) away



248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
# File 'lib/assembly/single_ended_assembler.rb', line 248

def assemble_from(initial_path, visited_onodes=Set.new)
  options = @assembly_options

  recoherencer = Bio::AssemblyGraphAlgorithms::SingleCoherentPathsBetweenNodesFinder.new

  path = initial_path.copy
  #visited_onodes = Set.new
  initial_path[0...-1].each do |onode| #Add all except the last node to already seen nodes list
    visited_onodes << onode.to_settable
  end

  dummy_trail = Bio::Velvet::Graph::OrientedNodeTrail.new
  oneighbours = nil
  while true
    log.debug "Now assembling from #{path[-1].to_shorthand}" if log.debug?
    if visited_onodes.include?(path[-1].to_settable)
      log.debug "Found circularisation in path, going no further" if log.debug?
      break
    else
      visited_onodes << path[-1].to_settable
    end

    if options[:leash_length] and path.length_in_bp-@graph.hash_length > options[:leash_length]
      log.debug "Beyond leash length, going to further with assembly" if log.debug?
      break
    end

    oneighbours = path.neighbours_of_last_node(@graph)
    if oneighbours.length == 0
      log.debug "Found a dead end, last node is #{path[-1].to_shorthand}" if log.debug?
      break

    elsif oneighbours.length == 1
      to_add = oneighbours[0]
      log.debug "Only one way to go, so going there, to #{to_add.to_shorthand}" if log.debug?
      path.add_oriented_node to_add

    else
      # Reached a fork (or 3 or 4-fork), which way to go?

      # Remove neighbours that are short tips
      oneighbours, visiteds = remove_tips(oneighbours, @assembly_options[:max_tip_length])
      visiteds.each do |onode_settable|
        visited_onodes << onode_settable
      end

      if oneighbours.length == 0
        log.debug "Found a dead end at a fork, last node is #{path[-1].to_shorthand}" if log.debug?
        break
      elsif oneighbours.length == 1
        log.debug "Clipped short tip(s) off, and then there was only one way to go" if log.debug?
        path.add_oriented_node oneighbours[0]
      elsif options[:recoherence_kmer].nil?
        if log.debug?
          neighbours_string = oneighbours.collect do |oneigh|
            oneigh.to_shorthand
          end.join(' or ')
          log.debug "Came across what appears to be a legitimate fork to nodes #{neighbours_string} and no recoherence kmer given, so giving up" if log.debug?
        end
        break
      else
        unless options[:recoherence_kmer].nil?
          log.debug "Attempting to resolve fork by recoherence" if log.debug?
          oneighbours.select! do |oneigh|
            dummy_trail.trail = path.trail+[oneigh]
            recoherencer.validate_last_node_of_path_by_recoherence(
              dummy_trail,
              options[:recoherence_kmer],
              options[:sequences],
              options[:min_confirming_recoherence_kmer_reads]
              )
          end
        end
        if oneighbours.length == 0
          log.debug "no neighbours passed recoherence, giving up" if log.debug?
          break
        elsif oneighbours.length == 1
          log.debug "After recoherence there's only one way to go, going there"
          path.add_oriented_node oneighbours[0]
        elsif options[:max_coverage_at_fork]
          oneighbours.select! do |oneigh|
            oneigh.node.coverage <= options[:max_coverage_at_fork]
          end
          log.debug "Found #{oneighbours.length} neighbours after removing nodes over max coverage" if log.debug?

          if oneighbours.length == 1
            log.debug "After removing too much coverage neighbours there's only one way to go, going there"
            path.add_oriented_node oneighbours[0]
          else
            log.debug "After removing max coverage nodes, #{oneighbours.length} neighbours found (#{oneighbours.collect{|o| o.to_shorthand}.join(",") }), giving up" if log.debug?
            break
          end


        else
          log.debug "Still forked after recoherence (to #{oneighbours.collect{|on| on.to_shorthand}.join(' & ') }), so seems to be a legitimate fork, giving up" if log.debug?
          break
        end
      end
    end
  end

  visited_onodes << path[-1].to_settable

  return path, visited_onodes
end

#find_beginning_trail_from_node(node, previously_seen_nodes) ⇒ Object

Given a node, return a path that does not include any short tips, or nil if none is connected to this node. With this path, you can explore forwards. This isn’t very clear commenting, but I’m just making this stuff up



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/assembly/single_ended_assembler.rb', line 164

def find_beginning_trail_from_node(node, previously_seen_nodes)
  onode = Bio::Velvet::Graph::OrientedNodeTrail::OrientedNode.new
  onode.node = node
  onode.first_side = Bio::Velvet::Graph::OrientedNodeTrail::END_IS_FIRST #go backwards first, because the path will later be reversed
  dummy_trail = Bio::Velvet::Graph::OrientedNodeTrail.new
  dummy_trail.trail = [onode]

  find_node_from_non_short_tip = lambda do |dummy_trail|
    # go all the way forwards
    path, visited_nodes = assemble_from(dummy_trail)

    # Remove already seen nodes from the end of the trail, because
    # they are already included in other paths and this shows
    # up as duplicated contig stretches and this is not correct
    log.debug "Before removing already seen nodes the first time, path was #{path.length} nodes long" if log.debug?
    remove_seen_nodes_from_end_of_path(path, previously_seen_nodes)
    log.debug "After removing already seen nodes the first time, path was #{path.length} nodes long" if log.debug?

    # reverse the path
    path.reverse!
    # peel back up we aren't in a short tip (these lost nodes might be
    # re-added later on)
    cannot_remove_any_more_nodes = false
    log.debug "Before pruning back, trail is #{path.to_shorthand}" if log.debug?
    is_tip, whatever = is_short_tip?(path[-1])
    while is_tip
      if path.length == 1
        cannot_remove_any_more_nodes = true
        break
      end
      path.delete_at(path.length-1)
      log.debug "After pruning back, trail is now #{path.to_shorthand}" if log.debug?
      is_tip, whatever = is_short_tip?(path[-1])
    end

    if cannot_remove_any_more_nodes
      nil
    else
      path
    end
  end

  log.debug "Finding nearest find_connected_node_on_a_path #{node.node_id}" if log.debug?
  if !is_short_tip?(onode)[0]
    log.debug "fwd direction not a short tip, going with that" if log.debug?
    path = find_node_from_non_short_tip.call(dummy_trail)
    if !path.nil?
      return path
    end
  end

  log.debug "rev direction is short tip, now testing reverse" if log.debug?
  onode.reverse!
  if is_short_tip?(onode)[0]
    log.debug "short tip in both directions, there is no good neighbour" if log.debug?
    #short tip in both directions, so not a real contig
    return nil
  else
    log.debug "reverse direction not a short tip, going with that" if log.debug?
    return find_node_from_non_short_tip.call(dummy_trail)
  end
end

#find_tip_distance(oriented_node, max_tip_length) ⇒ Object

The workhorse function of is_short_tip?



405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
# File 'lib/assembly/single_ended_assembler.rb', line 405

def find_tip_distance(oriented_node, max_tip_length)
  stack = DS::Stack.new
  first = MaxDistancedOrientedNode.new
  first.onode = oriented_node
  first.distance = oriented_node.node.length_alone
  stack.push first

  cache = {}
  max_dist = first.distance

  while current_max_distanced_onode = stack.pop
    if current_max_distanced_onode.distance > max_tip_length
      return false, current_max_distanced_onode.distance, []
    end

    max_dist = [max_dist, current_max_distanced_onode.distance].max

    current_max_distanced_onode.onode.next_neighbours(@graph).each do |oneigh|
      neighbour_distance = current_max_distanced_onode.distance + oneigh.node.length_alone
      next if cache[oneigh.to_settable] and cache[oneigh.to_settable] >= neighbour_distance
      distanced_node = MaxDistancedOrientedNode.new
      distanced_node.onode = oneigh
      distanced_node.distance = neighbour_distance
      log.debug "The distance of #{distanced_node.onode.node_id} is at least #{neighbour_distance}" if log.debug?
      cache[oneigh.to_settable] = neighbour_distance
      stack.push distanced_node
    end
  end

  log.debug "Found insufficient max tip length #{max_dist} for #{oriented_node}" if log.debug?
  return true, max_dist, cache.collect{|donode| donode[0]}
end

#gather_starting_nodesObject



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/assembly/single_ended_assembler.rb', line 127

def gather_starting_nodes
  if @assembly_options[:min_coverage_of_start_nodes] or @assembly_options[:min_length_of_start_nodes]
    starting_nodes = []
    graph.nodes.each do |node|
      if (@assembly_options[:min_coverage_of_start_nodes].nil? or
        node.coverage >= @assembly_options[:min_coverage_of_start_nodes]) and
        (@assembly_options[:min_length_of_start_nodes].nil? or
        node.length_alone >= @assembly_options[:min_length_of_start_nodes])

        starting_nodes.push node
      end
    end
    return starting_nodes
  else
    return graph.nodes
  end
end

#is_short_tip?(oriented_node) ⇒ Boolean

Returns false iff there is a path longer than max_tip_length starting at the given oriented_node. Currently works as a depth first search, which may or may not be optimal

Returns:

  • (Boolean)


396
397
398
399
400
# File 'lib/assembly/single_ended_assembler.rb', line 396

def is_short_tip?(oriented_node)
  max_tip_length = @assembly_options[:max_tip_length]
  is_tip, max_distance, visited_onodes = find_tip_distance(oriented_node, max_tip_length)
  return is_tip, visited_onodes
end

#remove_seen_nodes_from_end_of_path(path, seen_nodes) ⇒ Object



227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/assembly/single_ended_assembler.rb', line 227

def remove_seen_nodes_from_end_of_path(path, seen_nodes)
  log.debug "Removing from the end of the path #{path.to_shorthand} any nodes in set of size #{seen_nodes.length}" if log.debug?
  while !path.trail.empty?
    last_node_index = path.length-1
    last_node = path[last_node_index]

    if seen_nodes.include?([last_node.node_id, Bio::Velvet::Graph::OrientedNodeTrail::START_IS_FIRST]) or
      seen_nodes.include?([last_node.node_id, Bio::Velvet::Graph::OrientedNodeTrail::END_IS_FIRST])
      path.trail.delete_at(last_node_index)
    else
      # Last node is not previously seen, chop no further.
      break
    end
  end
  return path
end

#remove_tips(oriented_neighbours, tip_distance) ⇒ Object

Given a list of possibilities for neighbours of a node, return the neighbour(s) that are not short tips, or the longest of the short tips if all are tips. Also return an enumerable of nodes visited from the cut off short tips



359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
# File 'lib/assembly/single_ended_assembler.rb', line 359

def remove_tips(oriented_neighbours, tip_distance)
  return [], [] if oriented_neighbours.empty?

  neighbours_and_triples = oriented_neighbours.collect do |oneigh|
    [
      oneigh,
      find_tip_distance(oneigh, tip_distance)
      ]
  end
  non_tips, tips = neighbours_and_triples.partition{|nt| nt[1][0] == false}

  visiteds = Set.new
  process_tip = lambda do |tip|
    visiteds << tip[0].to_settable
    tip[1][2].each {|v| visiteds << v}
  end

  if non_tips.length > 0
    tips.each do |tip|
      process_tip.call tip
    end
    return non_tips.collect{|t| t[0]}, visiteds
  else
    # no long distances here. Just go with the longest path
    best_tip = tips.max{|nt| nt[1][1]}
    tips.each do |tip|
      unless tip == best_tip
        process_tip.call tip
      end
    end
    return [best_tip[0]], visiteds
  end
end

#seen_last_in_path?(path, seen_nodes) ⇒ Boolean

Returns:

  • (Boolean)


123
124
125
# File 'lib/assembly/single_ended_assembler.rb', line 123

def seen_last_in_path?(path, seen_nodes)
  seen_nodes.include?(path[-1].to_settable)
end

#setup_progressbar(num_nodes) ⇒ Object



145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/assembly/single_ended_assembler.rb', line 145

def setup_progressbar(num_nodes)
  progress = nil
  if @assembly_options[:progressbar_io]
    progress = ProgressBar.create(
      :title => "Assembly",
      :format => '%a %bᗧ%i %p%% %E %t',
      :progress_mark  => ' ',
      :remainder_mark => '',
      :total => num_nodes,
      :output => @assembly_options[:progressbar_io]
    )
  end
  return progress
end