Class: Bio::AssemblyGraphAlgorithms::SingleCoherentPathsBetweenNodesFinder

Inherits:
Object
  • Object
show all
Includes:
FinishM::Logging
Defined in:
lib/assembly/single_coherent_paths_between_nodes.rb

Defined Under Namespace

Classes: CycleCounter, DualStack, DynamicProgrammingProblem, ProblemSet, ProblemTrailFinder

Constant Summary collapse

SINGLE_BASE_REVCOM =
{
'A'=>'T',
'T'=>'A',
'G'=>'C',
'C'=>'G',
}

Instance Method Summary collapse

Methods included from FinishM::Logging

#log

Instance Method Details

#array_trail_to_settable(trail, recoherence_kmer) ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/assembly/single_coherent_paths_between_nodes.rb', line 116

def array_trail_to_settable(trail, recoherence_kmer)
  return trail.last.to_settable if recoherence_kmer.nil?

  cumulative_length = 0
  i = trail.length - 1
  while i >= 0 and cumulative_length < recoherence_kmer
    cumulative_length += trail[i].node.length_alone
    i -= 1
  end
  i += 1
  # 'Return' an array made up of the settables
  to_return = trail[i..-1].collect{|t| t.to_settable}.flatten
  log.debug "'Returning' settable version of path: #{to_return}" if log.debug?
  to_return
end

#find_all_connections_between_two_nodes(graph, initial_path, terminal_oriented_node, leash_length, recoherence_kmer, sequence_hash, options = {}) ⇒ Object

Find all paths between the initial and terminal node in the graph. Don’t search in the graph when the distance in base pairs exceeds the leash length. Recohere reads (singled ended only) in an attempt to remove bubbles.

Options:

  • max_gapfill_paths: the maxmimum number of paths to return. If this maximum is exceeded, an empty solution set is returned



20
21
22
23
24
25
26
27
# File 'lib/assembly/single_coherent_paths_between_nodes.rb', line 20

def find_all_connections_between_two_nodes(graph, initial_path, terminal_oriented_node,
  leash_length, recoherence_kmer, sequence_hash, options={})

  problems = find_all_problems(graph, initial_path, terminal_oriented_node, leash_length, recoherence_kmer, sequence_hash, options)

  paths = find_paths_from_problems(problems, recoherence_kmer, options)
  return paths
end

#find_all_problems(graph, initial_path, terminal_node, leash_length, recoherence_kmer, sequence_hash, options = {}) ⇒ Object

Options:

:max_explore_nodes: only explore this many nodes, not further.



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/assembly/single_coherent_paths_between_nodes.rb', line 32

def find_all_problems(graph, initial_path, terminal_node, leash_length, recoherence_kmer, sequence_hash, options={})
  # setup dynamic programming cache
  problems = ProblemSet.new

  # setup stack to keep track of initial nodes
  finder = ProblemTrailFinder.new(graph, initial_path)

  #current_oriented_node_trail = Bio::Velvet::Graph::OrientedNodeTrail.new
  #last_number_of_problems_observed_checkpoint = 0

  while current_path = finder.dequeue
    path_length = current_path.length_in_bp
    log.debug "considering #{current_path}, path length: #{path_length}" if log.debug?

    # Have we solved this before? If so, add this path to that solved problem.
    set_key = path_to_settable current_path, recoherence_kmer
    log.debug "Set key is #{set_key}" if log.debug?

    # Unless the path validates, forget it.
    if recoherence_kmer.nil?
      # Continue, assume that it validates if there is no recoherence_kmer
    elsif !validate_last_node_of_path_by_recoherence(current_path, recoherence_kmer, sequence_hash)
      log.debug "Path did not validate, skipping" if log.debug?
      next
    elsif log.debug?
      log.debug "Path validates"
    end

    if current_path.last == terminal_node
      log.debug "last is terminal" if log.debug?
      problems[set_key] ||= DynamicProgrammingProblem.new
      problems[set_key].known_paths ||= []
      problems[set_key].known_paths.push current_path

      problems.terminal_node_keys ||= Set.new
      problems.terminal_node_keys << set_key

    elsif problems[set_key]
      log.debug "Already seen this problem" if log.debug?
      prob = problems[set_key]
      prob.known_paths.push current_path

      # If a lesser min distance is found, then we need to start exploring from the
      # current place again
      if path_length < prob.min_distance
        log.debug "Found a node with min_distance greater than path length.." if log.debug?
        prob.min_distance = path_length
        finder.push_next_neighbours current_path
      end
    elsif !leash_length.nil? and path_length > leash_length
      # we are past the leash length, give up
      log.debug "Past leash length, giving up" if log.debug?
    else
      log.debug "New dynamic problem being solved" if log.debug?
      # new problem being solved here
      problem = DynamicProgrammingProblem.new
      problem.min_distance = path_length
      problem.known_paths.push current_path.copy
      problems[set_key] = problem

      num_done = problems.length
      if num_done > 0 and num_done % 512 == 0
        log.info "So far worked with #{num_done} head node sets, up to distance #{path_length}" if log.info?
      end
      if options[:max_explore_nodes] and num_done > options[:max_explore_nodes]
        log.warn "Explored too many nodes (#{num_done}), giving up.."
        problems = ProblemSet.new
        break
      end

      # explore the forward neighbours
      finder.push_next_neighbours current_path
    end
    log.debug "Priority queue size: #{finder.length}" if log.debug?
  end

  return problems
end

#find_paths_from_problems(problems, recoherence_kmer, options = {}) ⇒ Object

Separate stacks for valid paths and paths which exceed the maximum allowed cycle count. Each backtrack spawns a set of new paths, which are cycle counted. If any cycle is repeated more than max_cycles, the new path is pushed to the max_cycle_stack, otherwise the path is pushed to the main stack. Main stack paths are prioritised. The max_cycle_stack paths must be tracked until cycle repeats in second_part exceed max_cycles, as they can spawn valid paths with backtracking.



266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# File 'lib/assembly/single_coherent_paths_between_nodes.rb', line 266

def find_paths_from_problems(problems, recoherence_kmer, options={})
  max_num_paths = options[:max_gapfill_paths]
  max_num_paths ||= 2196
  max_cycles = options[:max_cycles] || 1

  counter = CycleCounter.new(max_cycles)
  decide_stack = lambda do |to_push|
    if max_cycles < counter.path_cycle_count(to_push.flatten)
      log.debug "Pushing #{to_push.collect{|part| part.collect{|onode| onode.node.node_id}.join(',')}.join(' and ') } to secondary stack" if log.debug?
      return true
    else
      log.debug "Pushing #{to_push.collect{|part| part.collect{|onode| onode.node.node_id}.join(',')}.join(' and ') } to main stack" if log.debug?
      return false
    end
  end

  stack = DualStack.new &decide_stack
  to_return = Bio::AssemblyGraphAlgorithms::TrailSet.new

  # if there is no solutions to the overall problem then there is no solution at all
  if problems.terminal_node_keys.nil? or problems.terminal_node_keys.empty?
    to_return.trails = []
    return to_return
  end

  # push all solutions to the "ending in the final node" solutions to the stack
  problems.terminal_node_keys.each do |key|
    overall_solution = problems[key]
    first_part = overall_solution.known_paths[0].to_a
    stack.push [first_part, []]
  end

  all_paths_hash = {}
  while path_parts = stack.pop
    log.debug path_parts.collect{|half| half.collect{|onode| onode.node.node_id}.join(',')}.join(' and ') if log.debug?
    first_part = path_parts[0]
    second_part = path_parts[1]

    if first_part.length == 0
      # If we've tracked all the way to the beginning,
      # then there's no need to track further

      # add this solution if required
      # I've had some trouble getting the Ruby Set to work here, but this is effectively the same thing.
      log.debug "Found solution: #{second_part.collect{|onode| onode.node.node_id}.join(',')}." if log.debug?
      key = second_part.hash
      all_paths_hash[key] ||= second_part
    else
      last = first_part.last

      if second_part.include? last
        log.debug "Cycle at node #{last.node_id} detected in previous path #{second_part.collect{|onode| onode.node.node_id}.join(',')}." if log.debug?
        to_return.circular_paths_detected = true
        if max_cycles == 0 or max_cycles < counter.path_cycle_count([last, second_part].flatten)
          log.debug "Not finishing cyclic path with too many repeated cycles." if log.debug?
          next
        end
      end

      paths_to_last = problems[array_trail_to_settable(first_part, recoherence_kmer)].known_paths
      paths_to_last.each do |path|
        stack.push [path[0...(path.length-1)], [last,second_part].flatten]
      end
    end

    # max_num_paths parachute
    # The parachute can kill the search once the main stack exceeds max_gapfill_paths,
    # since all paths on it are valid.
    if !max_num_paths.nil? and (stack.sizes[0] + all_paths_hash.length) > max_num_paths
      log.info "Exceeded the maximum number of allowable paths in this gapfill" if log.info?
      to_return.max_path_limit_exceeded = true
      all_paths_hash = {}
      break
    end
  end

  to_return.trails = all_paths_hash.values
  return to_return
end

#path_to_settable(path, recoherence_kmer) ⇒ Object



111
112
113
114
# File 'lib/assembly/single_coherent_paths_between_nodes.rb', line 111

def path_to_settable(path, recoherence_kmer)
  log.debug "Making settable a path: #{path}" if log.debug?
  return array_trail_to_settable(path.trail, recoherence_kmer)
end

#sub_kmer_sequence_overlap?(nodes, sequence_hash, min_concurring_reads = 1) ⇒ Boolean

Is there overlap across the given nodes, even if the overlap does not include an entire kmer? nodes: an OrientedNodeTrail. To validate, there must be at least 1 read that spans all of these nodes sequence_hash: Bio::Velvet::Sequence object with the sequences from the reads in the nodes

Returns:

  • (Boolean)


191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# File 'lib/assembly/single_coherent_paths_between_nodes.rb', line 191

def sub_kmer_sequence_overlap?(nodes, sequence_hash, min_concurring_reads=1)
  raise if nodes.length < 3 #should not get here - this is taken care of above
  log.debug "validating by sub-kmer sequence overlap with min #{min_concurring_reads}: #{nodes}" if log.debug?

  # Only reads that are in the second last node are possible, by de-bruijn graph definition.
  candidate_noded_reads = nodes[-2].node.short_reads
  middle_nodes_length = nodes[1..-2].reduce(0){|sum, n| sum += n.node.length}+
    +nodes[0].node.parent_graph.hash_length-1
  log.debug "Found middle nodes length #{middle_nodes_length}" if log.debug?

  num_confirming_reads = 0

  candidate_noded_reads.each do |read|
    # Ignore reads that don't come in at the start of the node
    log.debug "Considering read #{read.inspect}" if log.debug?
    if read.offset_from_start_of_node != 0
      log.debug "Read doesn't start at beginning of node, skipping" if log.debug?
      next
    else
      seq = sequence_hash[read.read_id]
      raise "No sequence stored for #{read.read_id}, programming fail." if seq.nil?

      if read.start_coord == 0
        log.debug "start_coord Insufficient length of read" if log.debug?
        next
      elsif seq.length-read.start_coord-middle_nodes_length < 1
        log.debug "other_side Insufficient length of read" if log.debug?
        next
      end

      # Now ensure that the sequence matches correctly
      # left base, the base from the first node
      first_node = nodes[0].node
      left_base = !(read.direction ^ nodes[-2].starts_at_start?) ?
      SINGLE_BASE_REVCOM[seq[read.start_coord-1]] :
        seq[read.start_coord+middle_nodes_length]
      left_comparison_base = nodes[0].starts_at_start? ?
      first_node.ends_of_kmers_of_twin_node[0] :
        first_node.ends_of_kmers_of_node[0]
      if left_base != left_comparison_base
        log.debug "left comparison base mismatch, this is not a validating read" if log.debug?
        next
      end

      # right base, overlapping the last node
      last_node = nodes[-1].node
      right_base = !(read.direction ^ nodes[-2].starts_at_start?) ?
      seq[read.start_coord+middle_nodes_length] :
        SINGLE_BASE_REVCOM[seq[read.start_coord-1]]
      right_comparison_base = nodes[-1].starts_at_start? ?
      last_node.ends_of_kmers_of_node[0] :
        last_node.ends_of_kmers_of_twin_node[0]
      if right_base != right_comparison_base
        log.debug "right comparison base mismatch, this is not a validating read" if log.debug?
        next
      end

      log.debug "Read validates path"
      num_confirming_reads += 1
      if num_confirming_reads >= min_concurring_reads
        return true #gauntlet passed, this is enough confirmatory reads, and so the path is validated.
      end
    end
  end
  return false #no candidate reads pass
end

#validate_last_node_of_path_by_recoherence(path, recoherence_kmer, sequence_hash, min_concurring_reads = 1) ⇒ Object

Given an OrientedNodeTrail, and an expected number of



133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/assembly/single_coherent_paths_between_nodes.rb', line 133

def validate_last_node_of_path_by_recoherence(path, recoherence_kmer, sequence_hash, min_concurring_reads=1)
  #not possible to fail on a 1 or 2 node path, by debruijn graph definition.
  #TODO: that ain't true! If one of the two nodes is sufficiently long, reads may not agree.
  return true if path.length < 3

  # Walk backwards along the path from the 2nd last node,
  # collecting nodes until the length in bp of the nodes is > recoherence_kmer
  collected_nodes = []
  length_of_nodes = lambda do |nodes|
    if nodes.empty?
      0
    else
      hash_offset = nodes[0].node.parent_graph.hash_length-1
      nodes.reduce(hash_offset) do |sum, node|
        sum += node.node.length_alone
      end
    end
  end
  i = path.length-2
  while i >= 0
    collected_nodes.push path.trail[i]
    i -= 1
    # break if the recoherence_kmer doesn't cover
    break if length_of_nodes.call(collected_nodes) + 1 >= recoherence_kmer
  end
  log.debug "validate: Collected nodes: #{collected_nodes}" if log.debug?
  if collected_nodes.length < 2
    log.debug "Only #{collected_nodes.length+1} nodes being tested for validation, so returning validated" if log.debug?
    return true
  end

  # There should be at least 1 read that spans the collected nodes and the last node
  # The trail validates if the above statement is true.
  #TODO: there's a possible 'bug' here in that there's guarantee that the read is overlays the
  # nodes in a consecutive and gapless manner. But I suspect that is unlikely to be a problem in practice.
  final_node = path.trail[-1].node
  possible_reads = final_node.short_reads.collect{|nr| nr.read_id}
  log.debug "validate starting from #{final_node.node_id}: Initial short reads: #{possible_reads.join(',') }" if log.debug?
  collected_nodes.each do |node|
    log.debug "Validating node #{node}" if log.debug?
    current_set = Set.new node.node.short_reads.collect{|nr| nr.read_id}
    possible_reads.select! do |r|
      current_set.include? r
    end
    if possible_reads.length < min_concurring_reads
      log.debug "First line validation failed, now detecting sub-kmer sequence overlap" if log.debug?
      trail_to_validate = path.trail[i+1..-1]
      return sub_kmer_sequence_overlap?(trail_to_validate, sequence_hash, min_concurring_reads)
    end
  end
  log.debug "Found #{possible_reads.length} reads that concurred with validation e.g. #{possible_reads[0]}" if log.debug?
  return true
end