Class: Bio::FinishM::ConnectionInterpreter

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/assembly/connection_interpreter.rb

Defined Under Namespace

Classes: Connection, PossiblyCircularArray, Probe, Scaffold, UnscaffoldedContig

Instance Method Summary collapse

Methods included from Logging

#log

Constructor Details

#initialize(connections, sequence_ids) ⇒ ConnectionInterpreter

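connections is an Enumerable of Connection objects (each joining two Probe objects); sequence_ids is the collection of sequence identifiers (assumed elsewhere in this class to be the same values as each Probe's sequence_index)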



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/assembly/connection_interpreter.rb', line 8

# Build the interpreter from a set of connections between contig ends.
#
# connections::  Enumerable of connection objects responding to +probe1+,
#                +probe2+ and +to_settable+
# sequence_ids:: collection of sequence identifiers, stored as given
#
# Raises RuntimeError if two connections share the same +to_settable+ key.
def initialize(connections, sequence_ids)
  @graph = Yargraph::UndirectedGraph.new
  @circular_probes = []
  @sequence_ids = sequence_ids

  # Map each connection's settable form back to the original object.
  # Assume there is only 1 connection between two contig ends.
  @connection_hash = {}
  connections.each do |conn|
    key = conn.to_settable
    raise "Duplicate connections not handled (yet?), found #{conn} => #{key}" if @connection_hash.key?(key)
    @connection_hash[key] = conn
  end

  # Add connections to the graph. A connection whose two probes are
  # identical is kept out of the graph; its probe is remembered in
  # @circular_probes instead.
  connections.each do |conn|
    first_end = conn.probe1.to_settable
    second_end = conn.probe2.to_settable
    if first_end == second_end
      @circular_probes.push conn.probe1
    else
      @graph.add_edge first_end, second_end
    end
  end

  log.debug "Created a graph with #{@graph.vertices.to_a.length} vertices and #{@graph.edges.length} edges" if log.debug?
end

Instance Method Details

#circular_sequencesObject

Return sequences that exclusively connect the start to the end. In particular, return an Array of sequence names



40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/assembly/connection_interpreter.rb', line 40

def circular_sequences
  to_return = []
  connections.each do |conn|
    if conn.probe1.sequence_index == conn.probe2.sequence_index and
      conn.probe1.side != conn.probe2.side and
      @graph.edges[conn.probe1.to_settable].length == 1 and
      @graph.edges[conn.probe2.to_settable].length == 1

      to_return.push conn.probe1.sequence_index
    end
  end
  return to_return
end

#connectionsObject



34
35
36
# File 'lib/assembly/connection_interpreter.rb', line 34

# All connection objects registered in the constructor, as a new Array in
# insertion order.
def connections
  @connection_hash.each_value.to_a
end

#doubly_single_contig_connectionsObject

Return an Array of Connection objects that represent edges where there is only a single connection from both sides



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/assembly/connection_interpreter.rb', line 57

# Find the connections that are unambiguous from both ends: a vertex with
# exactly one neighbour whose neighbour in turn has exactly one neighbour.
#
# Each such pair is looked up in @connection_hash by its settable form and
# reported once, even though it is encountered from both of its vertices.
#
# Returns an Array of the values stored in @connection_hash for those pairs.
def doubly_single_contig_connections
  likelies = []
  already_seen_connections = Set.new

  @graph.vertices.each do |v|
    neighbours = @graph.neighbours(v)
    # Guarded like every other debug call so the message is not built when
    # debug logging is off.
    log.debug "Testing connection between #{v} and #{neighbours}" if log.debug?

    # If there is only 1 connection on both sides, then go with that
    next unless neighbours.length == 1

    neighbour = neighbours[0]
    next unless @graph.neighbours(neighbour).length == 1

    log.debug "Connection passed the doubly-test" if log.debug?

    conn = Connection.new
    conn.probe1 = Probe.new(v)
    conn.probe2 = Probe.new(neighbour)
    settable = conn.to_settable

    # Set#add? returns nil when the element was already present, so each
    # connection is recorded only the first time it is seen.
    likelies.push @connection_hash[settable] if already_seen_connections.add?(settable)
  end

  likelies
end

#scaffolds(contig_connections) ⇒ Object

Single linkage cluster the likely_inter_contig_connections and the start to ends for each of the contigs. Assumes



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/assembly/connection_interpreter.rb', line 87

# Build Scaffold objects from the given contig connections.
#
# The connections are loaded into an edge set and edges are popped from it
# one at a time. From each starting edge the path is extended to the "left"
# and then to the "right" by hopping to a probe's companion and following
# whatever edge is still registered there, deleting edges as they are used.
# Each resulting path becomes one Scaffold; every id in @sequence_ids that
# did not end up in a path is returned as a single-contig Scaffold.
#
# contig_connections:: Enumerable of objects responding to +probe1+ and
#                      +probe2+ (e.g. the result of
#                      #doubly_single_contig_connections)
#
# Returns an Array of Scaffold objects: multi-contig scaffolds first, then
# the singletons in @sequence_ids order.
def scaffolds(contig_connections)
  # It is like an (easy)
  # assembly problem because each vertex can only be connected to
  # two others - 1 intra-contig and 1 inter-contig (unless it is circular)
  likelies_edge_set = Yargraph::UndirectedGraph::EdgeSet.new
  contig_connections.each do |conn|
    likelies_edge_set.add_edge conn.probe1.to_settable, conn.probe2.to_settable
  end

  scaffolded_paths = []
  circular_single_contigs = Set.new

  # while there are more elements in the likelies set,
  # 'pop' an arbitrary edge out of the graph
  while starting_edge = likelies_edge_set.pop
    log.debug "starting to scaffold from #{starting_edge}" if log.debug?

    # Ignore likelies that are circular, i.e. both ends of the edge share
    # the same first element. That element is later compared against the
    # entries of @sequence_ids, so it is presumably the sequence index.
    if starting_edge[0][0] == starting_edge[1][0]
      log.debug "Not scaffolding contig #{starting_edge[0][0] } since it appears to be circular" if log.debug?
      circular_single_contigs << starting_edge[0][0]
      next
    end

    circular = false

    # go 'left'. Connect the other side of the left.
    # lefts and rights each hold the probes collected outward from the
    # starting edge, so lefts is in reverse path order until it is
    # reversed below.
    lefts = [Probe.new(starting_edge[0])]
    rights = [Probe.new(starting_edge[1])]
    log.debug "rights was #{rights[0].to_s}" if log.debug?
    # while there is another node to the left
    # (companion is presumably the probe at the other end of the same
    # contig -- confirm against Probe#companion)
    while next_probe = likelies_edge_set[lefts[-1].companion.to_settable].to_a[0]
      next_probe_probe = Probe.new(next_probe)
      companion = lefts[-1].companion

      # Consume the edge so it is neither followed again nor popped later
      # as a starting edge.
      likelies_edge_set.delete next_probe, companion.to_settable
      # If the far end of the newly reached contig is the first right-hand
      # probe, the walk has come back around to where it started.
      # NOTE(review): the closing edge is deleted above and never appears
      # as an adjacent pair in the path, so its distance is not added to
      # gap_lengths below -- confirm that this is intended.
      if next_probe_probe.companion.to_settable == rights[0].to_settable
        log.debug "Found multi-contig circularity between #{next_probe_probe.companion} and #{rights[0] }" if log.debug?
        circular = true
        break
      end

      lefts.push companion
      lefts.push next_probe_probe
      log.debug "Adding node to the left: #{next_probe} and companion #{companion}" if log.debug?
    end
    # and go right
    while next_probe = likelies_edge_set[rights[-1].companion.to_settable].to_a[0]
      companion = rights[-1].companion
      rights.push companion
      rights.push Probe.new(next_probe)
      log.debug "Adding node to the right: #{next_probe} and companion #{companion}" if log.debug?
      likelies_edge_set.delete next_probe, companion.to_settable
    end

    # Add the left and the right together into one path. The outermost
    # companions are added at both ends so that every contig in the path
    # is represented by a pair of consecutive probes.
    scaffolded_paths.push(
      PossiblyCircularArray.new(
        [lefts[-1].companion]+
          lefts.reverse+
          rights+
          [rights[-1].companion],
        circular)
      )
  end
  if log.debug?
    log.debug "Found #{scaffolded_paths.length} multi-contig scaffold(s):"
    scaffolded_paths.each do |path|
      log.debug "Scaffold: #{path.collect{|e| e.to_s}.join(', ') }"
    end
  end

  # for each scaffolded set, create new scaffold object
  scaffolds = []
  scaffolded_contigs = Set.new
  scaffolded_paths.each do |path|
    # Paths are assembled from probe pairs (see above), so an odd length
    # means the path is malformed.
    raise if path.length % 2 != 0
    scaffold = Scaffold.new
    scaffold.circular = path.circular
    previous_probe = nil
    path.each_with_index do |probe, i|
      # Odd positions hold the probe through which the path leaves a
      # contig; remember it so the gap to the next contig can be looked up.
      if i % 2 == 1
        previous_probe = probe
        next
      end
      # Even positions hold the probe through which the path enters a
      # contig; direction is true when that probe is the :start side.
      contig = UnscaffoldedContig.new
      contig.sequence_index = probe.sequence_index
      if probe.side == :start
        contig.direction = true
      else
        contig.direction = false
      end
      scaffold.contigs ||= []
      unless scaffold.contigs.empty?
        # Rebuild the key of the connection joining the previous contig to
        # this one, to recover the original object and record its distance
        # as the gap length.
        dummy_conn = Connection.new
        dummy_conn.probe1 = previous_probe
        dummy_conn.probe2 = probe
        original_connection = @connection_hash[dummy_conn.to_settable]
        scaffold.gap_lengths.push original_connection.distance
      end
      scaffold.contigs.push contig
      scaffolded_contigs << probe.sequence_index
    end
    scaffolds.push scaffold
  end

  # for each contig that is not in a scaffold, add as singleton
  @sequence_ids.each do |i|
    unless scaffolded_contigs.include?(i)
      scaff = Scaffold.new
      contig = UnscaffoldedContig.new
      contig.sequence_index = i
      contig.direction = true
      scaff.contigs = [contig]
      # Singletons are circular only if a self-joining edge was seen for
      # them while popping edges above.
      if circular_single_contigs.include?(i)
        scaff.circular = true
      else
        scaff.circular = false
      end
      scaffolds.push scaff
    end
  end

  return scaffolds
end

#unconnected_probesObject

Assuming the sequence_ids given in the initialize are the same as the sequence_index



215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# File 'lib/assembly/connection_interpreter.rb', line 215

# List every contig end that takes part in no connection.
#
# For each id in @sequence_ids a :start and an :end Probe are built (in that
# order); those whose settable form appears in no connection are kept.
#
# Returns an Array of Probe objects.
def unconnected_probes
  linked_ends = connections.each_with_object(Set.new) do |link, seen|
    seen.add(link.probe1.to_settable)
    seen.add(link.probe2.to_settable)
  end

  orphans = []
  @sequence_ids.each do |seq_id|
    [:start, :end].each do |which_end|
      candidate = Probe.new
      candidate.sequence_index = seq_id
      candidate.side = which_end
      next if linked_ends.include?(candidate.to_settable)

      orphans << candidate
    end
  end
  orphans
end

#unconnected_sequencesObject

Return an Array of sequence indices that did not have any connections to any others.



237
238
239
240
241
242
243
244
# File 'lib/assembly/connection_interpreter.rb', line 237

# Sequence ids from @sequence_ids that appear on neither probe of any
# connection.
#
# Returns an Array, in @sequence_ids order.
def unconnected_sequences
  touched = connections.flat_map do |link|
    [link.probe1.sequence_index, link.probe2.sequence_index]
  end
  @sequence_ids.to_a - touched
end