101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
|
# File 'lib/finishm/finisher.rb', line 101
def run(options, argv)
pooled_reads_filename = 'pooled_sampled_reads.fasta' if options[:already_patterned_reads] pooled_reads_filename = options[:already_patterned_reads]
else
desired_pattern = KmerAbundancePattern.new
desired_pattern.parse_from_human(options[:pattern])
if options[:reads_files].length != desired_pattern.length
raise "Number of entries in the pattern #{desired_pattern.length} and number of reads files #{options[:reads].length} not equivalent!"
end
input_file = File.open options[:kmer_multiple_abundance_file]
csv = CSV.new(input_file, :col_sep => ' ')
whitelist_kmers = []
blacklist_kmers = []
csv.each do |row|
max_i = row.length - 2 if max_i.nil?
kmer = row[0]
counts = row[1...row.length].collect{|s| s.to_i}
this_pattern = []
counts.each_with_index do |count, i|
if count > options[:upper_threshold]
this_pattern[i] = true
elsif count < options[:lower_threshold]
this_pattern[i] = false
else
this_pattern[i] = '-'
end
end
if desired_pattern.consistent_with? this_pattern
whitelist_kmers.push row[0]
else
blacklist_kmers.push row[0]
end
end
log.info "After parsing the kmer multiple abundance file, found #{whitelist_kmers.length} kmers that matched the pattern, and #{blacklist_kmers.length} that didn't"
unless whitelist_kmers.length > 0
log.error "No kmers found that satisfy the given pattern, exiting.."
exit 1
end
File.open 'whitelist', 'w' do |white| white.puts whitelist_kmers.join("\n")
white.close
File.open('black','w') do |black|
black.puts blacklist_kmers.join("\n")
black.close
threadpool = []
sampled_read_files = []
log.info "Extracting reads that contain suitable kmers"
options[:reads_files].each_with_index do |file, i|
next unless desired_pattern[i]
sampled = File.basename(file)+'.sampled_reads.fasta'
sampled_read_files.push sampled
grep_path = "#{ ENV['HOME'] }/git/priner/bin/read_selection_by_kmer " if options[:min_leftover_length]
grep_path += "--min-leftover-length #{options[:min_leftover_length]} "
end
thr = Thread.new do
grep_cmd = "#{grep_path} --whitelist #{white.path} --blacklist #{black.path} --reads #{file} --kmer-coverage-target #{options[:kmer_coverage_target]} > #{sampled}"
log.debug "Running cmd: #{grep_cmd}"
status, stdout, stderr = systemu grep_cmd
log.debug stderr
raise unless status.exitstatus == 0
log.debug "Finished extracting reads from #{file}"
end
threadpool.push thr
end
threadpool.each do |thread| thread.join; end
log.info "Finished extracting reads for sampling. Now pooling sampled reads"
pool_cmd = "cat #{sampled_read_files.join ' '} >#{pooled_reads_filename}"
log.debug "Running cmd: #{pool_cmd}"
status, stdout, stderr = systemu pool_cmd
raise stderr if stderr != ''
raise unless status.exitstatus == 0
end
end
end
log.info "Extracting dummy reads from the ends of contigs to use as anchors"
start_contig = options[:start_contig]
end_contig = options[:end_contig]
if [start_contig.length, end_contig.length].min < 2*options[:contig_end_length]
log.warn "Choice of initial/terminal nodes to perform graph search with may not be optimal due to the small contig size"
end
if [start_contig.length, end_contig.length].min < options[:contig_end_length]
log.error "At least one contig too small to proceed with current code base, need to fix the code to allow such a small contig"
exit 1
end
probe_sequences = [
start_contig[start_contig.length-options[:contig_end_length]...start_contig.length],
Bio::Sequence::NA.new(end_contig[0...options[:contig_end_length]]).reverse_complement.to_s
]
read_input = Bio::FinishM::ReadInput.new
read_input.fasta_singles = [pooled_reads_filename]
finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph(probe_sequences, read_input, options)
graph = finishm_graph.graph
start_node = finishm_graph.probe_nodes[0]
start_node_forward = finishm_graph.probe_node_directions[0]
end_node = finishm_graph.probe_nodes[1]
end_node_forward = finishm_graph.probe_node_directions[1]
log.info "Node(s) found that are suitable as initial and terminal nodes in the graph search, respectively: #{start_node.node_id} and #{end_node.node_id}"
log.info "Removing nodes unconnected to either the start or the end from the graph.."
original_num_nodes = graph.nodes.length
original_num_arcs = graph.arcs.length
filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
filter.remove_unconnected_nodes(graph, [start_node, end_node])
log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs"
if options[:output_graph_png] or options[:output_graph_svg] or options[:output_graph_dot]
viser = Bio::Assembly::ABVisualiser.new
log.info "Preparing GraphViz object for output"
gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
if options[:output_graph_png]
log.info "Converting assembly to a graphviz PNG #{options[:output_graph_png] }"
gv.output :png => options[:output_graph_png], :use => :neato
end
if options[:output_graph_svg]
log.info "Converting assembly to a graphviz SVG #{options[:output_graph_svg] }"
gv.output :svg => options[:output_graph_svg], :use => :neato
end
if options[:output_graph_dot]
log.info "Converting assembly to a graphviz DOT #{options[:output_graph_dot] }"
gv.output :dot => options[:output_graph_dot]
end
end
log.info "Searching for trails between the initial and terminal nodes, within the assembly graph"
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
trails = cartographer.find_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
log.info "Found #{trails.length} trail(s) between the initial and terminal nodes"
printer = Bio::AssemblyGraphAlgorithms::ContigPrinter.new
trails.each_with_index do |trail, i|
log.debug "Before attachment to the contig, sequence of the trail was #{trail.sequence}" if log.debug?
acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new
acon.start_probe_read_id = 1
acon.end_probe_read_id = 2
acon.start_probe_node = start_node
acon.end_probe_node = end_node
acon.start_probe_contig_offset = options[:contig_end_length]
acon.end_probe_contig_offset = options[:contig_end_length]
acon.paths = [trail]
log.debug "AnchoredConnection object to print for this trail: #{acon.inspect}" if log.debug?
puts ">trail#{i+1}"
puts printer.one_connection_between_two_contigs(
finishm_graph.graph,
probe_sequences[0],
acon,
probe_sequences[1])
end
end
|