Class: Bio::FinishM::Visualise

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/finishm/visualise.rb

Constant Summary collapse

DEFAULT_OPTIONS =
{
:min_adjoining_reads => 2,
:max_adjoining_node_coverage => 300,
:graph_search_leash_length => 20000,
:interesting_probes => nil,
:max_nodes => 50,
:contig_end_length => 200
}

Instance Method Summary collapse

Methods included from Logging

#log

Instance Method Details

#add_options(optparse_object, options) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/finishm/visualise.rb', line 13

def add_options(optparse_object, options)
  options.merge! DEFAULT_OPTIONS
  optparse_object.banner = "\nUsage: finishm visualise --assembly-??? <output_visualisation_file>

  Visualise an assembly graph
  \n\n"

  optparse_object.separator "Output visualisation formats (one or more of these must be used)"
  add_visualisation_options(optparse_object, options)

  optparse_object.separator "Input genome information"
  optparse_object.separator "\nIf an assembly is to be done, there must be some definition of reads:\n\n" #TODO improve this help
  Bio::FinishM::ReadInput.new.add_options(optparse_object, options)

  optparse_object.separator "\nOptional graph-exploration arguments:\n\n"
  add_scaffold_options(optparse_object, options)
  add_probe_options(optparse_object, options)

  optparse_object.separator "\nOptional graph-related arguments:\n\n"
  Bio::FinishM::GraphGenerator.new.add_options(optparse_object, options)
end

#add_probe_options(optparse_object, options) ⇒ Object



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# File 'lib/finishm/visualise.rb', line 88

def add_probe_options(optparse_object, options)
  optparse_object.on("--probe-ids PROBE_IDS", Array, "explore from these probe IDs in the graph (comma separated). probe ID is the ID in the velvet Sequence file. See also --leash-length [default: don't start from a node, explore the entire graph]") do |arg|
    options[:interesting_probes] = arg.collect do |read|
      read_id = read.to_i
      if read_id.to_s != read or read_id.nil? or read_id < 1
        raise "Unable to parse probe ID #{read}, from #{arg}, cannot continue"
      end
      read_id
    end
  end
  optparse_object.on("--probe-ids-file PROBE_IDS_FILE", String, "explore from the probe IDs given in the file (1 probe ID per line). See also --leash-length [default: don't start from a node, explore the entire graph]") do |arg|
    raise "Cannot specify both --probe-ids and --probe-ids-file sorry" if options[:interesting_probes]
    options[:interesting_probes] = []
    log.info "Reading probe IDs from file: `#{arg}'"
    File.foreach(arg) do |line|
      line.strip!
      next if line == '' or line.nil?
      read_id = line.to_i
      if read_id.to_s != line or read_id < 1 or read_id.nil?
        raise "Unable to parse probe ID #{line}, from file #{arg}, cannot continue"
      end
      options[:interesting_probes].push read_id
    end
    log.info "Read #{options[:interesting_probes].length} probes in"
  end
  optparse_object.on("--probe-names-file PROBE_NAMES_FILE", String, "explore from the probe names (i.e. the first word in the fasta/fastq header) given in the file (1 probe name per line). See also --leash-length [default: don't start from a node, explore the entire graph]") do |arg|
    raise "Cannot specify any two of --probe-names-file, --probe-ids and --probe-ids-file sorry" if options[:interesting_probes]
    options[:interesting_probe_names] = []
    log.info "Reading probe names from file: `#{arg}'"
    File.foreach(arg) do |line|
      line.strip!
      next if line == '' or line.nil?
      options[:interesting_probe_names].push line.split(/\s/)[0]
    end
    log.info "Read #{options[:interesting_probe_names].length} probes names in"
  end
  optparse_object.on("--probe-to-node-map FILE", String, "Output a tab separated file containing the read IDs and their respective node IDs [default: no output]") do |arg|
    options[:probe_to_node_map] = arg
  end
  optparse_object.on("--node-ids NODE_IDS", Array, "explore from these nodes in the graph (comma separated). Node IDs are the nodes in the velvet graph. See also --leash-length [default: don't start from a node, explore the entire graph]") do |arg|
    options[:interesting_nodes] = arg.collect do |read|
      node_id = read.to_i
      if node_id.to_s != read or node_id.nil? or node_id < 1
        raise "Unable to parse node ID #{read}, from #{arg}, cannot continue"
      end
      node_id
    end
  end
  optparse_object.on("--leash-length NUM", Integer, "Don't explore too far in the graph, only this far and not much more [default: unused unless --probe-ids or --nodes is specified, otherwise #{options[:graph_search_leash_length] }]") do |arg|
    options[:graph_search_leash_length] = arg
  end
  optparse_object.on("--max-nodes NUM", Integer, "Maximum number of nodes to explore out from each probe node, or 0 for no maximum [default: #{options[:max_nodes] }]") do |arg|
    if arg==0
      options[:max_nodes] = nil
    else
      options[:max_nodes] = arg
    end
  end
end

#add_scaffold_options(optparse_object, options) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/finishm/visualise.rb', line 63

def add_scaffold_options(optparse_object, options)
  optparse_object.on("--genomes FASTA_1[,FASTA_2...]", Array, "Fasta files of genomes used in the assembly. Required if --scaffolds is given [default: unused]") do |arg|
    options[:assembly_files] = arg
  end
  optparse_object.on("--scaffolds SIDE_1[,SIDE_2...]", Array, "explore from these scaffold ends e.g 'contig1s' for the start of contig1, 'contig1e' for the end of contig1, and 'contig1,contig3e' for both sides of contig1 and the end of contig3 [default: unused]") do |arg|
    options[:scaffold_sides] = arg.collect do |side|
      if side.match(/[se]$/)
        side
      else
        ["#{side}s","#{side}e"]
      end
    end.flatten
  end
  optparse_object.on("--overhang NUM", Integer, "Start assembling this far from the ends of the contigs [default: #{options[:contig_end_length]}]") do |arg|
    options[:contig_end_length] = arg.to_i
  end
end

#add_visualisation_options(optparse_object, options) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
# File 'lib/finishm/visualise.rb', line 45

def add_visualisation_options(optparse_object, options)
  optparse_object.on("--assembly-svg PATH", "Output assembly as a SVG file [default: off]") do |arg|
    options[:output_graph_svg] = arg
  end
  optparse_object.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
    options[:output_graph_png] = arg
  end
  optparse_object.on("--assembly-dot PATH", "Output assembly as a DOT file [default: off]") do |arg|
    options[:output_graph_dot] = arg
  end
end

#create_graphviz_output(finishm_graph, options) ⇒ Object



215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# File 'lib/finishm/visualise.rb', line 215

def create_graphviz_output(finishm_graph, options)
  log.info "Converting assembly to a graphviz.."
  gv = Bio::Assembly::ABVisualiser.new.graphviz(finishm_graph.graph, {
    :start_node_ids => options[:start_node_ids],
    :nodes => options[:nodes],
    :end_node_ids => options[:end_node_ids],
    :paired_nodes_hash => options[:paired_nodes_hash],
    :node_id_to_nickname => options[:node_id_to_nickname]
    })

  # Convert gv object to something actually pictorial
  if options[:output_graph_png]
    log.info "Writing PNG #{options[:output_graph_png] }"
    gv.output :png => options[:output_graph_png], :use => :neato
  end
  if options[:output_graph_svg]
    log.info "Writing SVG #{options[:output_graph_svg] }"
    gv.output :svg => options[:output_graph_svg], :use => :neato
  end
  if options[:output_graph_dot]
    log.info "Writing DOT #{options[:output_graph_dot] }"
    gv.output :dot => options[:output_graph_dot], :use => :neato
  end
end

#find_paired_end_linkages(finishm_graph, node_array) ⇒ Object



389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
# File 'lib/finishm/visualise.rb', line 389

def find_paired_end_linkages(finishm_graph, node_array)
  return {} if @finder.nil?

  paired_end_links = {}
  node_array.each do |node|
    paired_end_links[node.node_id] = []
    [Bio::Velvet::Graph::OrientedNodeTrail::START_IS_FIRST,
      Bio::Velvet::Graph::OrientedNodeTrail::END_IS_FIRST].each do |direction|
        onode = Bio::Velvet::Graph::OrientedNodeTrail::OrientedNode.new(node, direction)

        paired_end_links[node.node_id].push @finder.neighbours(onode).collect{|n| n.node.node_id}.uniq
      end
    paired_end_links[node.node_id].flatten!
  end
  return paired_end_links
end

#generate_graph_from_assembly(read_input, options) ⇒ Object



278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
# File 'lib/finishm/visualise.rb', line 278

def generate_graph_from_assembly(read_input, options)
  # Parse the genome fasta file in
  genomes = Bio::FinishM::InputGenome.parse_genome_fasta_files(
    options[:assembly_files],
    options[:contig_end_length],
    options
    )

  # Create hash of contig end name to probe index
  contig_name_to_probe = {}
  genomes.each do |genome|
    genome.scaffolds.each_with_index do |swaff, scaffold_index|
      probes = [
        genome.first_probe(scaffold_index),
        genome.last_probe(scaffold_index)
        ]
      probes.each do |probe|
        key = nil
        if probe.side == :start
          key = "#{probe.contig.scaffold.name}s"
        elsif probe.side == :end
          key = "#{probe.contig.scaffold.name}e"
        else
          raise "Programming error"
        end

        if contig_name_to_probe.key?(key)
          log.error "Encountered multiple contigs with the same name, this might cause problems, so quitting #{key}"
        end
        contig_name_to_probe[key] = probe.index
      end
    end
  end

  # Gather a list of probe indexes that are of interest to the user
  interesting_probe_ids = []
  if options[:scaffold_sides]
    # If looking at specified ends
    nodes_to_start_from = options[:scaffold_sides].collect do |side|
      if probe = contig_name_to_probe[side]
        interesting_probe_ids << probe
      else
        raise "Unable to find scaffold side in given genome: #{side}"
      end
    end
    log.info "Found #{interesting_probe_ids.length} scaffold sides in the assembly of interest"
  else
    # else looking at all the contig ends in all the genomes
    interesting_probe_ids = contig_name_to_probe.values
    log.info "Visualising all #{interesting_probe_ids.length} contig ends in all genomes"
  end

  # Generate the graph
  probe_sequences = genomes.collect{|genome| genome.probe_sequences}.flatten
  finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph(probe_sequences, read_input, options)

  # Convert probe IDs into node IDs
  interesting_node_ids = interesting_probe_ids.collect do |pid|
    finishm_graph.probe_nodes[pid].node_id
  end.uniq

  # create a nickname hash, id of node to name. Include all nodes even if they weren't specified directly (they only get visualised if they are within leash length of another)
  node_id_to_nickname = {}
  contig_name_to_probe.each do |name, probe|
    key = finishm_graph.probe_nodes[probe].node_id
    if node_id_to_nickname.key?(key)
      node_id_to_nickname[key] += " "+name
    else
      node_id_to_nickname[key] = name
    end
  end

  return finishm_graph, interesting_node_ids, node_id_to_nickname
end

#generate_graph_from_nodes(read_input, options) ⇒ Object



264
265
266
267
268
269
270
271
272
273
274
275
276
# File 'lib/finishm/visualise.rb', line 264

def generate_graph_from_nodes(read_input, options)
  # Looking based on nodes
  if options[:interesting_nodes].length > 5
    log.info "Targeting #{options[:interesting_nodes].length} nodes #{options[:interesting_nodes][0..4].join(', ') }, ..."
  else
    log.info "Targeting #{options[:interesting_nodes].length} node(s) #{options[:interesting_nodes].inspect}"
  end

  finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph([], read_input, options)
  interesting_node_ids = options[:interesting_nodes]

  return finishm_graph, interesting_node_ids
end

#generate_graph_from_probes(read_input, options) ⇒ Object



240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# File 'lib/finishm/visualise.rb', line 240

def generate_graph_from_probes(read_input, options)
    # Looking based on probes
  if options[:interesting_probe_names]
    if options[:interesting_probe_names].length > 5
      log.info "Targeting #{options[:interesting_probe_names].length} probes #{options[:interesting_probe_names][0..4].join(', ') }, ..."
    else
      log.info "Targeting #{options[:interesting_probe_names].length} probes #{options[:interesting_probe_names].inspect}"
    end
    options[:probe_read_names] = options[:interesting_probe_names]
  else
    if options[:interesting_probes].length > 5
      log.info "Targeting #{options[:interesting_probes].length} probes #{options[:interesting_probes][0..4].join(', ') }, ..."
    else
      log.info "Targeting #{options[:interesting_probes].length} probes #{options[:interesting_probes].inspect}"
    end
    options[:probe_reads] = options[:interesting_probes]
  end

  finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph([], read_input, options)
  interesting_node_ids = finishm_graph.probe_nodes.reject{|n| n.nil?}.collect{|node| node.node_id}

  return finishm_graph, interesting_node_ids
end

#get_nodes_within_leash(finishm_graph, node_ids, options = {}) ⇒ Object



353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
# File 'lib/finishm/visualise.rb', line 353

def get_nodes_within_leash(finishm_graph, node_ids, options={})
  log.info "Finding nodes within the leash length of #{options[:graph_search_leash_length] } with maximum node count #{options[:max_nodes] }.."
  dijkstra = Bio::AssemblyGraphAlgorithms::Dijkstra.new

  @finder = Bio::FinishM::PairedEndNeighbourFinder.new(finishm_graph, 500) #TODO: this hard-coded 100 isn't great here
  @finder.min_adjoining_reads = options[:min_adjoining_reads]
  @finder.max_adjoining_node_coverage = options[:max_adjoining_node_coverage]

  nodes_within_leash_hash = dijkstra.min_distances_from_many_nodes_in_both_directions(
    finishm_graph.graph, node_ids.collect{|n| finishm_graph.graph.nodes[n]}, {
      :ignore_directions => true,
      :leash_length => options[:graph_search_leash_length],
      :max_nodes => options[:max_nodes],
      :neighbour_finder => @finder
      })
  nodes_within_leash = nodes_within_leash_hash.keys.collect{|k| finishm_graph.graph.nodes[k[0]]}
  log.info "Found #{nodes_within_leash.collect{|o| o.node_id}.uniq.length} node(s) within the leash length"

  # These nodes are at the end of the leash - a node is in here iff
  # it has a neighbour that is not in the nodes_within_leash
  node_ids_at_leash = Set.new
  nodes_within_leash_hash.keys.each do |node_and_direction|
    # Add it to the set if 1 or more nieghbours are not in the original set
    node = finishm_graph.graph.nodes[node_and_direction[0]]
    onode = Bio::Velvet::Graph::OrientedNodeTrail::OrientedNode.new node, node_and_direction[1]
    onode.next_neighbours(finishm_graph.graph).each do |oneigh|
      if !nodes_within_leash_hash.key?(oneigh.to_settable)
        node_ids_at_leash << node_and_direction[0]
        break #it only takes one to be listed
      end
    end
  end

  return nodes_within_leash.uniq, node_ids_at_leash.to_a.uniq
end

#run(options, argv) ⇒ Object



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/finishm/visualise.rb', line 167

def run(options, argv)
  read_input = Bio::FinishM::ReadInput.new
  read_input.parse_options options

  # Generate the assembly graph
  log.info "Reading in or generating the assembly graph"

  if options[:interesting_probes] or options[:interesting_probe_names]
    finishm_graph, interesting_node_ids = generate_graph_from_probes(read_input, options)

    if (options[:interesting_probes] or options[:interesting_probe_names]) and options[:probe_to_node_map]
      write_probe_to_node_map(options[:probe_to_node_map], finishm_graph, options[:interesting_probes])
    end
  elsif options[:interesting_nodes]
    finishm_graph = generate_graph_from_nodes(read_input, options)
    interesting_node_ids = options[:interesting_nodes]
  elsif options[:assembly_files]
    finishm_graph, interesting_node_ids, node_id_to_nickname = generate_graph_from_assembly(read_input, options)
    options[:node_id_to_nickname] = node_id_to_nickname
  else
    # Visualising the entire graph
    finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph([], read_input, options)
  end


  if options[:graph_search_leash_length] and interesting_node_ids
    #log.info "Finding nodes within the leash length of #{options[:graph_search_leash_length] }.."
    nodes_within_leash, node_ids_at_leash = get_nodes_within_leash(finishm_graph, interesting_node_ids, options)
    log.info "Found #{node_ids_at_leash.length} nodes at the end of the #{options[:graph_search_leash_length] }bp leash" if options[:graph_search_leash_length]

    options.merge!({
      :start_node_ids => interesting_node_ids,
      :nodes => nodes_within_leash,
      :end_node_ids => node_ids_at_leash,

    })
  else
    options[:nodes] = finishm_graph.graph.nodes
  end

  # Determine paired-end connections
  log.info "Determining paired-end node connections.."
  paired_end_links = find_paired_end_linkages(finishm_graph, options[:nodes])
  options[:paired_nodes_hash] = paired_end_links

  create_graphviz_output(finishm_graph, options)
end

#validate_argv_length(argv) ⇒ Object



161
162
163
164
165
# File 'lib/finishm/visualise.rb', line 161

def validate_argv_length(argv)
  if argv.length != 0
    return "Dangling argument(s) found e.g. #{argv[0] }"
  end
end

#validate_assembly_options(options) ⇒ Object



154
155
156
157
158
159
# File 'lib/finishm/visualise.rb', line 154

def validate_assembly_options(options)
  # Need reads unless there is already an assembly
  unless options[:previous_assembly] or options[:previously_serialized_parsed_graph_file]
    return Bio::FinishM::ReadInput.new.validate_options(options, [])
  end
end

#validate_options(options, argv) ⇒ Object



35
36
37
38
39
40
41
42
43
# File 'lib/finishm/visualise.rb', line 35

def validate_options(options, argv)
  #TODO: give a better description of the error that has occurred
  #TODO: require reads options
  return validate_argv_length(argv) ||
      validate_visualisation_options(options) ||
      validate_probe_options(options) ||
      validate_assembly_options(options) ||
      validate_scaffold_options(options)
end

#validate_probe_options(options) ⇒ Object



148
149
150
151
152
# File 'lib/finishm/visualise.rb', line 148

def validate_probe_options(options)
  if options[:interesting_probes] and options[:interesting_nodes]
    return "Can only be interested in probes or nodes, not both, at least currently"
  end
end

#validate_scaffold_options(options) ⇒ Object



81
82
83
84
85
86
# File 'lib/finishm/visualise.rb', line 81

def validate_scaffold_options(options)
  # If scaffolds are defined, then probe genomes must also be defined
  if options[:scaffold_sides] and !options[:assembly_files]
    return "If --scaffolds is given, then --genomes must also be given"
  end
end

#validate_visualisation_options(options) ⇒ Object



57
58
59
60
61
# File 'lib/finishm/visualise.rb', line 57

def validate_visualisation_options(options)
  if options[:output_graph_png].nil? and options[:output_graph_svg].nil? and options[:output_graph_dot].nil?
    return "No visualisation output format/file given, don't know how to visualise"
  end
end

#write_probe_to_node_map(probe_to_node_map_file, finishm_graph, names) ⇒ Object

Write to a file probe_to_node_map_file a map that shows the probe ID, which node that probe is on, and the name of the probe



408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
# File 'lib/finishm/visualise.rb', line 408

def write_probe_to_node_map(probe_to_node_map_file, finishm_graph, names)
  log.info "Writing probe-to-node map to #{x}.."
  File.open(probe_to_node_map_file,'w') do |f|
    f.puts %w(probe_number probe node direction).join("\t")
    finishm_graph.probe_nodes.each_with_index do |node, i|
      if node.nil?
        f.puts [
          i+1,
          names[i],
          '-',
          '-',
          ].join("\t")
      else
        f.puts [
          i+1,
          names[i],
          node.node_id,
          finishm_graph.probe_node_directions[i] == true ? 'forward' : 'reverse',
          ].join("\t")
      end
    end
  end
end