Class: Bio::FinishM::GraphGenerator

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/assembly/graph_generator.rb

Constant Summary collapse

DEFAULT_OPTIONS =
{
:velvet_kmer_size => 51,
:assembly_coverage_cutoff => 3.5,
}

Instance Method Summary collapse

Methods included from Logging

#log

Instance Method Details

#add_options(option_parser, options) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/assembly/graph_generator.rb', line 12

def add_options(option_parser, options)
  options.merge!(DEFAULT_OPTIONS)
  option_parser.on("--assembly-kmer NUMBER", "when assembling, use this kmer length [default: #{options[:velvet_kmer_size] }]") do |arg|
    options[:velvet_kmer_size] = arg.to_i
  end
  option_parser.on("--assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, all other nodes are removed [default: #{options[:assembly_coverage_cutoff] }]") do |arg|
    options[:assembly_coverage_cutoff] = arg.to_f
  end
  option_parser.on("--post-assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, implemented after assembly [default: not used]") do |arg|
    options[:post_assembly_coverage_cutoff] = arg.to_f
  end
  option_parser.on("--velvet-directory PATH", "Output assembly intermediate files to this directory [default: use temporary directory, delete afterwards]") do |arg|
    options[:output_assembly_path] = arg
  end
  option_parser.on("--already-assembled-velvet-directory PATH", "If an assembly directory has been specified previously with --velvet-directory, re-use this assembly rather than re-doing the assembly [default: off]") do |arg|
    options[:previous_assembly] = arg
  end
end

#check_probe_sequences(probe_sequences, sequence_store) ⇒ Object

When re-using an assembly, sometimes need to make sure that the probe sequences used previously are the same as what is given this time. Given am Array of probe sequences and a binary_sequence_file, check the probe sequences are the consistent.



301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/assembly/graph_generator.rb', line 301

def check_probe_sequences(probe_sequences, sequence_store)
  return true if probe_sequences.nil?

  probe_sequences.each_with_index do |probe, i|
    log.debug "Checking probe sequence \##{i+1}" if log.debug?
    if sequence_store[i+1].upcase != probe.upcase
      log.error "Probe sequence \##{i+1} has changed - perhaps the wrong velvet assembly directory was specified, or a fresh assembly is required?"
      return false
    end
  end
  log.debug "Presence of #{probe_sequences.length} probe sequences verified"
  return true
end

#generate_graph(probe_sequences, read_inputs, options = {}) ⇒ Object

Generate a ProbedGraph object, given one or more ‘probe sequences’ and metagenomic reads. This is a rather large method, but seems to be approximately repeated in different applications of FinishM, so creating it for DRY purposes.

probe_sequences: DNA sequences (as String objects whose direction points to the outsides of contigs) read_inputs: a ReadInput object, containing the information to feed to velveth

options: :probe_reads: a list of sequence numbers (numbering as per velvet Sequence file) :probe_read_names: a list of sequence names (not IDs) that are probes (convert the names to IDs using the CnyUnifiedSeqNames file). There may not be a one to one correspondence of these read names and the probe reads returned in the ProbedGraph since reads can map to multiple sequence IDs. :velvet_kmer_size: kmer :assembly_coverage_cutoff: coverage cutoff for nodes :post_assembly_coverage_cutoff: apply this coverage cutoff to nodes after parsing assembly :output_assembly_path: write assembly to this directory :previous_assembly: a velvet directory from a previous run of the same probe sequences and reads. (Don’t re-assemble) :use_textual_sequence_file: by default, a binary sequence file is used. Set this true to get velvet to generate the Sequences file :remove_unconnected_nodes: delete nodes from the graph that are not connected to the probe nodes :graph_search_leash_length: when :remove_unconnected_nodes’ing, use this leash length :dont_parse_noded_reads: if true, skip parsing noded reads (ie the positions of the reads in the graph) :dont_parse_reads: if true, skip parsing reads (ie the sequences of the reads themselves)



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# File 'lib/assembly/graph_generator.rb', line 52

def generate_graph(probe_sequences, read_inputs, options={})
  options[:parse_sequence_file] ||= true
  graph = nil
  read_probing_graph = nil
  finishm_graph = Bio::FinishM::ProbedGraph.new

  log.debug "Options for generate_graph: #{options}" if log.debug?

  velvet_binary_folder = File.join(File.dirname(__FILE__),'..','..','ext','src')
  log.debug "Using velvet binary folder #{velvet_binary_folder}" if log.debug?

  velvet_result = nil

  probe_read_ids = nil
  if options[:probe_reads]
    probe_read_ids = options[:probe_reads]
  else
    probe_read_ids = Set.new((1..probe_sequences.length))
  end
  if options[:previous_assembly].nil? #If assembly has not already been carried out
    Tempfile.open('probes.fa') do |tempfile|
      50.times do # Do 50 times to make sure that velvet doesn't throw out parts of the graph that contain this contig
        probe_sequences.each_with_index do |probe, i|
          tempfile.puts ">probe#{i}"
          tempfile.puts probe
        end
      end
      tempfile.close
      singles = read_inputs.fasta_singles
      if singles and !singles.empty?
        read_inputs.fasta_singles = [tempfile.path, singles].flatten
      else
        read_inputs.fasta_singles = [tempfile.path]
      end
      log.debug "Inputting probes into the assembly:\n#{File.open(tempfile.path).read}" if log.debug?

      runner = Bio::Velvet::Runner.new
      required_version = '1.2.10-wwood_finishm'
      found_version = runner.binary_version(File.join(velvet_binary_folder, 'velveth'))
      if found_version != required_version
        raise "Detected velvet version incompatible with FinishM: #{found_version}, expected #{required_version} which is available from https://github.com/wwood/velvet (on branch less_clipping)"
      end

      log.info "Assembling sampled reads with velvet"
      raise "Need to specify -cov_cutoff" if options[:assembly_coverage_cutoff].nil?
      raise "Need to specify a kmer size" if options[:velvet_kmer_size].nil?
      # Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1,2,... etc.
      use_binary = options[:use_textual_sequence_file] ? '' : '-create_binary'
      velvet_result = runner.velvet(
        options[:velvet_kmer_size],
        "#{read_inputs.velvet_read_arguments} #{use_binary}",
        "-read_trkg yes -cov_cutoff #{options[:assembly_coverage_cutoff] } -tour_bus no -read_to_node_binary yes",
        :output_assembly_path => options[:output_assembly_path],
        :velveth_path => File.join(velvet_binary_folder, 'velveth'),
        :velvetg_path => File.join(velvet_binary_folder, 'velvetg'),
        )
      if log.debug?
        log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
        log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
        log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
        log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
      end
      log.info "Finished running assembly"
      finishm_graph.velvet_result_directory = velvet_result.result_directory
    end
  else
    log.info "Using previous assembly stored in #{options[:previous_assembly] }"
    velvet_result = Bio::Velvet::Result.new
    velvet_result.result_directory = options[:previous_assembly]
    finishm_graph.velvet_result_directory = velvet_result.result_directory
  end

  # Check that the probe reads given are present in the assembly passed here
  unless options[:dont_parse_reads]
    sequence_store = parse_velvet_binary_reads(velvet_result.result_directory)
    finishm_graph.velvet_sequences = sequence_store
    if !check_probe_sequences(probe_sequences, sequence_store)
      raise "Probe sequences changed since previous velvet assembly!"
    end
  end

  log.info "Parsing the graph output from velvet"
  opts = {
    # noded reads are parsed in via C, if they are wanted at all
    :dont_parse_noded_reads => true
  }
  bio_velvet_graph = Bio::Velvet::Graph.parse_from_file(
    File.join(velvet_result.result_directory, 'LastGraph'),
    opts
    )
  log.info "Finished parsing graph: found #{bio_velvet_graph.nodes.length} nodes and #{bio_velvet_graph.arcs.length} arcs"

  if options[:dont_parse_noded_reads]
    graph = bio_velvet_graph
  else
    log.info "Beginning parse of graph using velvet's parsing C code.."
    read_probing_graph = Bio::Velvet::Underground::Graph.parse_from_file File.join(velvet_result.result_directory, 'LastGraph')
    log.info "Completed velvet code parsing velvet graph"

    # Make the two graphs into a hybrid one
    graph = Bio::FinishM::HybridGraph.new(bio_velvet_graph, read_probing_graph)
  end
  finishm_graph.graph = graph

  # Find the anchor nodes again
  anchor_sequence_ids = probe_read_ids.to_a.sort
  endings = []
  unless probe_read_ids.empty? and options[:probe_read_names].nil? #don't bother trying to find probes if none exists
    # Convert read names to read IDs if required
    if options[:probe_read_names]
      # Probe reads are given as names, not IDs. What are the corresponding probes then?
      entries = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
        File.join(velvet_result.result_directory, 'CnyUnifiedSeq.names'),
        options[:probe_read_names]
        )
      anchor_sequence_ids = []
      double_counts = 0
      options[:probe_read_names].each do |name| #maintain order of them as they are specified in the original array parameter
        if entries[name].empty?
          raise "Unable to find probe `#{name}' in the probe reads file - was it included in the assembly?"
        elsif entries[name].length > 2
          raise "Found >2 sequences named #{name} in the assembly, being conservative and not continuing"
        else
          entries[name].each do |res|
            anchor_sequence_ids.push res.read_id
          end
          if entries[name].length == 2
            double_counts += 1
            log.debug "Found 2 sequences for #{name}" if log.debug?
          end
        end
      end
      if double_counts > 0
        log.info "#{double_counts} reads were found twice (likely as pairs), including both as probes"
      end
      log.info "Recovered #{anchor_sequence_ids.length} sequences using their names" if log.info?
    end


    # Parse the read to node structure
    log.info "Reading ReadToNode.bin file.." if log.info?
    finishm_graph.read_to_nodes = Bio::FinishM::ReadToNode.new(File.join(velvet_result.result_directory, 'ReadToNode.bin'))

    finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
    log.info "Finding probe nodes in the assembly"
    c_graph_endings = finder.find_probes_from_read_to_node(finishm_graph.graph, finishm_graph.read_to_nodes, anchor_sequence_ids)
    log.debug "Converting probe nodes found in C graph to Ruby analogues and adding to Ruby-parsed graph"
    endings = c_graph_endings.collect do |node_direction_read|
      if node_direction_read.empty?
        # No probe found
        []
      else #found a node.
        #equivalent node
        node = graph.nodes[node_direction_read[0].node_id]
        #equivalent direction
        direction = node_direction_read[1]
        #equivalent noded read
        nr = Bio::Velvet::Graph::NodedRead.new
        #           nr.read_id = read_id
        #           nr.offset_from_start_of_node = row[1].to_i
        #           nr.start_coord = row[2].to_i
        #           nr.direction = current_node_direction
        cnr = node_direction_read[2]
        nr.read_id = cnr.read_id
        nr.offset_from_start_of_node = cnr.offset_from_start_of_node
        nr.start_coord = cnr.start_coord
        nr.direction = direction
        # collect
        [node, direction, nr]
      end
    end
  end
  finishm_graph.probe_nodes = endings.collect{|array| array[0]}
  finishm_graph.probe_node_directions = endings.collect{|array| array[1]}
  finishm_graph.probe_node_reads = endings.collect{|array| array[2]}

  # Check to make sure the probe sequences map to nodes in the graph
  if finishm_graph.completely_probed?
    if log.info?
      found_all = true
      num_found = 0
      finishm_graph.probe_nodes.each_with_index do |probe,i|
        if probe.nil?
          found_all = false
          log.debug "Unable to recover probe ##{i+1}, perhaps this will cause problems, but proceding optimistically"
        else
          num_found += 1
        end
      end
      if found_all
        if finishm_graph.probe_nodes.empty?
          log.debug "No probes specified, so didn't find any"
        else
          log.info "Found all anchoring nodes in the graph."
        end
      else
        log.info "Found #{num_found} of #{finishm_graph.probe_nodes.length} anchoring nodes in the graph, ignoring the rest"
      end
    end
  else
    raise "Unable to find all anchor reads from the assembly, cannot continue. This is probably an error with this script, not you. Probes not found: #{finishm_graph.missing_probe_indices.inspect}"
  end

  if options[:post_assembly_coverage_cutoff]
    log.info "Removing nodes with coverage < #{options[:post_assembly_coverage_cutoff] } from graph.."
    original_num_nodes = graph.nodes.length
    original_num_arcs = graph.arcs.length
    filter = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
    filter.remove_low_coverage_nodes(graph,
      options[:post_assembly_coverage_cutoff],
      :whitelisted_sequences => Set.new(anchor_sequence_ids)
    )
    log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs, leaving #{graph.nodes.length} nodes and #{graph.arcs.length} arcs."
  end

  if options[:remove_unconnected_nodes]
    if options[:graph_search_leash_length]
      log.info "Removing nodes unconnected to probe nodes from the graph using leash #{options[:graph_search_leash_length] }.."
    else
      log.info "Removing nodes unconnected to probe nodes from the graph without using a leash.."
    end
    original_num_nodes = graph.nodes.length
    original_num_arcs = graph.arcs.length
    filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
    filter.remove_unconnected_nodes(
      graph,
      finishm_graph.probe_nodes.reject{|n| n.nil?},
      :leash_length => options[:graph_search_leash_length]
      )
    log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs, leaving #{graph.nodes.length} nodes and #{graph.arcs.length} arcs."
  end

  return finishm_graph
end

#parse_velvet_binary_reads(velvet_result_directory) ⇒ Object

Read in the reads from a velvet result



288
289
290
291
292
293
294
# File 'lib/assembly/graph_generator.rb', line 288

def parse_velvet_binary_reads(velvet_result_directory)
  sequences_file_path = File.join velvet_result_directory, 'CnyUnifiedSeq'
  log.info "Reading in the actual sequences of all reads from #{sequences_file_path}"
  sequences = Bio::Velvet::Underground::BinarySequenceStore.new sequences_file_path
  log.info "Read in #{sequences.length} sequences"
  return sequences
end