Class: Transfuse::Cluster

Inherits:
Object
  • Object
show all
Defined in:
lib/transfuse/cluster.rb

Instance Method Summary collapse

Constructor Details

#initialize(threads, verbose) ⇒ Cluster

Returns a new instance of Cluster.



8
9
10
11
12
13
14
15
16
# File 'lib/transfuse/cluster.rb', line 8

# Looks up the external clustering executables and stores the run settings.
#
# Both tools are looked up through Which.which, and the first match for each
# is kept. The sequence identity threshold is fixed at "1.00".
#
# @param threads [Object] value later interpolated into the tools' thread options
# @param verbose [Object] when truthy, progress messages are printed by other methods
# @raise [RuntimeError] if no cd-hit-est or vsearch executable is found
def initialize(threads, verbose)
  first_match = ->(tool) { Which.which(tool).first }

  @cdhit = first_match.call('cd-hit-est')
  raise "cd-hit-est was not in the PATH - please install it" unless @cdhit

  @vsearch = first_match.call('vsearch')
  raise "vsearch was not in the PATH - please install it" unless @vsearch

  @id = "1.00"
  @threads = threads
  @verbose = verbose
end

Instance Method Details

#cd_hit(fasta) ⇒ Object



29
30
31
32
33
34
35
36
37
# File 'lib/transfuse/cluster.rb', line 29

# Runs cd-hit-est on +fasta+ via a Cmd object.
#
# The output name is the input's base name (directory and extension stripped)
# followed by "_cdhit.fa", so it carries no directory component.
#
# @param fasta [String] path of the input FASTA file
# @return [String] the output name with ".clstr" appended
def cd_hit(fasta)
  puts "running cd-hit-est" if @verbose
  stem = File.basename(fasta, File.extname(fasta))
  output = "#{stem}_cdhit.fa"
  command = generate_cdhit_command(fasta, output)
  puts command if @verbose
  Cmd.new(command).run(output)
  "#{output}.clstr"
end

#generate_cdhit_command(fasta, out) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/transfuse/cluster.rb', line 48

# Assembles the cd-hit-est command line as a single string.
#
# Example of the general shape:
#   cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
#
# @param fasta [String] input path, passed to -i
# @param out [String] output path, passed to -o
# @return [String] the executable path followed by its options
def generate_cdhit_command(fasta, out)
  options = [
    "-i #{fasta}",
    "-o #{out}",
    "-c #{@id}",       # similarity = number of identical bases /
                       #              length of shorter sequences
    "-T #{@threads}",
    "-n 10",           # word length - maybe increase??
    "-d 100",          # output name width
    "-g 1",            # slower but more accurate mode
    "-M 8000"          # increase memory
  ]
  "#{@cdhit} #{options.join(' ')}"
end

#generate_vsearch_command(fasta, out) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
# File 'lib/transfuse/cluster.rb', line 62

# Assembles the vsearch command line as a single string.
#
# @param fasta [String] input path, passed to --cluster_fast
# @param out [String] output path, passed to --uc
# @return [String] the executable path followed by its options
def generate_vsearch_command(fasta, out)
  options = [
    "--cluster_fast #{fasta}",
    "--id #{@id}",
    "--iddef 0",       # cd-hit definition of sequence id
    "--qmask none",    # no masking
    "--strand both",
    "--uc #{out}",
    "--threads #{@threads}"
  ]
  "#{@vsearch} #{options.join(' ')}"
end

#parse_output(cluster_output) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/transfuse/cluster.rb', line 74

# Parses a cd-hit cluster listing into a hash of clusters.
#
# Three kinds of line are recognised:
# * ">Cluster N" switches the current cluster to N;
# * a member line ending in "at +/<pct>%" or "at -/<pct>%" adds the contig
#   with that strand;
# * a member line ending in "*" adds the contig with strand "+".
# Any other line is ignored. Members seen before the first ">Cluster" line
# are filed under cluster 0.
#
# @param cluster_output [String] path of the cluster file to read
# @return [Hash{Integer => Array<Hash>}] cluster id mapped to its members,
#   each member being { :name => String, :strand => "+" or "-" }
def parse_output(cluster_output)
  puts "parsing cd-hit output #{cluster_output}" if @verbose
  cluster_id = 0
  clusters = {}
  # File.foreach closes the file once iteration finishes, so no handle leaks.
  File.foreach(cluster_output) do |line|
    case line
    when />Cluster\ ([0-9]+)/
      cluster_id = Regexp.last_match(1).to_i
    when /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
      # The third capture (percent identity) is matched but not used.
      member = { :name => Regexp.last_match(1), :strand => Regexp.last_match(2) }
      (clusters[cluster_id] ||= []) << member
    when /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
      member = { :name => Regexp.last_match(1), :strand => "+" }
      (clusters[cluster_id] ||= []) << member
    end
  end
  clusters
end

#parse_vsearch_output(cluster_output) ⇒ Object



97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/transfuse/cluster.rb', line 97

# Parses a tab-separated vsearch cluster file into a hash of clusters.
#
# Only lines beginning with "S" or "H" are used. On those lines the second
# column (converted with to_i) is the cluster key and the ninth column is
# the contig name. All other lines are skipped.
#
# @param cluster_output [String] path of the file to read
# @return [Hash{Integer => Array<String>}] cluster number mapped to the
#   contig names assigned to it, in file order
def parse_vsearch_output(cluster_output)
  clusters = {}
  # File.foreach closes the file once iteration finishes, so no handle leaks.
  File.foreach(cluster_output) do |line|
    next unless line.start_with?("S", "H")

    cols = line.chomp.split("\t")
    (clusters[cols[1].to_i] ||= []) << cols[8]
  end
  clusters
end

#run(fasta) ⇒ Object



18
19
20
21
22
23
24
25
26
27
# File 'lib/transfuse/cluster.rb', line 18

# Clusters the sequences in +fasta+ and returns the parsed clusters.
#
# By default vsearch is run and its output parsed with parse_vsearch_output.
# Passing use_cd_hit: true runs cd_hit and parses with parse_output instead.
# This choice used to be a hard-coded local, which made the cd-hit path
# unreachable.
#
# @param fasta [String] path of the input FASTA file
# @param use_cd_hit [Boolean] select cd-hit-est instead of vsearch
# @return [Hash] whatever the selected parser returns
def run(fasta, use_cd_hit: false)
  return parse_output(cd_hit(fasta)) if use_cd_hit

  parse_vsearch_output(vsearch(fasta))
end

#vsearch(fasta) ⇒ Object



39
40
41
42
43
44
45
46
# File 'lib/transfuse/cluster.rb', line 39

# Runs vsearch on +fasta+ via a Cmd object.
#
# The cluster file name is the input path with ".clust" appended.
#
# @param fasta [String] path of the input FASTA file
# @return [String] path of the cluster file
def vsearch(fasta)
  puts "running vsearch" if @verbose
  clust_path = "#{fasta}.clust"
  command = generate_vsearch_command(fasta, clust_path)
  Cmd.new(command).run(clust_path)
  clust_path
end