Class: Transfuse::Cluster
- Inherits:
  - Object
- Inheritance hierarchy:
  - Object
  - Transfuse::Cluster
- Defined in:
- lib/transfuse/cluster.rb
Instance Method Summary
- #cd_hit(fasta) ⇒ Object
- #generate_cdhit_command(fasta, out) ⇒ Object
- #generate_vsearch_command(fasta, out) ⇒ Object
- #initialize(threads, verbose) ⇒ Cluster (constructor): a new instance of Cluster.
- #parse_output(cluster_output) ⇒ Object
- #parse_vsearch_output(cluster_output) ⇒ Object
- #run(fasta) ⇒ Object
- #vsearch(fasta) ⇒ Object
Constructor Details
#initialize(threads, verbose) ⇒ Cluster
Returns a new instance of Cluster.
8 9 10 11 12 13 14 15 16 |
# File 'lib/transfuse/cluster.rb', line 8

# Locates the external clustering executables and stores the run settings.
#
# Both cd-hit-est and vsearch are looked up through Which::which; the first
# hit of each lookup is kept. The identity threshold is fixed at "1.00".
#
# @param threads value later interpolated into the tools' thread options
# @param verbose when truthy, other methods print progress messages
# @raise [RuntimeError] if cd-hit-est or vsearch is not found
def initialize(threads, verbose)
  @cdhit = Which.which('cd-hit-est').first
  unless @cdhit
    raise "cd-hit-est was not in the PATH - please install it"
  end

  @vsearch = Which.which('vsearch').first
  unless @vsearch
    raise "vsearch was not in the PATH - please install it"
  end

  @id = "1.00"
  @threads = threads
  @verbose = verbose
end
Instance Method Details
#cd_hit(fasta) ⇒ Object
29 30 31 32 33 34 35 36 37 |
# File 'lib/transfuse/cluster.rb', line 29

# Runs cd-hit-est on +fasta+ through a Cmd object.
#
# The output name is the input's basename with its extension stripped and
# "_cdhit.fa" appended, so it carries no directory component.
#
# @param fasta [String] path to the input fasta file
# @return [String] the output name with ".clstr" appended
def cd_hit(fasta)
  puts "running cd-hit-est" if @verbose

  stem = File.basename(fasta, File.extname(fasta))
  clustered_fasta = "#{stem}_cdhit.fa"

  command = generate_cdhit_command(fasta, clustered_fasta)
  puts command if @verbose

  Cmd.new(command).run(clustered_fasta)
  "#{clustered_fasta}.clstr"
end
#generate_cdhit_command(fasta, out) ⇒ Object
48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/transfuse/cluster.rb', line 48

# Builds the cd-hit-est command line as a single string.
#
# Example of the intended shape:
#   cd-hit-est -i all.fa -o cd-hit-clusters.txt -c 0.99999 -T 24 -d 100
#
# @param fasta path of the input fasta, interpolated after -i
# @param out output path, interpolated after -o
# @return [String] a newly built command string (@cdhit itself is not modified)
def generate_cdhit_command(fasta, out)
  parts = [
    "#{@cdhit}",
    "-i #{fasta}",
    "-o #{out}",
    # similarity = number of identical bases / length of shorter sequences
    "-c #{@id}",
    "-T #{@threads}",
    "-n 10",   # word length - maybe increase??
    "-d 100",  # output name width
    "-g 1",    # slower but more accurate mode
    "-M 8000"  # increase memory
  ]
  parts.join(" ")
end
#generate_vsearch_command(fasta, out) ⇒ Object
62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/transfuse/cluster.rb', line 62

# Builds the vsearch --cluster_fast command line as a single string.
#
# @param fasta path of the input fasta, interpolated after --cluster_fast
# @param out path interpolated after --uc
# @return [String] a newly built command string (@vsearch itself is not modified)
def generate_vsearch_command(fasta, out)
  options = [
    "--cluster_fast #{fasta}",
    "--id #{@id}",
    "--iddef 0",     # cd-hit definition of sequence id
    "--qmask none",  # no masking
    "--strand both",
    "--uc #{out}",
    "--threads #{@threads}"
  ]
  ["#{@vsearch}", *options].join(" ")
end
#parse_output(cluster_output) ⇒ Object
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/transfuse/cluster.rb', line 74

# Parses a cd-hit-est cluster listing into a hash of clusters.
#
# A line matching ">Cluster N" selects the current cluster id. Each following
# member line appends one entry to that cluster:
# * lines ending in "at +/P%" or "at -/P%" take their strand from the line;
# * lines ending in "*" are recorded with strand "+".
# Lines matching none of the patterns are ignored.
#
# @param cluster_output [String] path of the cluster file to read
# @return [Hash{Integer => Array<Hash>}] cluster id => entries of the form
#   { :name => contig name, :strand => "+" or "-" }
def parse_output(cluster_output)
  puts "parsing cd-hit output #{cluster_output}" if @verbose
  cluster_id = 0
  clusters = {}
  # File.foreach closes the file once iteration ends (or raises), unlike a
  # bare File.open(...).each_line which leaves the handle open.
  File.foreach(cluster_output) do |line|
    member =
      case line
      when />Cluster\ ([0-9]+)/
        cluster_id = Regexp.last_match(1).to_i
        nil
      when /[0-9]+\s+.+nt,\ >(.+)\.\.\.\sat\s([+\-])\/([0-9\.]+)\%/
        { :name => Regexp.last_match(1), :strand => Regexp.last_match(2) }
      when /[0-9]+\s+[0-9]+nt,\s>(.+)\.\.\.\s\*/
        { :name => Regexp.last_match(1), :strand => "+" }
      end
    (clusters[cluster_id] ||= []) << member if member
  end
  clusters
end
#parse_vsearch_output(cluster_output) ⇒ Object
97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/transfuse/cluster.rb', line 97

# Parses a tab-separated vsearch cluster file into a hash of clusters.
#
# Only lines starting with "S" or "H" are used. Column 2 (converted with
# to_i) is the cluster key and column 9 is the contig name. Unlike
# parse_output, entries are plain names with no strand information.
#
# @param cluster_output [String] path of the file to read
# @return [Hash{Integer => Array<String>}] cluster number => contig names
def parse_vsearch_output(cluster_output)
  clusters = {}
  # File.foreach closes the file once iteration ends (or raises), unlike a
  # bare File.open(...).each_line which leaves the handle open.
  File.foreach(cluster_output) do |line|
    next unless line.start_with?("S", "H")

    cols = line.chomp.split("\t")
    (clusters[cols[1].to_i] ||= []) << cols[8]
  end
  clusters
end
#run(fasta) ⇒ Object
18 19 20 21 22 23 24 25 26 27 |
# File 'lib/transfuse/cluster.rb', line 18

# Clusters the sequences in +fasta+ and returns the parsed clusters.
#
# The backend used to be selected by a local hard-coded to false, which made
# the cd-hit branch unreachable. It is now a keyword argument with the same
# default, so existing callers still get vsearch.
#
# @param fasta [String] path to the input fasta file
# @param use_cd_hit [Boolean] when truthy, cluster with cd-hit-est and parse
#   with parse_output; otherwise use vsearch and parse_vsearch_output
# @return [Hash] the value returned by the selected parser
def run(fasta, use_cd_hit: false)
  if use_cd_hit
    parse_output(cd_hit(fasta))
  else
    parse_vsearch_output(vsearch(fasta))
  end
end
#vsearch(fasta) ⇒ Object
39 40 41 42 43 44 45 46 |
# File 'lib/transfuse/cluster.rb', line 39

# Runs vsearch on +fasta+ through a Cmd object.
#
# The cluster file name is the input path with ".clust" appended. When
# @verbose is set the generated command is echoed, matching what cd_hit does.
#
# @param fasta [String] path to the input fasta file
# @return [String] path of the cluster output file
def vsearch(fasta)
  puts "running vsearch" if @verbose
  cluster_output = "#{fasta}.clust"
  vsearch_cmd = generate_vsearch_command(fasta, cluster_output)
  puts vsearch_cmd if @verbose
  Cmd.new(vsearch_cmd).run(cluster_output)
  cluster_output
end