Class: Cdhit

Inherits:
Object
  • Object
show all
Defined in:
lib/full_lengther_next/cdhit.rb

Constant Summary collapse

# Index of the full sequence name inside each record array stored by
# hash_fasta ([name, comments, seq_fasta]).
NAME =
0
# Index of the sequence comments inside the same record array.
COMMENTS =
1
# Index of the FASTA sequence string inside the same record array.
SEQ_FASTA =
2

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(fasta_file, clust_file) ⇒ Cdhit

Returns a new instance of Cdhit.



35
36
37
38
39
# File 'lib/full_lengther_next/cdhit.rb', line 35

# Sets up the object: indexes the FASTA input and then loads the clusters.
#
# @param fasta_file FASTA input, forwarded unchanged to hash_fasta
# @param clust_file cluster report, forwarded unchanged to cd_hit_clusters
def initialize(fasta_file, clust_file)
	# The sequence index has to be in place before cd_hit_clusters runs,
	# because that method looks every cluster member up in it.
	@sequence_hash_fasta = hash_fasta(fasta_file)
	@clusters = []
	cd_hit_clusters(clust_file)
end

Instance Attribute Details

#clusters ⇒ Object

Returns the value of attribute clusters.



29
30
31
# File 'lib/full_lengther_next/cdhit.rb', line 29

# Reader for the clusters attribute.
#
# @return the value of @clusters; initialize creates it as an empty array
#   and cd_hit_clusters appends one array of sequences per cluster
def clusters
  @clusters
end

#sequence_hash_fasta ⇒ Object

Returns the value of attribute sequence_hash_fasta.



29
30
31
# File 'lib/full_lengther_next/cdhit.rb', line 29

# Reader for the sequence_hash_fasta attribute.
#
# @return the value of @sequence_hash_fasta; initialize assigns it the hash
#   returned by hash_fasta
def sequence_hash_fasta
  @sequence_hash_fasta
end

Instance Method Details

#cd_hit_clusters(clust_file) ⇒ Object



111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/full_lengther_next/cdhit.rb', line 111

# Loads a CD-HIT report and appends, for every cluster in it, one array of
# Seq objects to @clusters.
#
# Each member line is decoded with parse_member; the resulting name is used
# to look the sequence record up in @sequence_hash_fasta.
#
# @param clust_file report location, passed to Bio::CdHitReport.new
def cd_hit_clusters(clust_file)
	#require 'bio-cd-hit-report'
	Bio::CdHitReport.new(clust_file).each_cluster do |cluster|
		members = cluster.data.map do |member|
			name, master = parse_member(member)
			record = @sequence_hash_fasta[name]
			Seq.new(record[NAME], record[COMMENTS], record[SEQ_FASTA], master)
		end
		@clusters << members
	end
end

#each_cluster ⇒ Object



41
42
43
44
45
# File 'lib/full_lengther_next/cdhit.rb', line 41

# Hands every entry of @clusters to the caller's block, in stored order.
#
# @yield [cluster] one element of @clusters per iteration
# @return the result of @clusters.each
def each_cluster
	@clusters.each { |members| yield members }
end

#get_all_master ⇒ Object



92
93
94
95
96
97
98
# File 'lib/full_lengther_next/cdhit.rb', line 92

# Collects the result of get_master for every cluster visited by
# each_cluster.
#
# @return [Array] one entry per cluster, in iteration order; an entry is
#   whatever get_master returned for that cluster
def get_all_master
	[].tap do |masters|
		each_cluster { |cluster| masters << get_master(cluster) }
	end
end

#get_master(cluster) ⇒ Object



87
88
89
90
# File 'lib/full_lengther_next/cdhit.rb', line 87

# Returns the first member of the cluster whose +master+ flag is truthy.
#
# Uses +find+ so the scan stops at the first match instead of building an
# intermediate array of every flagged member.
#
# @param cluster [Enumerable] members responding to +master+
# @return the first flagged member, or nil when none is flagged
def get_master(cluster)
	cluster.find { |seq| seq.master }
end

#get_sp(cluster) ⇒ Object



100
101
102
103
104
105
106
107
108
# File 'lib/full_lengther_next/cdhit.rb', line 100

# Returns the first member of the cluster whose +db+ equals 'sp'.
#
# +find+ already yields nil when nothing matches, so no separate
# empty-check is needed.
#
# @param cluster [Enumerable] members responding to +db+
# @return the first member with db == 'sp', or nil when there is none
def get_sp(cluster)
	cluster.find { |seq| seq.db == 'sp' }
end

#hash_fasta(file) ⇒ Object



139
140
141
142
143
144
145
146
147
# File 'lib/full_lengther_next/cdhit.rb', line 139

# Indexes the sequences of a FASTA file by the first 19 characters of
# their name.
#
# Cd-hit cuts sequence names to 20 characters (counting the leading '>'),
# so 'name[0..18]' is used as the hash key to match the names found in the
# cluster report. Names sharing the same first 19 characters overwrite
# each other; the last one read wins.
#
# @param file FASTA input, passed to FastaQualFile.new
# @return [Hash] truncated name => [name, comments, seq_fasta]
def hash_fasta(file)
	sequence_hash_fasta = {}
	fqr = FastaQualFile.new(file)
	begin
		fqr.each do |name, seq_fasta, comments|
			sequence_hash_fasta[name[0..18]] = [name, comments, seq_fasta]
		end
	ensure
		# Release the reader even when iteration raises.
		fqr.close
	end
	sequence_hash_fasta
end

#master_fasta(file_name) ⇒ Object



47
48
49
50
51
52
53
54
# File 'lib/full_lengther_next/cdhit.rb', line 47

# Writes the master sequence of every cluster to a FASTA file.
#
# Each record is written as ">name comments" followed by the sequence on
# the next line. The block form of File.open closes the file even when a
# write raises (for instance when a cluster has no master).
#
# @param file_name [String] path of the file to create or overwrite
# @return [nil]
def master_fasta(file_name)
	File.open(file_name, 'w') do |fasta|
		each_cluster do |cluster|
			master = get_master(cluster)
			fasta.print '>' + master.name + ' ' + master.comments + "\n" + master.seq_fasta + "\n"
		end
	end
	nil
end

#master_to_sp_seq ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/full_lengther_next/cdhit.rb', line 56

# Promotes an 'sp' member to master in every cluster whose current master
# has a different +db+ value.
#
# Clusters whose master already has db == 'sp', or that contain no member
# with db == 'sp', are left untouched.
def master_to_sp_seq
	each_cluster do |cluster|
		next if get_master(cluster).db == 'sp'

		sp_seq = get_sp(cluster)
		next if sp_seq.nil?

		# Clear every flag first so exactly one member ends up as master.
		cluster.each { |seq| seq.master = false }
		sp_seq.master = true
	end
end

#parse_member(member) ⇒ Object



126
127
128
129
130
131
132
133
134
135
136
# File 'lib/full_lengther_next/cdhit.rb', line 126

# Extracts the sequence name and the master flag from one member line of a
# cluster report.
#
# The line is expected to hold a comma followed by the sequence name and,
# after whitespace, a trailing field; the member is a master when that
# field is exactly '*'. Every '...' and '>' in the line is removed before
# parsing. (Presumably the input looks like "0\t2799aa, >name... *" —
# confirm against the report parser.)
#
# The argument is not modified, so frozen strings are accepted. Only the
# first comma is treated as the separator, so a name that itself contains
# a comma is kept whole.
#
# @param member [String] one member line
# @return [Array] two elements: the name and true/false for the master flag
# @raise [ArgumentError] when the line contains no comma
def parse_member(member)
	cleaned = member.gsub('...', '').delete('>')
	_size, description = cleaned.split(',', 2)
	raise ArgumentError, "malformed cluster member line: #{member.inspect}" if description.nil?

	name, flag = description.split(' ', 2)
	return name, flag == '*'
end

#recover_different_lengths(percentage) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/full_lengther_next/cdhit.rb', line 70

# Collects the non-master members whose length, relative to their cluster
# master, stays below +percentage+ in both directions.
#
# For every member other than the master (compared by name) two values are
# computed: member length as a percentage of master length, and master
# length as a percentage of member length. The member is kept when both are
# below +percentage+.
#
# The percentages are computed in floating point. Integer division would
# truncate every ratio to a whole multiple of 100 and raise
# ZeroDivisionError for an empty sequence.
#
# NOTE(review): both values can only be below the threshold when
# +percentage+ is above 100 — confirm this is the intended comparison.
#
# @param percentage [Numeric] upper bound (exclusive) for both percentages
# @return [Array] the selected members, in cluster order
def recover_different_lengths(percentage)
	seqs = []
	each_cluster do |cluster|
		master = get_master(cluster)
		master_len = master.seq_fasta.length
		cluster.each do |seq|
			next if seq.name == master.name

			seq_len = seq.seq_fasta.length
			seq_mas_len = 100.0 * seq_len / master_len
			mas_seq_len = 100.0 * master_len / seq_len
			seqs << seq if mas_seq_len < percentage && seq_mas_len < percentage
		end
	end
	seqs
end