Top Level Namespace

Defined Under Namespace

Modules: Bio, ChimericSeqs, CommonFunctions, FlAnalysis, FlnStats, FullLengtherNext, NcRna, ScbiMapreduce Classes: Cdhit, ExoBlastHit, ExonerateResult, Mapping, Mpileup, MyWorker, MyWorkerEst, MyWorkerManagerEst, MyWorkerManagerFln, Orf, Seq, Sequence, String, TestCode, UneLosHit

Constant Summary collapse

FAILED =
-5
OTHER =
-4
UNMAPPED =
-3
CHIMERA =
-2
MISASSEMBLED =
-1
UNKNOWN =
0
COMPLETE =
1
N_TERMINAL =
2
C_TERMINAL =
3
INTERNAL =
4
NCRNA =
5
CODING =
6
OPERATION =
0
QUERY =
1
TARGET =
2

Constants included from FlnStats

FlnStats::REPORT_FOLDER

Constants included from ChimericSeqs

ChimericSeqs::BEG, ChimericSeqs::HIT, ChimericSeqs::STOP

Instance Method Summary collapse

Methods included from FlnStats

#add_percentages_by_scalar, #add_percentages_by_vector, #calculate_n50_n90, #coding_stats_reptrans, #get_taxonomy, #handle_data_main_summary, #handle_data_reptrans_summary, #initialize_stats_hash, #initialize_stats_hash_reptrans, #last_stats, #sequence_stats, #summary_stats, #table_title, #write_mapping_report, #write_reptrans_stats, #write_summary_stats, #write_txt

Methods included from CommonFunctions

#check_frame_shift, #contenidos_en_prot, #corrige_frame, #extend_match, #match_with_gapped_reference, #match_with_ungapped_reference, #refine_match, #reverse_seq, #scan_sequences

Methods included from NcRna

#find_nc_rna

Methods included from FlAnalysis

#analiza_orf_y_fl, #compare_seq_length_with_subject, #determine_status, #exonerate_fix_frame_shift, #find_end, #find_start, #mark_nt_seqs, #modify_3p_align, #modify_5p_align, #save_annotations, #set_start_codon, #show_nts

Methods included from ChimericSeqs

#cluster_query_hits, #confirm_chimeras, #define_hit_limits, #define_homology_zones, #delete_zones, #do_clustal, #duplicate_hits, #fragment_chimera, #get_hits, #get_limits, #get_limits_s, #hit_is_in?, #hit_move_limits, #hit_set_q_len, #min_distance_between_homology_zones, #move_limits, #overlapping_zones, #search_chimeras, #search_ident_prots, #set_cut_positions, #set_limits

Instance Method Details

#analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering) ⇒ Object



67
68
69
70
71
72
73
74
75
76
77
# File 'lib/full_lengther_next/reptrans.rb', line 67

def analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering)
	clusters_seqs_annot_DB = clustering_by_id(seqs_annotation_DB)
	representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
	if pfam_clustering
		clusters_seqs_annot_DB = clustering_by_annot(representative_seqs_annot_DB, :pfam_id) # pfam id, fix get the annotation guide on my_worker_manager_fln (@@func_annot_type) to this scope
		representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB) # merge clusters by id and by pfam
	end
	stats_hash[key_stats] += representative_seqs_annot_DB.length
	report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
	write_fasta(representative_seqs_annot_DB, reptrans_fasta, 'w')
end

#artifact?(seq, query, db_name, db_path, options, new_seqs) ⇒ Boolean

MAIN FUNCTION

Returns:

  • (Boolean)


9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/full_lengther_next/artifacts.rb', line 9

def artifact?(seq, query, db_name, db_path, options, new_seqs)
	artifact = false
	# UNMAPPED CONTIG DETECTION
	if query.nil? && seq.unmapped? #If seq is misassembled stop chimera analisys
		seq.hit = nil
		artifact = true
		seq.type = UNMAPPED
	end

	if !query.nil?
		# MISASSEMBLED DETECTION
		if !artifact && misassembled_detection(query) #If seq is misassembled stop chimera analisys
			seq.hit = query.hits.first
			artifact = true
			seq.type = MISASSEMBLED
			seq.warnings('ERROR#1')
		end

		# OVERLAPPING HSPS ON SUBJECT DETECTION
=begin
		if !artifact
			hit_reference = query.hits.first.dup
			query, overlapping = overlapping_hsps_on_subject(query)
			if overlapping
				if query.hits.first.nil?
					seq.hit = hit_reference
				else
					seq.hit = query.hits.first
				end
				artifact = true
				seq.type = OTHER
				seq.warnings('ERROR#2')
			end
		end
=end

		# MULTIPLE HSP DETECTION
		if !artifact && multiple_hsps(query, 3)   
			seq.hit = query.hits.first
			seq.warnings('ERROR#3')
		end

		# CHIMERA DETECTION
		if !artifact && !options[:chimera].include?('d')  
			chimera = search_chimeras(seq, query, options, db_name, db_path)			
			if !chimera.nil?   
				new_seqs.concat(chimera)
				seq.db_name = db_name
				seq.type = CHIMERA
				artifact = true
			end
		end
	end
	if artifact
		if $verbose > 1
			puts seq.prot_annot_calification
		end
		seq.db_name = db_name
		seq.save_fasta = false
		seq.ignore = true
	end
	return artifact
end

#clean_by_identity(blast_result, ident) ⇒ Object



137
138
139
140
141
142
143
144
145
146
# File 'lib/full_lengther_next/blast_functions.rb', line 137

def clean_by_identity(blast_result, ident)
	blast_result.querys.each do |query|
		if !query.hits.first.nil?
			new_hits = query.hits.select{|hit| hit.ident > ident}	
			new_hits = [nil] if new_hits.empty? #When no hit, set new_hits to [nil]
			query.hits = new_hits
		end
		query.full_query_length = query.full_query_length.to_i #to_i is used to correct a scbi_blast's bug. Returns this attribute like string instead integer
	end
end

#clean_by_query_length_match(blast_result, min_len_nt) ⇒ Object



148
149
150
151
152
153
154
155
156
157
158
# File 'lib/full_lengther_next/blast_functions.rb', line 148

def clean_by_query_length_match(blast_result, min_len_nt)
	blast_result.querys.each do |query|
		if !query.hits.first.nil?
			new_hits = query.hits.select{|hit| hit.align_len * 3 > min_len_nt}	
			new_hits = [nil] if new_hits.empty? #When no hit, set new_hits to [nil]
			query.hits = new_hits
		end
		query.full_query_length = query.full_query_length.to_i #to_i is used to correct a scbi_blast's bug. Returns this attribute like string instead integer

	end
end

#clean_hsp_by_identity(hit, identity) ⇒ Object



298
299
300
301
# File 'lib/full_lengther_next/blast_functions.rb', line 298

def clean_hsp_by_identity(hit, identity)
	hit.select!{|hsp| hsp.ident >= identity}
	return hit
end

#clean_overlapping_hsps(blast_result, keep_if_diff_sense = false) ⇒ Object



161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/full_lengther_next/blast_functions.rb', line 161

def clean_overlapping_hsps(blast_result, keep_if_diff_sense = false)
	blast_result.querys.each do |query|
		if query.hits.length > 1
			query.hits.each_with_index do |hit, j|
				if hit.nil?
					next
				end
				query.hits.each_with_index do |second_hit, i|
					if second_hit.nil? || i == j #Same hit
						next
					end
					if same_query_hsp(hit, second_hit) #|| same_subject_hsp(hit, second_hit)
						if keep_if_diff_sense 
							if same_sense?(hit, second_hit) #Delete second_hit if is into the hit and has same sense 
								query.hits[i] = nil
							end
						else
							query.hits[i] = nil
						end
					end
				end
			end
			query.hits.compact!
		end
	end
end

#clean_subject_overlapping_hsps(complete_hit, cleaned_hits) ⇒ Object

COMPLEMENTARY FUNCTIONS



246
247
248
249
250
251
252
# File 'lib/full_lengther_next/blast_functions.rb', line 246

def clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
	if complete_hit.length > 1
		complete_hit, overlapping = subject_overlapping_hsps(complete_hit)
	end
	cleaned_hits.concat(complete_hit)
	return complete_hit, overlapping
end

#cluster_hsps(hsps) ⇒ Object



303
304
305
306
307
308
309
310
311
312
313
314
315
# File 'lib/full_lengther_next/blast_functions.rb', line 303

def cluster_hsps(hsps)
	hits = []
	last_acc = ''
	hsps.each do |hsp|
		if hsp.acc != last_acc
			hits << [hsp]
		else
			hits.last << hsp
		end
		last_acc = hsp.acc
	end
	return hits
end

#clustering_by_annot(seqs_with_hit, annotation_type) ⇒ Object



165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/full_lengther_next/reptrans.rb', line 165

def clustering_by_annot(seqs_with_hit, annotation_type)
	clusters = []
	annot_id = []
	no_annotation_clusters = []
	seqs_with_hit.each do |seq|
		annot = seq.functional_annotations[annotation_type]
		annot = annot.split(';').sort.join(';') if !annot.nil?
		if annot == '-' || annot.nil?
			no_annotation_clusters << [seq]
		else
			position = annot_id.index(annot)
			if position.nil?
				annot_id << annot
				clusters << [seq]
			else
				clusters[position] << seq 
			end
		end
	end
	clusters.concat(no_annotation_clusters)
	return clusters
end

#clustering_by_id(seqs_with_hit) ⇒ Object



150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/full_lengther_next/reptrans.rb', line 150

def clustering_by_id(seqs_with_hit)
	clusters=[]
	hit_id=[]
	seqs_with_hit.each do |seq|
		position=hit_id.index(seq.get_acc)
		if position.nil?
			hit_id << seq.get_acc
			clusters << [seq]
		else
			clusters[position] << seq 
		end
	end	
	return clusters
end

#count_cpu(options) ⇒ Object



206
207
208
209
210
211
212
213
214
# File 'lib/full_lengther_next/reptrans.rb', line 206

def count_cpu(options)
	cpu = 0
	if options[:workers].class.to_s == 'Array'
		cpu = options[:workers].length + 1
	else
		cpu = options[:workers]
	end
	return cpu
end

#do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) ⇒ Object

Second server to representative transcriptome



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/full_lengther_next/reptrans.rb', line 101

def do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) # Second server to representative transcriptome
	$LOG.info 'Starting server for EST analysis'
		custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker_EST.rb')
		options[:chimera] = nil #Inactive chimeras system on RepTrans, this resume the BLAST's output
	
		MyWorkerManagerEst.init_work_manager(putative_seqs, options, blast_path)
		server_EST = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerEst, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
		server_EST.chunk_size = options[:chunk_size]
		server_EST.start_server
	$LOG.info 'Closing server for EST analysis'

	seqs_with_EST, putative_seqs = MyWorkerManagerEst.get_array_seqs
	if !seqs_with_EST.empty?
		analysis_over_DB_annotated_seqs(seqs_with_EST, reptrans_fasta, cluster_EST_annotated_path, stats_hash, 'est_annotated') 
	end
	return putative_seqs
end

#do_makeblastdb(seqs, output, dbtype) ⇒ Object



35
36
37
38
39
40
41
42
43
44
# File 'lib/full_lengther_next/handle_db.rb', line 35

def do_makeblastdb(seqs, output, dbtype)
	cmd="makeblastdb -in - -out #{output} -title #{File.basename(output)} -dbtype #{dbtype} -parse_seqids"
	IO.popen(cmd,'w+') {|makedb|
		makedb.sync = true
		makedb.write(seqs)
		makedb.close_write
		puts makedb.readlines
		makedb.close_read
	}
end

#filter_hits(query, select_hits = 10) ⇒ Object

Select best hits



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/full_lengther_next/blast_functions.rb', line 4

def filter_hits(query, select_hits=10) # Select best hits
	hits = query.hits
	if !hits.first.nil?
		hits = cluster_hsps(hits)
		hits = hits[0..select_hits]
		hits = select_hits_by_identity_query(hits)
		hits = select_hits_by_coverage_subject(hits)
	end
	if hits.empty? 
		if select_hits >= query.hits.length || select_hits >= 100 # Condition to stop a infinite recursive function
			hits = [cluster_hsps(query.hits).first]
		else
			hits = filter_hits(query, select_hits+10)
		end
	end
	return hits
end

#find_hit(hit_acc, ar_hits) ⇒ Object



317
318
319
320
321
322
323
324
325
326
# File 'lib/full_lengther_next/blast_functions.rb', line 317

def find_hit(hit_acc, ar_hits)
	selected_hit = nil
	ar_hits.each do |hit|
		if hit.first.acc == hit_acc
			selected_hit = hit
			break
		end
	end
	return selected_hit
end

#get_coverage_subject(hit) ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
# File 'lib/full_lengther_next/blast_functions.rb', line 22

def get_coverage_subject(hit)
	perc_identity = hit.align_len*100.0/hit.s_len
	if perc_identity > 100 && hit.class.to_s == 'ExoBlastHit' && !hit.q_frameshift.empty?
		hit.q_frameshift.length.times do |n| #Align len correction by frameshift. FS can create a gap in alignment adding extra aa. FS can be deletions or insertions so we check until get a perc_identity of 100
			align_len = hit.align_len- (n + 1)
			perc_identity = align_len*100.0/hit.s_len
			break if perc_identity <= 100
		end
	end 
	return perc_identity
end

#go_for_graph(sequences_by_ontologies, fpkm = {}) ⇒ Object



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/full_lengther_next/go_methods.rb', line 1

def go_for_graph(sequences_by_ontologies, fpkm = {})
	container = {}
	go_data = [
		[:function_go, 'F:'],
		[:component_go, 'C:'],
		[:process_go, 'P:']
	]

	go_data.each do |key, prefix|		
		go_ontology = sequences_by_ontologies.select{|go, seq_ids| go =~ /^#{prefix}/}
		go_names = [] 
		go_vals = []
		go_ontology.each do |go_name, seq_names|
			go_label = go_name.gsub(prefix, '')
			if fpkm.empty?
				go_vals << seq_names.length
				go_names << go_label
			else
				sum = seq_names.map{|seq_name| fpkm[seq_name].first }.inject { |sum, n| sum + n }
				if sum > 0
					go_vals << sum 
					go_names << go_label
				end
			end
		end
		go_table = []
		go_names.each_with_index do |name, index|
			go_table << [name, go_vals[index]]
		end
		go_table.sort!{|v1, v2| v2[1] <=> v1[1]}
		go_table.unshift([key.to_s, 'GO'])
		if !go_names.empty?
			container[key] = go_table 
		else
			container[key] = [
				[key.to_s, 'GO'],
				['No_data', 1]
			] 
		end
	end
	return container
end

#hsps_relationship_subject(hit) ⇒ Object



271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# File 'lib/full_lengther_next/blast_functions.rb', line 271

def hsps_relationship_subject(hit)
	hsps = []
	hit.each_with_index do |hsp, j|
		hit.each_with_index do |second_hsp, i|
			if i == j #Same hit
				next
			end
			if same_subject_hsp(hsp, second_hsp)
				if !hsps.include?([hsp, second_hsp]) && !hsps.include?([second_hsp, hsp]) # Save if no exists direct relationship or his inverse
					hsps << [hsp, second_hsp]
				end
			end
		end
	end
	return hsps
end

#load_cd_hit_sequences_names(file) ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
# File 'lib/full_lengther_next/reptrans.rb', line 120

def load_cd_hit_sequences_names(file)
	names=[]
	File.open(file).readlines.each do |line|
		if line =~ /^>/
			line.chomp!
			line.gsub!('>','')
			names << line
		end	
	end
	return names
end

#load_isoform_hash(file) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/full_lengther_next/handle_db.rb', line 3

def load_isoform_hash(file)
	isoform_hash = {}
	if File.exists?(file)
		fasta = ScbiZcatFile.new(file)
	    filtered_fasta = ''
		seq_name = nil
		seq = ''
		while !fasta.eof
		        line = fasta.readline.chomp
		        if line[0] == '>'		                
						load_seq_in_hash(seq_name, seq, isoform_hash) if !seq_name.nil?
		                seq_name = line
		                seq = ''
		        else
		                seq << line
		        end
		end
		load_seq_in_hash(seq_name, seq, isoform_hash)
	end
	return isoform_hash
end

#load_seq_in_hash(seq_name, seq, isoform_hash) ⇒ Object



25
26
27
28
29
30
31
32
33
# File 'lib/full_lengther_next/handle_db.rb', line 25

def load_seq_in_hash(seq_name, seq, isoform_hash)
	name, desc = seq_name.split(' ', 2) 
	name =~ /(\w+\|(\w+)\-\d+\|)/
	if isoform_hash[$2].nil?
		isoform_hash[$2] = ">#{$1}#{desc}\n#{seq}"
	else
		isoform_hash[$2] += "\n>#{$1}#{desc}\n#{seq}"
	end
end

#misassembled_detection(query) ⇒ Object

DETECTION FUNCTIONS



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/full_lengther_next/blast_functions.rb', line 192

def misassembled_detection(query)
	miss=false
	hits = cluster_hsps(query.hits)
	misassembled_hits = []
	hits.each do |hit|
		if hit.length > 1
			negative_frame = hit.select{|hsp| hsp.q_frame < 0}
			if negative_frame.length > 0 && negative_frame.length != hit.length
				misassembled_hits << hit.first.acc
			end
		end
	end
	if misassembled_hits.length*1.0/ hits.length > 0.5
		miss = true
	else #Remove missassembled hits to avoid broken analysis
		query.hits.reverse_each do |hsp|
			if misassembled_hits.include?(hsp.acc)
				query.hits.delete(hsp)
			end
		end
	end
	return miss
end

#multiple_hsps(query, num) ⇒ Object



216
217
218
219
220
221
222
223
# File 'lib/full_lengther_next/blast_functions.rb', line 216

def multiple_hsps(query, num)
	multiple = false
	hsps = query.hits.select{|h| h.acc == query.hits.first.acc}	
	if hsps.length >= num
		multiple = true
	end
	return multiple
end

#overlapping_hsps_on_subject(query) ⇒ Object



225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/full_lengther_next/blast_functions.rb', line 225

def overlapping_hsps_on_subject(query)
	overlapping = false
	current_hit = query.hits.first.acc
	complete_hit = []
	cleaned_hits = []
	query.hits.each do |hit|
		if hit.acc != current_hit
			complete_hit, overlapping = clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
			complete_hit = []
		end
		complete_hit << hit
		current_hit = hit.acc
	end
	complete_hit, overlapping = clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
	query.hits = cleaned_hits
	return query, overlapping
end

#reduce_pool_sequences(putative_seqs, main_path, cpu) ⇒ Object



88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/full_lengther_next/reptrans.rb', line 88

def reduce_pool_sequences(putative_seqs, main_path, cpu)
	temp_fasta = File.join(main_path, 'temp.fasta')
	temp_fasta_clean = File.join(main_path, 'temp_cln.fasta')
	log_file = File.join(main_path, 'log_cd_hit_Cod_Unk')
	write_fasta(putative_seqs, temp_fasta, 'w')
	$LOG.info "Start cd-hit with coding and unknow sequences"	
	system("cd-hit -i #{temp_fasta} -o #{temp_fasta_clean} -c 0.95 -M 0 -T #{cpu} > #{log_file}") if !File.exists?(temp_fasta_clean)
	$LOG.info "Ended cd-hit with coding and unknow sequences"	
	cd_hit_names_putative_seqs = load_cd_hit_sequences_names(temp_fasta_clean)
	putative_seqs = select_seqs_with_name(putative_seqs, cd_hit_names_putative_seqs)
	return putative_seqs
end

#report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB) ⇒ Object



79
80
81
82
83
84
85
86
# File 'lib/full_lengther_next/reptrans.rb', line 79

def	report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
	cluster_file = File.open(cluster_file_path, 'w')
	representative_seqs_annot_DB.each_with_index do |rep_seq, i|
		cluster_seqs = clusters_seqs_annot_DB[i].map{|seq| seq.seq_name}.join(';')
		cluster_file.puts "#{rep_seq.seq_name}\t#{cluster_seqs}"
	end
	cluster_file.close
end

#reptrans(seqs_annotation_prot, seqs_some_coding, seqs_unknown, options) ⇒ Object

MAIN FUNCTION



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/full_lengther_next/reptrans.rb', line 9

def reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
	cpus = count_cpu(options)
	stats_hash = initialize_stats_hash_reptrans
	# Paths
	#---------------------------------------------	
	main_path = File.join(Dir.pwd, 'fln_results')
	reptrans_fasta = File.join(main_path, 'Representative_transcriptome.fasta')
	blast_path = File.join(main_path, 'ESTdb')
	cluster_prot_annotated_path =File.join(main_path, 'Prot_clusters')
	cluster_EST_annotated_path =File.join(main_path, 'EST_clusters')
	html_file = File.join(main_path, 'Representative_transcriptome_stats.html')
	txt_file = File.join(main_path, 'Representative_transcriptome_stats.txt')

	# Prot annotations sequence analysis
	#---------------------------------------------
	analysis_over_DB_annotated_seqs(seqs_annotation_prot, reptrans_fasta, cluster_prot_annotated_path, stats_hash, 'prot_annotated', options[:high_clustering])
	seqs_annotation_prot = nil

	# NOT Prot annotations sequence analysis
	#---------------------------------------------
	putative_seqs = seqs_some_coding	
	if !options[:est_db].nil? # WITH EST DATABASE
		putative_seqs += seqs_unknown # Coding & unknown 
		putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus)
		if !File.exists?(blast_path +'.nsq')
			$LOG.info "Start makeblastdb over EST DB"
			system("makeblastdb -in #{options[:est_db]} -out #{blast_path} -dbtype nucl -parse_seqids > #{File.join(main_path, 'log_makeblast_db')}")
			$LOG.info "Ended makeblastdb over EST DB"
		end
		putative_seqs = do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash)
	end

	# Coding sequence analysis
	#---------------------------------------------
	if !putative_seqs.nil? && !putative_seqs.empty?
		putative_seqs = select_seqs_more_500pb(putative_seqs)
		putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus) if options[:est_db].nil? # NOT EST database
		putative_seqs.sort!{|s1, s2| #Order by testcode (first) and sequence length (last)
			if s2.t_code == s1.t_code
				s2.fasta_length <=> s1.fasta_length
			else
				s2.t_code <=> s1.t_code
			end
		}
		count = 0
		putative_seqs.each do |coding_seq|
			coding_stats_reptrans(coding_seq, stats_hash)
			count +=1
		end
 
		write_fasta(putative_seqs, reptrans_fasta, 'a')
	end
	write_reptrans_stats(stats_hash, html_file, txt_file)
end

#same_query_hsp(hit, second_hit) ⇒ Object



117
118
119
120
121
122
123
124
125
# File 'lib/full_lengther_next/blast_functions.rb', line 117

def same_query_hsp(hit, second_hit)
	same = false
	if hit.acc == second_hit.acc
		if hit.q_beg <= second_hit.q_beg && hit.q_end >= hit.q_end && (second_hit.q_beg - hit.q_end).abs > 1
			same = true
		end
	end
	return same		
end

#same_sense?(hit, second_hit) ⇒ Boolean

Returns:

  • (Boolean)


127
128
129
130
131
132
133
134
135
# File 'lib/full_lengther_next/blast_functions.rb', line 127

def same_sense?(hit, second_hit)
	same= false
	hit_sense = hit.q_frame <=> 0
	second_hit_sense = second_hit.q_frame <=> 0
	if hit_sense == second_hit_sense
		same = true
	end
	return same
end

#same_subject_hsp(hit, second_hit) ⇒ Object



107
108
109
110
111
112
113
114
115
# File 'lib/full_lengther_next/blast_functions.rb', line 107

def same_subject_hsp(hit, second_hit)
	same = false
	if hit.acc == second_hit.acc
		if hit.s_beg <= second_hit.s_beg && hit.s_end >= hit.s_end && (second_hit.s_beg - hit.s_end).abs > 1
			same = true
		end
	end
	return same
end

#select_hits_by_coverage_subject(hits) ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/full_lengther_next/blast_functions.rb', line 34

def select_hits_by_coverage_subject(hits)
	selected_hits = []
	coverage_thresold = get_coverage_subject(hits.first.first)
	coverage_thresold = 100 if coverage_thresold > 100

	hits.map{|hit|
		hit.each do |hsp|
			coverage = get_coverage_subject(hsp)
			if coverage > 100
				next
			end
			if  coverage >= coverage_thresold
				selected_hits << hit
				break
			end
		end	
	}
	return selected_hits
end

#select_hits_by_evalue(hits, evalue) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
# File 'lib/full_lengther_next/blast_functions.rb', line 68

def select_hits_by_evalue(hits, evalue)
	selected_hits = []
	hits.map{|hit|
		hit.each do |hsp|
			if hsp.e_val <= evalue 
				selected_hits << hit
			end
		end
	}
	return selected_hits
end

#select_hits_by_identity_query(hits) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/full_lengther_next/blast_functions.rb', line 54

def select_hits_by_identity_query(hits)
	selected_hits = []
	identity = hits.first.first.ident
	hits.map{|hit|
		hit.each do |hsp|
			if hsp.ident >= identity
				selected_hits << hit
				break
			end
		end
	}
	return selected_hits
end

#select_hsps_by_id(hits, selected_ids) ⇒ Object



80
81
82
83
84
85
86
87
88
# File 'lib/full_lengther_next/blast_functions.rb', line 80

def select_hsps_by_id(hits, selected_ids)
	selected_hits = []
	hits.map{|hsp|
		if selected_ids.include?(hsp.acc)
			selected_hits << hsp
		end
	}
	return selected_hits
end

#select_representative(clusters_seqs_annot_prot) ⇒ Object



188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/full_lengther_next/reptrans.rb', line 188

def select_representative(clusters_seqs_annot_prot)
	seqs = []	
	clusters_seqs_annot_prot.each do |cluster|
		if !cluster.first.coverage_analysis.empty? # filtering by mapping coverage
			max_transcript_mean_coverage = cluster.map{|seq| seq.coverage_analysis[3] }.max - 0.05 # Relaxed limit of 5%
			cluster.select!{|seq| seq.coverage_analysis[3] >= max_transcript_mean_coverage}
		end
		seq = cluster.select{|s| s.type == COMPLETE}.sort{|fl1, fl2| fl2.seq_fasta <=> fl1.seq_fasta}.first # Take longest full-length, s -> sequence, fl -> full-lentgh
		if seq.nil?
			cluster.sort!{|cl1, cl2| cl2.get_pident <=> cl1.get_pident}
			best_pident = cluster.first.get_pident
			seq = cluster.select{|s| s.get_pident == best_pident}.sort{|s1, s2| s2.seq_fasta <=> s1.seq_fasta}.first
		end
		seqs << seq
	end
	return seqs
end

#select_seqs_more_500pb(seqs_array) ⇒ Object



132
133
134
135
# File 'lib/full_lengther_next/reptrans.rb', line 132

def select_seqs_more_500pb(seqs_array)
	seqs = seqs_array.select{|seq| seq.fasta_length > 500 }	
	return seqs
end

#select_seqs_with_name(array_seqs, array_names) ⇒ Object



137
138
139
140
# File 'lib/full_lengther_next/reptrans.rb', line 137

def select_seqs_with_name(array_seqs, array_names)
	seqs = array_seqs.select{|seq| array_names.include?(seq.seq_name)}
	return seqs
end

#set_thresold_evalue(hits) ⇒ Object



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/full_lengther_next/blast_functions.rb', line 90

def set_thresold_evalue(hits)
	evalue = 100
	hits.map{|hit|
		if hit.e_val != 0 && hit.e_val < evalue
			evalue = hit.e_val
		end
	}
	if evalue == 100
		evalue = 0
	else
		exp = Math.log10(evalue).abs.to_i
		min_exp = (exp/10.0).ceil
		evalue = 10.0**-(exp-min_exp)
	end
	return evalue
end

#subject_overlapping_hsps(hit) ⇒ Object



254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
# File 'lib/full_lengther_next/blast_functions.rb', line 254

def subject_overlapping_hsps(hit)
	overlapping = false
	hsp_table = hsps_relationship_subject(hit)
	if !hsp_table.empty?
		hit = clean_hsp_by_identity(hit, 55)
		if hit.empty?
			overlapping = true
		else 
			hsp_table = hsps_relationship_subject(hit)
			if !hsp_table.empty?
				overlapping = true
			end	
		end
	end
	return hit, overlapping
end

#write_fasta(seqs_array, file_name, mode) ⇒ Object



142
143
144
145
146
147
148
# File 'lib/full_lengther_next/reptrans.rb', line 142

def write_fasta(seqs_array, file_name, mode)
	file=File.open(file_name, mode)
	seqs_array.each do |seq|
		file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
	end
	file.close
end