Module: FlAnalysis

Defined in:
lib/full_lengther_next/fl_analysis.rb

Instance Method Summary collapse

Instance Method Details

#analiza_orf_y_fl(seq, hit, options, db_name) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/full_lengther_next/fl_analysis.rb', line 8

def analiza_orf_y_fl(seq, hit, options, db_name)
	query_fasta = seq.seq_fasta.upcase.dup # Upcase for prevents complications with masked sequences, dup for discard changes
	if hit.count > 1 # if the sequence has more than one hit, the frames are checked and fixed to get a single hit
			seq_unida = UneLosHit.new(hit, query_fasta)
			full_prot =		seq_unida.full_prot	
			query_fasta =	seq_unida.output_seq	# repaired fasta
			final_hit =		seq_unida.final_hit		# single hit
			$global_warnings +=	seq_unida.msgs		# warning messages
	else
		query_fasta = reverse_seq(query_fasta, hit.first) if hit.first.q_frame < 0 # si la secuencia esta al reves le damos la vuelta
		final_hit = hit.first # single hit
	end
	query_fasta = exonerate_fix_frame_shift(query_fasta, hit) if options[:exonerate]	

	full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
	original_query_coordinates = [final_hit.q_beg, final_hit.q_end] ## VERBOSE
	seq.show_alignment(final_hit, query_fasta, show_nts) if  $verbose > 2 ## VERBOSE
	atg_status, tmp_prot = set_start_codon(final_hit, options[:distance], full_prot, query_fasta)
	end_status, final_prot = find_end(final_hit, options[:distance], tmp_prot, query_fasta)

	puts "\n------------------- POST EXTENSION---------------------" if $verbose > 1 ## VERBOSE
	seq.show_alignment(final_hit, query_fasta, show_nts, original_query_coordinates) if  $verbose > 1 ## VERBOSE
	puts "ATG: #{atg_status}  STOP: #{end_status}" if  $verbose > 2 ## VERBOSE

	# decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
	type, status = determine_status(atg_status, end_status)
	status = compare_seq_length_with_subject(final_prot, options[:distance], final_hit, type, status)
	if final_prot.length >= 25 && final_prot.length.to_f/final_hit.full_subject_length >= options[:subject_coverage] # Prot length min of 25 aa and subject coverage by generated prot of 25%
		save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
	end
end

#compare_seq_length_with_subject(final_prot, distance, final_hit, type, status) ⇒ Object



183
184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/full_lengther_next/fl_analysis.rb', line 183

def compare_seq_length_with_subject(final_prot, distance, final_hit, type, status)
	if final_prot.length - 2 * distance > final_hit.s_len
		$global_warnings << ['SeqLonger', final_prot.length, final_hit.s_len]
	elsif final_prot.length + 2 * distance < final_hit.s_len
		$global_warnings << ['SeqShorter', final_prot.length, final_hit.s_len]
		if final_prot.length + 100 < final_hit.s_len || final_prot.length*2 < final_hit.s_len				
			if type == COMPLETE
				status = false
				$global_warnings << 'VeryShorter'
			end
		end
	end
	return status
end

#determine_status(atg_status, end_status) ⇒ Object



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/full_lengther_next/fl_analysis.rb', line 162

def determine_status(atg_status, end_status)
	if atg_status != 'incomplete' && end_status != 'incomplete' # proteina completa
		type = COMPLETE
	elsif atg_status == 'incomplete' && end_status == 'incomplete' # region intermedia
		type = INTERNAL
	elsif atg_status != 'incomplete' && end_status == 'incomplete' # tenemos el principio de la proteina
		type = N_TERMINAL
	elsif atg_status == 'incomplete' && end_status != 'incomplete' # tenemos el final de la proteina
		type = C_TERMINAL
	end
	
	if atg_status == 'putative' || end_status == 'putative'
		status = false # Putative
	else
		status = true # Sure
	end

	return type, status
end

#exonerate_fix_frame_shift(query_fasta, hit) ⇒ Object



239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/full_lengther_next/fl_analysis.rb', line 239

def exonerate_fix_frame_shift(query_fasta, hit)
	frame_shifts = []
	added_nts = 0
	hit.each_with_index do |hsp, num|
		if hsp.class.to_s == 'ExoBlastHit' #Only this type of class of BlastHit has frameshift attributes
			if !hsp.q_frameshift.empty? #There is frameshift
				hsp.q_frameshift.each do |position, num_nts|
					local_add = 3 - num_nts
					fs_final_position = position + num_nts 
					$global_warnings << ['ExFrameS', fs_final_position]
					frame_shifts << [fs_final_position, local_add]
					added_nts += local_add
				end
			end
		end
		hsp.q_beg += added_nts if num > 0
		hsp.q_end += added_nts
	end
	add = 0
	frame_shifts.each do |position, num_nts|
		query_fasta = query_fasta.insert(position+add, 'n'*num_nts)
		add += num_nts
	end
	return query_fasta
end

#find_end(final_hit, max_distance, tmp_prot, query_fasta) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/full_lengther_next/fl_analysis.rb', line 105

def find_end(final_hit, max_distance, tmp_prot, query_fasta)
	frame_shift = check_frame_shift(final_hit)
	beg_end_string =(final_hit.q_end-final_hit.q_beg)/3 - max_distance # Begin of terminal region (Coordinate) in tmp_prot
	atg_substring = tmp_prot[0..beg_end_string] # prot without terminal region
	end_substring = tmp_prot[beg_end_string + 1 ..tmp_prot.length-1] #Take 3' of unigen
	#puts "\e[32m\nfinal_hit.q_end-final_hit.q_beg: #{final_hit.q_end-final_hit.q_beg} /3  - max_distance: #{max_distance}\e[0m"
	#puts "\e[33mbeg_end_string: #{beg_end_string}\e[0m"
	#puts "\e[35mtmp_prot.length: #{tmp_prot.length}\e[0m"
	if beg_end_string < 0 || end_substring.nil? #Sequences whose homology is at end of it and dont't exits the 3' part of unigene
		atg_substring = tmp_prot
		end_substring = ''
		end_status = 'incomplete'
	else
		end_status = 'putative'
		putative_end = end_substring.index('*')
		end_substring = end_substring[0 .. putative_end] if putative_end
		
		s_end_resto = final_hit.s_len - (final_hit.s_end + 1) # en el subject, numero de aas que necesito cubrir
		q_end_resto = (query_fasta.length - final_hit.q_end)/3 # en el query, numero de aas que tengo	
		sq_end_distance = q_end_resto - s_end_resto # La diferencia se hace a partir del final del hit para que el calculo no quede sesgado en caso de que la secuecia este truncada por 5'
		
		if (final_hit.align_len == final_hit.s_len && putative_end)||(sq_end_distance.abs  <= max_distance && putative_end && putative_end <= max_distance*2) #Stop in a Full-length. max_distance *2 is set by de margin of +-15aa at the end of aligment 
			end_status = 'complete'
		elsif sq_end_distance  < max_distance # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
			end_status = 'incomplete'
			if putative_end
				$global_warnings << ['UnexpSTOP3pDist', sq_end_distance.abs]
			else
				$global_warnings << ['DistSubj', sq_end_distance.abs]
			end
		else # tenemos suficiente secuencia
			if putative_end # tenemos un stop
				#beg_end_string indica en que punto del unigen se encuentra el area de busqueda del codon stop
				stop_q_s = beg_end_string + putative_end - final_hit.s_len # Space between query's stop and subject's stop
				if stop_q_s.abs <= max_distance #Stop codon is in search region
					end_status = 'complete'
				elsif stop_q_s < 0
					$global_warnings << 'UnexpSTOP3p'
				elsif stop_q_s > 0
					end_status = 'complete'
					$global_warnings << 'QueryTooLong'
				end
			else # no tenemos codon de parada pero tenemos suficiente secuencia
				end_status = 'incomplete'
				$global_warnings << 'ProtFusion'
			end
		end
	end
	final_prot = atg_substring + end_substring
	end_status = 'complete' if final_prot.length == final_hit.s_len+1 && final_prot[final_prot.length-1] == '*'
	new_q_end = final_hit.q_beg-1 + final_prot.length * 3 + frame_shift
	modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) if  $verbose > 1
	final_hit.q_end = new_q_end  
	return end_status, final_prot
end

#find_start(subject_start, amine_seq, distance) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/full_lengther_next/fl_analysis.rb', line 68

def find_start(subject_start, amine_seq, distance)		
	atg_status = 'putative' # complete, incomplete or putative
	stop_codon = amine_seq.rindex('*')
	if !stop_codon.nil? # tenemos un codon de parada en el amine_seq 5 prima
		_5prime_UTR = amine_seq.length - 10 - subject_start # marcamos la distancia al s_beg desde el principio del amine_seq
		amine_seq = amine_seq[stop_codon + 1 .. amine_seq.length - 1]
		first_m = amine_seq.index('M')
		if stop_codon <= _5prime_UTR # Ver si stop está en zona 5 prima UTR
			if first_m # tenemos M 
				amine_seq = amine_seq[first_m .. amine_seq.length - 1]					
				atg_status = 'complete'
			else # con STOP pero sin M 
				$global_warnings << 'noM1'
			end
		else # esta despues, un cambio de fase impide analizar el principio
			$global_warnings << 'UnexpSTOP5p'
				amine_seq = amine_seq[first_m .. amine_seq.length - 1] if first_m # tenemos M
		end
	else # no hay stop codon
		first_m = amine_seq.index('M')
		if first_m # tenemos M
			amine_seq = amine_seq[first_m .. amine_seq.length - 1]
			m_distance = (subject_start - amine_seq.length).abs - 10
			if m_distance.abs > distance*2 # con atg pero muy lejos del inicio que marca el subject
				$global_warnings << 'NoStopMfar'
				atg_status = 'incomplete'
			else # Tenemos M
				atg_status = 'complete'
			end
		else # sin M
			$global_warnings << 'noM2'
		end
	end
	return amine_seq, atg_status
end

#mark_nt_seqs(final_hit, query_fasta) ⇒ Object



221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/full_lengther_next/fl_analysis.rb', line 221

def mark_nt_seqs(final_hit, query_fasta)
	atg = query_fasta[final_hit.q_beg..final_hit.q_beg + 2]
	mark_atg = nil
	if atg == 'ATG'
		mark_atg = '_-_'	
	end
	stop = query_fasta[final_hit.q_end - 2..final_hit.q_end]
	mark_stop = nil
	if stop == 'TAG' || stop == 'TGA' || stop == 'TAA'
		mark_stop = '___'
	end
	seq5p = query_fasta[0..final_hit.q_beg-1]
	orf = query_fasta[final_hit.q_beg..final_hit.q_end]
	seq3p = query_fasta[final_hit.q_end..query_fasta.length]
	nt_seq = "#{seq5p}#{mark_atg}#{orf}#{mark_stop}#{seq3p}"
	return nt_seq
end

#modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ⇒ Object

For visual report



274
275
276
277
278
279
280
281
282
# File 'lib/full_lengther_next/fl_analysis.rb', line 274

def modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ## For visual report
	if new_q_end > final_hit.q_end #There is an align extension
		extend_align = query_fasta[final_hit.q_end+1 .. new_q_end].translate
		final_hit.q_seq = final_hit.q_seq + extend_align
	elsif new_q_end < final_hit.q_end #The align is cutted
		upper_limit = final_prot.length - 1 + final_hit.q_seq.count('-')
		final_hit.q_seq = final_hit.q_seq[0 .. upper_limit]
	end
end

#modify_5p_align(new_q_beg, final_hit, query_fasta) ⇒ Object

For visual report



285
286
287
288
289
290
291
292
293
294
295
# File 'lib/full_lengther_next/fl_analysis.rb', line 285

def modify_5p_align(new_q_beg, final_hit, query_fasta) ## For visual report
	if new_q_beg < final_hit.q_beg #There is an align extension
		extend_align = query_fasta[new_q_beg .. final_hit.q_beg-1].translate
		final_hit.q_seq = extend_align + final_hit.q_seq
	elsif new_q_beg > final_hit.q_beg #The align is cut
		seq_cut = (new_q_beg - final_hit.q_beg)/3
		gaps = final_hit.q_seq[0..seq_cut].count('-')
		seq_cut += gaps
		final_hit.q_seq = final_hit.q_seq[seq_cut .. final_hit.q_seq.length-1]
	end
end

#save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name) ⇒ Object



199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/full_lengther_next/fl_analysis.rb', line 199

def save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
	# if the sequence is Complete or it hasn't previous info will be saved
	if seq.type == UNKNOWN || (type == COMPLETE && seq.type != COMPLETE)
		seq.type = type
		seq.status = status
		seq.db_name = db_name
		seq.seq_fasta = query_fasta
		seq.seq_aa = final_prot
		seq.hit = final_hit
		seq.warnings($global_warnings)
		$global_warnings = [] # Clean all warnings for current sequence
		seq.seq_nt = mark_nt_seqs(final_hit, query_fasta)
		if type == COMPLETE
			seq.ignore = true
		end
	end
	if  $verbose > 2
		puts "\e[1mStruct annot: #{seq.prot_annot_calification}\e[0m"
	end
end

#set_start_codon(final_hit, distance, full_prot, query_fasta) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/full_lengther_next/fl_analysis.rb', line 41

def set_start_codon(final_hit, distance, full_prot, query_fasta)
	q_index_start = contenidos_en_prot(final_hit.q_seq, full_prot) 
	atg_status = nil
	_5prima = q_index_start + distance

	if  final_hit.s_beg == 0 && final_hit.q_seq[0] == 'M' && final_hit.s_seq[0] == 'M' #there is M in query and subject at first position of alignment and subject's M is in first position
		atg_status = 'complete'
		tmp_prot = full_prot[q_index_start..full_prot.length]
	elsif _5prima >= final_hit.s_beg
		amine_seq = full_prot[0, _5prima] #Contiene parte amino de la proteina
		carboxile_seq = full_prot[_5prima, full_prot.length - _5prima] #Contiene parte carboxilo de la proteina hasta el fin de la secuencia
		length_before_cut = amine_seq.length
		amine_seq, atg_status = find_start(final_hit.s_beg, amine_seq, distance) # to look for the beginning of the protein
		tmp_prot = "#{amine_seq}#{carboxile_seq}" # merge seqs in prot
		new_q_beg = final_hit.q_frame-1 + (length_before_cut - amine_seq.length) * 3
		modify_5p_align(new_q_beg, final_hit, query_fasta)	if  $verbose > 1 ## VERBOSE, Modify query align
		final_hit.q_beg = new_q_beg # to get the value of the start_ORF index
	else
		$global_warnings << 'UnexpStopBegSeq' if full_prot[0, q_index_start].rindex('*')
		atg_status = 'incomplete'
		tmp_prot = full_prot
	end

	return atg_status, tmp_prot
end

#show_ntsObject

VERBOSE METHODS



267
268
269
270
271
# File 'lib/full_lengther_next/fl_analysis.rb', line 267

def show_nts
	show = false	
	show = true if $verbose && $verbose > 3
	return show
end