Module: FlAnalysis

Defined in:
lib/full_lengther_next/fl_analysis.rb

Instance Method Summary collapse

Instance Method Details

#analiza_orf_y_fl(seq, hit, options, db_name) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/full_lengther_next/fl_analysis.rb', line 8

def analiza_orf_y_fl(seq, hit, options, db_name)
  query_fasta = seq.seq_fasta.upcase.dup # Upcase for prevents complications with masked sequences, dup for discard changes
  if hit.count > 1 # if the sequence has more than one hit, the frames are checked and fixed to get a single hit
      seq_unida = UneLosHit.new(hit, query_fasta)
      full_prot =    seq_unida.full_prot 
      query_fasta =  seq_unida.output_seq # repaired fasta
      final_hit =    seq_unida.final_hit   # single hit
      $global_warnings +=  seq_unida.msgs   # warning messages
  else
    query_fasta = reverse_seq(query_fasta, hit.first) if hit.first.q_frame < 0 # si la secuencia esta al reves le damos la vuelta
    final_hit = hit.first # single hit
  end
  query_fasta = exonerate_fix_frame_shift(query_fasta, hit) if options[:exonerate] 

  full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate
  original_query_coordinates = [final_hit.q_beg, final_hit.q_end] ## VERBOSE
  seq.show_alignment(final_hit, query_fasta, show_nts) if  $verbose > 2 ## VERBOSE
  atg_status, tmp_prot = set_start_codon(final_hit, options[:distance], full_prot, query_fasta)
  end_status, final_prot = find_end(final_hit, options[:distance], tmp_prot, query_fasta)

  puts "\n------------------- POST EXTENSION---------------------" if $verbose > 1 ## VERBOSE
  seq.show_alignment(final_hit, query_fasta, show_nts, original_query_coordinates) if  $verbose > 1 ## VERBOSE
  puts "ATG: #{atg_status}  STOP: #{end_status}" if  $verbose > 2 ## VERBOSE

  # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus)
  type, status = determine_status(atg_status, end_status)
  status = compare_seq_length_with_subject(final_prot, options[:distance], final_hit, type, status)
  if final_prot.length >= 25 && final_prot.length.to_f/final_hit.full_subject_length >= options[:subject_coverage] # Prot length min of 25 aa and subject coverage by generated prot of 25%
    save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
  end
end

#compare_seq_length_with_subject(final_prot, distance, final_hit, type, status) ⇒ Object



183
184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/full_lengther_next/fl_analysis.rb', line 183

def compare_seq_length_with_subject(final_prot, distance, final_hit, type, status)
  if final_prot.length - 2 * distance > final_hit.s_len
    $global_warnings << ['SeqLonger', final_prot.length, final_hit.s_len]
  elsif final_prot.length + 2 * distance < final_hit.s_len
    $global_warnings << ['SeqShorter', final_prot.length, final_hit.s_len]
    if final_prot.length + 100 < final_hit.s_len || final_prot.length*2 < final_hit.s_len       
      if type == COMPLETE
        status = false
        $global_warnings << 'VeryShorter'
      end
    end
  end
  return status
end

#determine_status(atg_status, end_status) ⇒ Object



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/full_lengther_next/fl_analysis.rb', line 162

def determine_status(atg_status, end_status)
  if atg_status != 'incomplete' && end_status != 'incomplete' # proteina completa
    type = COMPLETE
  elsif atg_status == 'incomplete' && end_status == 'incomplete' # region intermedia
    type = INTERNAL
  elsif atg_status != 'incomplete' && end_status == 'incomplete' # tenemos el principio de la proteina
    type = N_TERMINAL
  elsif atg_status == 'incomplete' && end_status != 'incomplete' # tenemos el final de la proteina
    type = C_TERMINAL
  end
  
  if atg_status == 'putative' || end_status == 'putative'
    status = false # Putative
  else
    status = true # Sure
  end

  return type, status
end

#exonerate_fix_frame_shift(query_fasta, hit) ⇒ Object



239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/full_lengther_next/fl_analysis.rb', line 239

def exonerate_fix_frame_shift(query_fasta, hit)
  frame_shifts = []
  added_nts = 0
  hit.each_with_index do |hsp, num|
    if hsp.class.to_s == 'ExoBlastHit' #Only this type of class of BlastHit has frameshift attributes
      if !hsp.q_frameshift.empty? #There is frameshift
        hsp.q_frameshift.each do |position, num_nts|
          local_add = 3 - num_nts
          fs_final_position = position + num_nts 
          $global_warnings << ['ExFrameS', fs_final_position]
          frame_shifts << [fs_final_position, local_add]
          added_nts += local_add
        end
      end
    end
    hsp.q_beg += added_nts if num > 0
    hsp.q_end += added_nts
  end
  add = 0
  frame_shifts.each do |position, num_nts|
    query_fasta = query_fasta.insert(position+add, 'n'*num_nts)
    add += num_nts
  end
  return query_fasta
end

#find_end(final_hit, max_distance, tmp_prot, query_fasta) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/full_lengther_next/fl_analysis.rb', line 105

def find_end(final_hit, max_distance, tmp_prot, query_fasta)
  frame_shift = check_frame_shift(final_hit)
  beg_end_string =(final_hit.q_end-final_hit.q_beg)/3 - max_distance # Begin of terminal region (Coordinate) in tmp_prot
  atg_substring = tmp_prot[0..beg_end_string] # prot without terminal region
  end_substring = tmp_prot[beg_end_string + 1 ..tmp_prot.length-1] #Take 3' of unigen
  #puts "\e[32m\nfinal_hit.q_end-final_hit.q_beg: #{final_hit.q_end-final_hit.q_beg} /3  - max_distance: #{max_distance}\e[0m"
  #puts "\e[33mbeg_end_string: #{beg_end_string}\e[0m"
  #puts "\e[35mtmp_prot.length: #{tmp_prot.length}\e[0m"
  if beg_end_string < 0 || end_substring.nil? #Sequences whose homology is at end of it and dont't exits the 3' part of unigene
    atg_substring = tmp_prot
    end_substring = ''
    end_status = 'incomplete'
  else
    end_status = 'putative'
    putative_end = end_substring.index('*')
    end_substring = end_substring[0 .. putative_end] if putative_end
    
    s_end_resto = final_hit.s_len - (final_hit.s_end + 1) # en el subject, numero de aas que necesito cubrir
    q_end_resto = (query_fasta.length - final_hit.q_end)/3 # en el query, numero de aas que tengo 
    sq_end_distance = q_end_resto - s_end_resto # La diferencia se hace a partir del final del hit para que el calculo no quede sesgado en caso de que la secuecia este truncada por 5'
    
    if (final_hit.align_len == final_hit.s_len && putative_end)||(sq_end_distance.abs  <= max_distance && putative_end && putative_end <= max_distance*2) #Stop in a Full-length. max_distance *2 is set by de margin of +-15aa at the end of aligment 
      end_status = 'complete'
    elsif sq_end_distance  < max_distance # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas)
      end_status = 'incomplete'
      if putative_end
        $global_warnings << ['UnexpSTOP3pDist', sq_end_distance.abs]
      else
        $global_warnings << ['DistSubj', sq_end_distance.abs]
      end
    else # tenemos suficiente secuencia
      if putative_end # tenemos un stop
        #beg_end_string indica en que punto del unigen se encuentra el area de busqueda del codon stop
        stop_q_s = beg_end_string + putative_end - final_hit.s_len # Space between query's stop and subject's stop
        if stop_q_s.abs <= max_distance #Stop codon is in search region
          end_status = 'complete'
        elsif stop_q_s < 0
          $global_warnings << 'UnexpSTOP3p'
        elsif stop_q_s > 0
          end_status = 'complete'
          $global_warnings << 'QueryTooLong'
        end
      else # no tenemos codon de parada pero tenemos suficiente secuencia
        end_status = 'incomplete'
        $global_warnings << 'ProtFusion'
      end
    end
  end
  final_prot = atg_substring + end_substring
  end_status = 'complete' if final_prot.length == final_hit.s_len+1 && final_prot[final_prot.length-1] == '*'
  new_q_end = final_hit.q_beg-1 + final_prot.length * 3 + frame_shift
  modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) if  $verbose > 1
  final_hit.q_end = new_q_end  
  return end_status, final_prot
end

#find_start(subject_start, amine_seq, distance) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/full_lengther_next/fl_analysis.rb', line 68

def find_start(subject_start, amine_seq, distance)    
  atg_status = 'putative' # complete, incomplete or putative
  stop_codon = amine_seq.rindex('*')
  if !stop_codon.nil? # tenemos un codon de parada en el amine_seq 5 prima
    _5prime_UTR = amine_seq.length - 10 - subject_start # marcamos la distancia al s_beg desde el principio del amine_seq
    amine_seq = amine_seq[stop_codon + 1 .. amine_seq.length - 1]
    first_m = amine_seq.index('M')
    if stop_codon <= _5prime_UTR # Ver si stop está en zona 5 prima UTR
      if first_m # tenemos M 
        amine_seq = amine_seq[first_m .. amine_seq.length - 1]          
        atg_status = 'complete'
      else # con STOP pero sin M 
        $global_warnings << 'noM1'
      end
    else # esta despues, un cambio de fase impide analizar el principio
      $global_warnings << 'UnexpSTOP5p'
        amine_seq = amine_seq[first_m .. amine_seq.length - 1] if first_m # tenemos M
    end
  else # no hay stop codon
    first_m = amine_seq.index('M')
    if first_m # tenemos M
      amine_seq = amine_seq[first_m .. amine_seq.length - 1]
      m_distance = (subject_start - amine_seq.length).abs - 10
      if m_distance.abs > distance*2 # con atg pero muy lejos del inicio que marca el subject
        $global_warnings << 'NoStopMfar'
        atg_status = 'incomplete'
      else # Tenemos M
        atg_status = 'complete'
      end
    else # sin M
      $global_warnings << 'noM2'
    end
  end
  return amine_seq, atg_status
end

#mark_nt_seqs(final_hit, query_fasta) ⇒ Object



221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/full_lengther_next/fl_analysis.rb', line 221

def mark_nt_seqs(final_hit, query_fasta)
  atg = query_fasta[final_hit.q_beg..final_hit.q_beg + 2]
  mark_atg = nil
  if atg == 'ATG'
    mark_atg = '_-_'  
  end
  stop = query_fasta[final_hit.q_end - 2..final_hit.q_end]
  mark_stop = nil
  if stop == 'TAG' || stop == 'TGA' || stop == 'TAA'
    mark_stop = '___'
  end
  seq5p = query_fasta[0..final_hit.q_beg-1]
  orf = query_fasta[final_hit.q_beg..final_hit.q_end]
  seq3p = query_fasta[final_hit.q_end..query_fasta.length]
  nt_seq = "#{seq5p}#{mark_atg}#{orf}#{mark_stop}#{seq3p}"
  return nt_seq
end

#modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ⇒ Object

For visual report



274
275
276
277
278
279
280
281
282
# File 'lib/full_lengther_next/fl_analysis.rb', line 274

def modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ## For visual report
  if new_q_end > final_hit.q_end #There is an align extension
    extend_align = query_fasta[final_hit.q_end+1 .. new_q_end].translate
    final_hit.q_seq = final_hit.q_seq + extend_align
  elsif new_q_end < final_hit.q_end #The align is cutted
    upper_limit = final_prot.length - 1 + final_hit.q_seq.count('-')
    final_hit.q_seq = final_hit.q_seq[0 .. upper_limit]
  end
end

#modify_5p_align(new_q_beg, final_hit, query_fasta) ⇒ Object

For visual report



285
286
287
288
289
290
291
292
293
294
295
# File 'lib/full_lengther_next/fl_analysis.rb', line 285

def modify_5p_align(new_q_beg, final_hit, query_fasta) ## For visual report
  if new_q_beg < final_hit.q_beg #There is an align extension
    extend_align = query_fasta[new_q_beg .. final_hit.q_beg-1].translate
    final_hit.q_seq = extend_align + final_hit.q_seq
  elsif new_q_beg > final_hit.q_beg #The align is cut
    seq_cut = (new_q_beg - final_hit.q_beg)/3
    gaps = final_hit.q_seq[0..seq_cut].count('-')
    seq_cut += gaps
    final_hit.q_seq = final_hit.q_seq[seq_cut .. final_hit.q_seq.length-1]
  end
end

#save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name) ⇒ Object



199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/full_lengther_next/fl_analysis.rb', line 199

def save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name)
  # if the sequence is Complete or it hasn't previous info will be saved
  if seq.type == UNKNOWN || (type == COMPLETE && seq.type != COMPLETE)
    seq.type = type
    seq.status = status
    seq.db_name = db_name
    seq.seq_fasta = query_fasta
    seq.seq_aa = final_prot
    seq.hit = final_hit
    seq.warnings($global_warnings)
    $global_warnings = [] # Clean all warnings for current sequence
    seq.seq_nt = mark_nt_seqs(final_hit, query_fasta)
    if type == COMPLETE
      seq.ignore = true
    end
  end
  if  $verbose > 2
    puts "\e[1mStruct annot: #{seq.prot_annot_calification}\e[0m"
  end
end

#set_start_codon(final_hit, distance, full_prot, query_fasta) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/full_lengther_next/fl_analysis.rb', line 41

def set_start_codon(final_hit, distance, full_prot, query_fasta)
  q_index_start = contenidos_en_prot(final_hit.q_seq, full_prot) 
  atg_status = nil
  _5prima = q_index_start + distance

  if  final_hit.s_beg == 0 && final_hit.q_seq[0] == 'M' && final_hit.s_seq[0] == 'M' #there is M in query and subject at first position of alignment and subject's M is in first position
    atg_status = 'complete'
    tmp_prot = full_prot[q_index_start..full_prot.length]
  elsif _5prima >= final_hit.s_beg
    amine_seq = full_prot[0, _5prima] #Contiene parte amino de la proteina
    carboxile_seq = full_prot[_5prima, full_prot.length - _5prima] #Contiene parte carboxilo de la proteina hasta el fin de la secuencia
    length_before_cut = amine_seq.length
    amine_seq, atg_status = find_start(final_hit.s_beg, amine_seq, distance) # to look for the beginning of the protein
    tmp_prot = "#{amine_seq}#{carboxile_seq}" # merge seqs in prot
    new_q_beg = final_hit.q_frame-1 + (length_before_cut - amine_seq.length) * 3
    modify_5p_align(new_q_beg, final_hit, query_fasta)  if  $verbose > 1 ## VERBOSE, Modify query align
    final_hit.q_beg = new_q_beg # to get the value of the start_ORF index
  else
    $global_warnings << 'UnexpStopBegSeq' if full_prot[0, q_index_start].rindex('*')
    atg_status = 'incomplete'
    tmp_prot = full_prot
  end

  return atg_status, tmp_prot
end

#show_ntsObject

VERBOSE METHODS



267
268
269
270
271
# File 'lib/full_lengther_next/fl_analysis.rb', line 267

def show_nts
  show = false 
  show = true if $verbose && $verbose > 3
  return show
end