Class: PluginFindPolyAt

Inherits:
Plugin
  • Object
show all
Defined in:
lib/seqtrimnext/plugins/plugin_find_poly_at.rb

Instance Attribute Summary

Attributes inherited from Plugin

#stats

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Plugin

#add_plugin_stats, #add_stats, #add_text_stats, auto_setup, #can_execute?, check_param, #do_blasts, #execute, get_graph_filename, get_graph_title, graph_ignored?, ignored_graphs, #initialize, #merge_hits, #overlapX?, plot_setup, valid_graphs

Constructor Details

This class inherits a constructor from Plugin

Class Method Details

.check_params(params) ⇒ Object

Returns an array with the errors due to parameters are missing



353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# File 'lib/seqtrimnext/plugins/plugin_find_poly_at.rb', line 353

def self.check_params(params)
  errors=[]

  comment='Minimum length of a poly-A'
default_value = 6
params.check_param(errors,'poly_a_length','Integer',default_value,comment)

  comment='Minimum percent of As in a sequence segment to be considered a poly-A'
  # default_value = 80
  default_value = 75
params.check_param(errors,'poly_a_percent','Integer',default_value,comment)


  comment='Minimum length of a poly-T'
default_value = 15
params.check_param(errors,'poly_t_length','Integer',default_value,comment)

  comment='Minimum percent of Ts in a sequence segment to be considered a poly-T'
  # default_value = 80
  default_value = 75
params.check_param(errors,'poly_t_percent','Integer',default_value,comment)

  return errors
end

Instance Method Details

#base_count(poly, poly_base) ⇒ Object



337
338
339
340
341
342
343
344
345
346
# File 'lib/seqtrimnext/plugins/plugin_find_poly_at.rb', line 337

def base_count(poly,poly_base)

  # count bases en poly['found']
  s=poly['found']
  res = s.count(poly_base.downcase+poly_base.upcase)

  # puts "poly #{s} base count #{res}"

  return res
end

#base_percent(poly, poly_base) ⇒ Object



323
324
325
326
327
328
329
330
331
332
333
334
335
# File 'lib/seqtrimnext/plugins/plugin_find_poly_at.rb', line 323

def base_percent(poly,poly_base)

  # count Ts en poly['found']
  s=poly['found']
  ta_count = s.count(poly_base.downcase+poly_base.upcase)

  res=(ta_count.to_f/s.size.to_f)*100
  
  # puts "poly #{s} base percent #{res}"


  return res
end

#exec_seq(seq, blast_query) ⇒ Object



311
312
313
314
315
316
317
# File 'lib/seqtrimnext/plugins/plugin_find_poly_at.rb', line 311

def exec_seq(seq,blast_query)
  $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for strings of polyAT's into the sequence with a length indicated by the param <poly_at_length>"

  find_polyT(seq)
  find_polyA(seq)

end

#find_polyA(seq) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/seqtrimnext/plugins/plugin_find_poly_at.rb', line 107

def find_polyA(seq)
  
  actions=[]
  polys=find_polys('AN',seq)
  poly_base = 'AN'
  type='ActionPolyA'
  
  poly_size=0
  
  # for each poly found cut it, from right to left (reverse order)
  polys.reverse_each do |poly|
    
    poly_size=poly['end'] - poly['begin'] +1
            
    # if polya is near right and is big enought and has a base percent
    
    
    
    # check if poly lenth and percent are above limits
    if (poly['end']>=seq.seq_fasta.length-MAX_POLY_A_FROM_RIGHT) && (poly_size>= @params.get_param('poly_a_length').to_i) && (base_percent(poly,poly_base)>= @params.get_param('poly_a_percent').to_i)

      a = seq.new_action(poly['begin'],poly['end'],type)
      a.right_action=true    #mark as rigth action to get the left insert
      
      actions.push a
      
      add_stats('poly_a_size',poly_size)
      
      # if poly a is not near right but is bigger, then cut
    elsif (poly['end']<<seq.seq_fasta.length-MAX_POLY_A_FROM_RIGHT) && (poly_size>=MIN_MIDDLE_POLY_A_SIZE) && (base_percent(poly,poly_base)>= @params.get_param('poly_a_percent').to_i)
      
      a = seq.new_action(poly['begin'],poly['end'],type)
      a.right_action=true    #mark as rigth action to get the left insert
      
      actions.push a
      
      add_stats('in_middle_poly_a_size',poly_size)
    # else
    #         puts "REJECTED: #{poly}"
      
    end
    if poly['found'].length > @params.get_param('poly_a_length').to_i
      add_stats("poly_#{poly_base}_base_percents","#{poly['found'].length}  #{base_percent(poly,poly_base)}")
    end

  end

  if !actions.empty?
    add_stats('seqs_with_polyA',1)
    seq.add_actions(actions)
    actions=[]
  end

end

#find_polys(ta, seq) ⇒ Object

Uses the param poly_at_length to look for at least that number of contiguous A’s



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/seqtrimnext/plugins/plugin_find_poly_at.rb', line 33

def find_polys(ta,seq)
  #minn = poly_at_length
  # puts "="*20 + seq.seq_name + "="*20
  
  minn = 4
  m2 = (minn/2)
  m4 = (minn/4)
  r = [-1,0,0]
  re2 = /(([#{ta}]{#{m2},})(.{0,2})([#{ta}]{#{m2},}))/i
  # re2 = /(([#{ta}]{#{m2},})(.{0,3})([#{ta}]{#{m2},}))/i

  # if ta =~/A/
  #   type='ActionPolyA'
  # else
  #   type='ActionPolyT'
  #   poly_base = 'T'
  # end

  matches = re2.global_match(seq.seq_fasta,3)

  # HASH
  polys = []

  # crear una region poly nuevo
  poly = {}
  #i=0

  matches.each do |pattern2|

    #puts pattern2.match[0]
    m_start = pattern2.match.begin(0)+pattern2.offset
    m_end = pattern2.match.end(0)+pattern2.offset-1


    # does one exist in polys with overlap?
    # yes => group it, updated end
    # no => one new

    if (e=overlap(polys,m_start,m_end))
      # puts "OVERLAPS #{e}"
      # found=seq.seq_fasta.slice(e['begin'],m_end-e['begin']+1)
      # if base_percent(poly,ta)>= 60
        e['end'] = m_end
        e['found'] = seq.seq_fasta.slice(e['begin'],e['end']-e['begin']+1)
      # else
      #   puts "Ignored because lowers the base percent of poly"
      # end
      

    else
      poly={}
      poly['begin'] = m_start
      poly['end'] = m_end #  the next pos to pattern's end
      poly['found'] = seq.seq_fasta.slice(poly['begin'],poly['end']-poly['begin']+1)
      polys.push poly
      
      # puts " NEW POLY#{ta}: #{poly}"
      
    end




  end
  
  # polys.each  do |p|
  #   puts "P#{ta}: #{p}, bp: #{base_percent(p['found'],ta)}"
  # end
  
  return polys
  
end

#find_polyT(seq) ⇒ Object



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# File 'lib/seqtrimnext/plugins/plugin_find_poly_at.rb', line 162

def find_polyT(seq)
  
  actions=[]
  poly_base = 'TN'
  type='ActionPolyT'
  
  polys=find_polys('TN',seq)
  
  poly_size=0
  check_for_dust=nil
  
  # for each poly found process it

  polys.each do |poly|

    poly_size=poly['end'] - poly['begin'] + 1
    # puts "#{poly}, size: #{poly['found'].length}, bcount:#{base_percent(poly,poly_base)}"
    # check if poly lenth and percent are above limits
    if (poly_size>= @params.get_param('poly_t_length').to_i) && (base_percent(poly,poly_base) >= @params.get_param('poly_t_percent').to_i)

      if (actions.empty?)  # first poly, check if polyT is on the left of sequence

        #if is polyT and is near left, then the sequence is reversed
        if (poly['begin']==0)

          seq.seq_reversed=true
          a = seq.new_action(poly['begin'],poly['end'],type)
          a.left_action=true
          actions.push a
          
          check_for_dust=poly
          
        elsif (poly['begin']<=MAX_POLY_T_FROM_LEFT && base_count(poly,'TN')>=MIN_TN_COUNT)
          
          seq.seq_reversed=true
          a = seq.new_action(poly['begin'],poly['end'],type)
          a.left_action=true
          actions.push a
          
          check_for_dust=poly
        elsif (poly['begin']>MAX_POLY_T_FROM_LEFT && base_count(poly,'TN')>=MIN_MIDDLE_POLY_T_SIZE)
          
          seq.seq_reversed=true
          # seq.seq_rejected=true
          # seq.seq_rejected_by_message='unexpected polyT'
          check_for_dust=poly            
          a = seq.new_action(poly['begin'],poly['end'],'ActionUnexpectedPolyT')
          a.left_action=true
          actions.push a
          add_stats('unexpected_poly_t_count',poly_size)
          
        end

      else # there are multiple polyTs
        
        if (poly['begin']>MAX_POLY_T_FROM_LEFT && base_count(poly,'TN')>=MIN_MIDDLE_POLY_T_SIZE)
          
          seq.seq_reversed=true
          # seq.seq_rejected=true
          # seq.seq_rejected_by_message='unexpected polyT'

          check_for_dust=poly
          
          a = seq.new_action(poly['begin'],poly['end'],'ActionUnexpectedPolyT')
          a.left_action=true
          actions.push a
          add_stats('unexpected_poly_t_count',poly_size)
          
        end
        
      end


        # if (poly['begin']<=MAX_POLY_T_FROM_LEFT*2)
        #   seq.seq_rejected=true
        #   seq.seq_rejected_by_message='polyT found'
        # end

        # @stats[:poly_t_size]={poly_size => 1}
        add_stats('poly_t_size',poly_size)
        


    end

  end

  if !actions.empty?
    add_stats('seqs_with_polyT',1)
    seq.add_actions(actions)
    
    actions=[]
    if check_for_dust && !seq.seq_fasta.nil? && !seq.seq_fasta.empty?
      dust_masker=DustMasker.new()
      dust_poly_size=check_for_dust['end']-check_for_dust['begin']+1
      found_dust = dust_masker.do_dust([">"+seq.seq_name,seq.seq_fasta])
      # puts "Checking for dust: #{seq.seq_fasta}"
      # puts found_dust.to_json
      total_dust=0
      last_dust_start=0
      
      if !found_dust.empty?
        found_dust[0].dust.each do |dust|
          start=dust[0]
          stop=dust[1]
          dust_size=dust[1]-dust[0]+1
          total_dust+=dust_size

          # dust must be big enought and be near the polyt to be a induced one
          if (dust_size)>10 && (start<last_dust_start+MAX_DUST_DISTANCE_FROM_POLYT)
            last_dust_start=stop
            a = seq.new_action(start,stop,'ActionInducedLowComplexity')
            # a.left_action=true
            actions.push a
          elsif dust_size>10
            a = seq.new_action(start,stop,'ActionLowComplexity')
            # a.left_action=true
            actions.push a
          end
        end
      end
      
      
      
      if !actions.empty?
        add_stats('poly_t_dust',dust_poly_size)
        seq.add_actions(actions)
      else
        add_stats('poly_t_no_dust',dust_poly_size)
      end
      
      # reject sequences if total dust is greater than 30
      if total_dust>30
        # if seq.seq_fasta.length<50
          seq.seq_rejected=true
          seq.seq_rejected_by_message='low complexity by polyt'
        # end
        
        add_stats('induced_low_complexity',total_dust)
      end
      
      
    end
  end

end