Class: GeneValidator::DuplicationValidation

Inherits:
ValidationTest show all
Extended by:
Forwardable
Defined in:
lib/genevalidator/validation_duplication.rb

Overview

This class contains the methods necessary for finding duplicated subsequences in the predicted gene

Instance Attribute Summary collapse

Attributes inherited from ValidationTest

#cli_name, #description, #header, #hits, #prediction, #run_time, #short_header, #type, #validation_report

Instance Method Summary collapse

Constructor Details

#initialize(prediction, hits) ⇒ DuplicationValidation

Returns a new instance of DuplicationValidation.



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/genevalidator/validation_duplication.rb', line 88

def initialize(prediction, hits)
  super
  @short_header      = 'Duplication'
  @header            = 'Duplication'
  @description       = 'Check whether there is a duplicated subsequence' \
                       ' in the predicted gene by counting the hsp' \
                       ' residue coverage of the prediction, for each hit.'
  @cli_name          = 'dup'
  @raw_seq_file      = opt[:raw_sequences]
  @index_file_name   = config[:raw_seq_file_index]
  @raw_seq_file_load = config[:raw_seq_file_load]
  @db                = opt[:db]
  @num_threads       = opt[:num_threads]
  @type              = config[:type]
end

Instance Attribute Details

#index_file_nameObject (readonly)

Returns the value of attribute index_file_name.



85
86
87
# File 'lib/genevalidator/validation_duplication.rb', line 85

def index_file_name
  @index_file_name
end

#raw_seq_fileObject (readonly)

Returns the value of attribute raw_seq_file.



84
85
86
# File 'lib/genevalidator/validation_duplication.rb', line 84

def raw_seq_file
  @raw_seq_file
end

#raw_seq_file_loadObject (readonly)

Returns the value of attribute raw_seq_file_load.



86
87
88
# File 'lib/genevalidator/validation_duplication.rb', line 86

def raw_seq_file_load
  @raw_seq_file_load
end

Instance Method Details

#in_range?(ranges, idx) ⇒ Boolean

Returns:

  • (Boolean)


104
105
106
107
108
109
# File 'lib/genevalidator/validation_duplication.rb', line 104

def in_range?(ranges, idx)
  ranges.each do |range|
    return (range.member?(idx)) ? true : false
  end
  false
end

#run(n = 10) ⇒ Object

Check duplication in the first n hits Output: DuplicationValidationOutput object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/genevalidator/validation_duplication.rb', line 115

def run(n = 10)
  fail NotEnoughHitsError unless hits.length >= 5
  fail unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? &&
              hits[0].is_a?(Query)

  start = Time.new
  # get the first n hits
  less_hits = @hits[0..[n - 1, @hits.length].min]
  useless_hits = []
  # get raw sequences for less_hits
  less_hits.map do |hit|
    next unless hit.raw_sequence.nil?
    hit.raw_sequence = FetchRawSequences.run(hit.identifier,
                                             hit.accession_no)
    useless_hits.push(hit) if hit.raw_sequence.nil?
  end

  useless_hits.each { |hit| less_hits.delete(hit) }

  fail NoInternetError if less_hits.length == 0

  averages = []

  less_hits.each do |hit|
    coverage = Array.new(hit.length_protein, 0)
    # each residue of the protein should be evluated once only
    ranges_prediction = []

    hit.hsp_list.each do |hsp|
      # align subsequences from the hit and prediction that match
      if !hsp.hit_alignment.nil? && !hsp.query_alignment.nil?
        hit_alignment   = hsp.hit_alignment
        query_alignment = hsp.query_alignment
      else
        # indexing in blast starts from 1
        hit_local   = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1]
        query_local = prediction.raw_sequence[hsp.match_query_from -
                                              1..hsp.match_query_to - 1]

        # in case of nucleotide prediction sequence translate into protein
        # use translate with reading frame 1 because
        # to/from coordinates of the hsp already correspond to the
        # reading frame in which the prediction was read to match this hsp
        if @type == :nucleotide
          s = Bio::Sequence::NA.new(query_local)
          query_local = s.translate
        end

        # local alignment for hit and query
        seqs = [hit_local, query_local]

        begin
          options   = ['--maxiterate', '1000', '--localpair', '--anysymbol',
                       '--quiet', '--thread', "#{@num_threads}"]
          mafft     = Bio::MAFFT.new('mafft', options)

          report    = mafft.query_align(seqs)
          raw_align = report.alignment
          align     = []

          raw_align.each { |seq| align.push(seq.to_s) }
          hit_alignment   = align[0]
          query_alignment = align[1]
        rescue
          raise NoMafftInstallationError
        end
      end

      # check multiple coverage

      # for each hsp of the curent hit
      # iterate through the alignment and count the matching residues
      [*(0..hit_alignment.length - 1)].each do |i|
        residue_hit   = hit_alignment[i]
        residue_query = query_alignment[i]
        next if residue_hit == ' ' || residue_hit == '+' ||
                residue_hit == '-' || residue_hit != residue_query
        # indexing in blast starts from 1
        idx_hit   = i + (hsp.hit_from - 1) -
                    hit_alignment[0..i].scan(/-/).length
        idx_query = i + (hsp.match_query_from - 1) -
                    query_alignment[0..i].scan(/-/).length
        unless in_range?(ranges_prediction, idx_query)
          coverage[idx_hit] += 1
        end
      end

      ranges_prediction.push((hsp.match_query_from..hsp.match_query_to))
    end
    overlap = coverage.reject { |x| x == 0 }
    if overlap != []
      averages.push((overlap.inject(:+) / (overlap.length + 0.0)).round(2))
    end
  end

  # if all hsps match only one time
  if averages.reject { |x| x == 1 } == []
    @validation_report = DuplicationValidationOutput.new(@short_header,
                                                         @header,
                                                         @description, 1,
                                                         averages)
    @validation_report.run_time = Time.now - start
    return @validation_report
  end

  pval = wilcox_test(averages)

  @validation_report = DuplicationValidationOutput.new(@short_header,
                                                       @header,
                                                       @description, pval,
                                                       averages)
  @run_time = Time.now - start
  @validation_report

rescue NotEnoughHitsError
  @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                            @short_header, @header,
                                            @description)
rescue NoMafftInstallationError
  @validation_report = ValidationReport.new('Mafft error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push NoMafftInstallationError
rescue NoInternetError
  @validation_report = ValidationReport.new('Internet error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push NoInternetError
rescue
  @validation_report = ValidationReport.new('Unexpected error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push 'Unexpected Error'
end

#wilcox_test(averages) ⇒ Object

wilcox test implementation from statsample ruby gem many thanks to Claudio for helping us with the implementation!



253
254
255
256
257
258
259
# File 'lib/genevalidator/validation_duplication.rb', line 253

def wilcox_test(averages)
  wilcox = Statsample::Test.wilcoxon_signed_rank(Daru::Vector.new(averages),
                                                 Daru::Vector.new(Array.new(averages.length,
                                                           1)))

  (averages.length < 15) ? wilcox.probability_exact : wilcox.probability_z
end