Class: GeneValidator::DuplicationValidation

Inherits:
ValidationTest show all
Extended by:
Forwardable
Defined in:
lib/genevalidator/validation_duplication.rb

Overview

This class contains the methods necessary for finding duplicated subsequences in the predicted gene

Instance Attribute Summary collapse

Attributes inherited from ValidationTest

#cli_name, #description, #header, #hits, #prediction, #run_time, #short_header, #type, #validation_report

Instance Method Summary collapse

Constructor Details

#initialize(prediction, hits) ⇒ DuplicationValidation

Returns a new instance of DuplicationValidation.



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/genevalidator/validation_duplication.rb', line 90

def initialize(prediction, hits)
  super
  @short_header      = 'Duplication'
  @header            = 'Duplication'
  @description       = 'Check whether there is a duplicated subsequence' \
                       ' in the predicted gene by counting the hsp' \
                       ' residue coverage of the prediction, for each hit.'
  @cli_name          = 'dup'
  @raw_seq_file      = opt[:raw_sequences]
  @index_file_name   = config[:raw_seq_file_index]
  @raw_seq_file_load = config[:raw_seq_file_load]
  @db                = opt[:db]
  @num_threads       = opt[:mafft_threads]
  @type              = config[:type]
end

Instance Attribute Details

#index_file_nameObject (readonly)

Returns the value of attribute index_file_name.



87
88
89
# File 'lib/genevalidator/validation_duplication.rb', line 87

def index_file_name
  @index_file_name
end

#raw_seq_fileObject (readonly)

Returns the value of attribute raw_seq_file.



86
87
88
# File 'lib/genevalidator/validation_duplication.rb', line 86

def raw_seq_file
  @raw_seq_file
end

#raw_seq_file_loadObject (readonly)

Returns the value of attribute raw_seq_file_load.



88
89
90
# File 'lib/genevalidator/validation_duplication.rb', line 88

def raw_seq_file_load
  @raw_seq_file_load
end

Instance Method Details

#check_multiple_coverage(hit_alignment, query_alignment, hsp, coverage, ranges_prediction) ⇒ Object



226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/genevalidator/validation_duplication.rb', line 226

def check_multiple_coverage(hit_alignment, query_alignment, hsp, coverage,
                            ranges_prediction)
  # for each hsp of the curent hit
  # iterate through the alignment and count the matching residues
  [*(0..hit_alignment.length - 1)].each do |i|
    residue_hit   = hit_alignment[i]
    residue_query = query_alignment[i]
    next if [' ', '+', '-'].include?(residue_hit)
    next if residue_hit != residue_query
    # indexing in blast starts from 1
    idx_hit   = i + (hsp.hit_from - 1) -
                hit_alignment[0..i].scan(/-/).length
    idx_query = i + (hsp.match_query_from - 1) -
                query_alignment[0..i].scan(/-/).length
    coverage[idx_hit] += 1 unless in_range?(ranges_prediction, idx_query)
  end
  coverage
end

#find_local_alignment(hit, prediction, hsp) ⇒ Object

Only run if the BLAST output does not contain hit alignmment



199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/genevalidator/validation_duplication.rb', line 199

def find_local_alignment(hit, prediction, hsp)
  # indexing in blast starts from 1
  hit_local   = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1]
  query_local = prediction.raw_sequence[hsp.match_query_from -
                                        1..hsp.match_query_to - 1]

  # in case of nucleotide prediction sequence translate into protein
  # use translate with reading frame 1 because
  # to/from coordinates of the hsp already correspond to the
  # reading frame in which the prediction was read to match this hsp
  if @type == :nucleotide
    s = Bio::Sequence::NA.new(query_local)
    query_local = s.translate
  end

  opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet',
         '--thread', @num_threads.to_s]
  mafft = Bio::MAFFT.new('mafft', opt)

  # local alignment for hit and query
  seqs = [hit_local, query_local]
  report = mafft.query_align(seqs)
  report.alignment.map(&:to_s)
rescue StandardError
  raise NoMafftInstallationError
end

#in_range?(ranges, idx) ⇒ Boolean

Returns:

  • (Boolean)


245
246
247
248
# File 'lib/genevalidator/validation_duplication.rb', line 245

def in_range?(ranges, idx)
  ranges.each { |range| return true if range.member?(idx) }
  false
end

#run(n = 10) ⇒ Object

Check duplication in the first n hits Output: DuplicationValidationOutput object



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/genevalidator/validation_duplication.rb', line 110

def run(n = 10)
  raise NotEnoughHitsError if hits.length < opt[:min_blast_hits]
  raise unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? &&
               hits[0].is_a?(Query)

  start = Time.new
  # get the first n hits
  n_hits = [n - 1, @hits.length].min
  less_hits = @hits[0..n_hits]
  # get raw sequences for less_hits
  less_hits.delete_if do |hit|
    if hit.raw_sequence.nil?
      hit.raw_sequence = FetchRawSequences.run(hit.identifier,
                                               hit.accession_no)
    end
    hit.raw_sequence.nil? ? true : false
  end

  raise NoInternetError if less_hits.length.zero?

  averages = []

  less_hits.each do |hit|
    coverage = Array.new(hit.length_protein, 0)
    # each residue of the protein should be evluated once only
    ranges_prediction = []

    hit.hsp_list.each do |hsp|
      # align subsequences from the hit and prediction that match
      if !hsp.hit_alignment.nil? && !hsp.query_alignment.nil?
        hit_alignment   = hsp.hit_alignment
        query_alignment = hsp.query_alignment
      else
        align = find_local_alignment(hit, prediction, hsp)
        hit_alignment   = align[0]
        query_alignment = align[1]
      end

      coverage = check_multiple_coverage(hit_alignment, query_alignment,
                                         hsp, coverage, ranges_prediction)

      ranges_prediction << (hsp.match_query_from..hsp.match_query_to)
    end
    overlap = coverage.reject(&:zero?)
    if overlap != []
      averages.push((overlap.inject(:+) / (overlap.length + 0.0)).round(2))
    end
  end

  # if all hsps match only one time
  if averages.reject { |x| x == 1 } == []
    @validation_report = DuplicationValidationOutput.new(@short_header,
                                                         @header,
                                                         @description, 1,
                                                         averages)
    @validation_report.run_time = Time.now - start
    return @validation_report
  end

  pval = wilcox_test(averages)

  @validation_report = DuplicationValidationOutput.new(@short_header,
                                                       @header,
                                                       @description, pval,
                                                       averages)
  @run_time = Time.now - start
  @validation_report
rescue NotEnoughHitsError
  @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                            @short_header, @header,
                                            @description)
rescue NoMafftInstallationError
  @validation_report = ValidationReport.new('Mafft error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push NoMafftInstallationError
rescue NoInternetError
  @validation_report = ValidationReport.new('Internet error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push NoInternetError
rescue StandardError
  @validation_report = ValidationReport.new('Unexpected error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push 'Unexpected Error'
end

#wilcox_test(averages) ⇒ Object

wilcox test implementation from statsample ruby gem many thanks to Claudio for helping us with the implementation!



253
254
255
256
257
258
259
260
# File 'lib/genevalidator/validation_duplication.rb', line 253

def wilcox_test(averages)
  wilcox = Statsample::Test.wilcoxon_signed_rank(
    Daru::Vector.new(averages),
    Daru::Vector.new(Array.new(averages.length, 1))
  )

  averages.length < 15 ? wilcox.probability_exact : wilcox.probability_z
end