Class: GeneValidator::GeneMergeValidation

Inherits:
ValidationTest show all
Extended by:
Forwardable
Defined in:
lib/genevalidator/validation_gene_merge.rb

Overview

This class contains the methods necessary for checking whether there is evidence that the prediction is a merge of multiple genes

Instance Attribute Summary collapse

Attributes inherited from ValidationTest

#cli_name, #description, #header, #run_time, #short_header, #type, #validation_report

Instance Method Summary collapse

Constructor Details

#initialize(prediction, hits, boundary = 10) ⇒ GeneMergeValidation

Initilizes the object Params: prediction: a Sequence object representing the blast query hits: a vector of Sequence objects (representing blast hits) plot_path: name of the input file, used when generatig the plot files boundary: the offset of the hit from which we start analysing the hit



106
107
108
109
110
111
112
113
114
# File 'lib/genevalidator/validation_gene_merge.rb', line 106

def initialize(prediction, hits, boundary = 10)
  super
  @short_header = 'GeneMerge'
  @header       = 'Gene Merge'
  @description  = 'Check whether BLAST hits make evidence about a merge' \
                  ' of two genes that match the predicted gene.'
  @cli_name     = 'merge'
  @boundary     = boundary
end

Instance Attribute Details

#hitsObject (readonly)

Returns the value of attribute hits.



97
98
99
# File 'lib/genevalidator/validation_gene_merge.rb', line 97

def hits
  @hits
end

#predictionObject (readonly)

Returns the value of attribute prediction.



96
97
98
# File 'lib/genevalidator/validation_gene_merge.rb', line 96

def prediction
  @prediction
end

Instance Method Details

#plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits) ⇒ Object

Generates a json file containing data used for plotting the start/end of the matched region offsets in the prediction Param slope: slope of the linear regression line y_intercept: the ecuation of the line is y= slope*x + y_intercept output: location where the plot will be saved in jped file format hits: array of Sequence objects



229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/genevalidator/validation_gene_merge.rb', line 229

def plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits)
  pairs = hits.map do |hit|
    Pair.new(hit.hsp_list.map(&:match_query_from).min,
             hit.hsp_list.map(&:match_query_to).max)
  end

  data = hits.map do |hit|
    { 'x' => hit.hsp_list.map(&:match_query_from).min,
      'y' => hit.hsp_list.map(&:match_query_to).max,
      'color' => 'red' }
  end

  Plot.new(data,
           :scatter,
           'Gene Merge Validation: Start/end of matching hit coord. on' \
           ' query (1 point/hit)',
           '',
           'Start Offset (most left hsp)',
           'End Offset (most right hsp)',
           y_intercept.to_s,
           slope.to_s)
end

#plot_matched_regions(hits = @hits) ⇒ Object

Generates a json file containing data used for plotting the matched region of the prediction for each hit Param output: location where the plot will be saved in jped file format hits: array of Sequence objects prediction: Sequence objects



190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/genevalidator/validation_gene_merge.rb', line 190

def plot_matched_regions(hits = @hits)
  no_lines = hits.length

  hits_less = hits[0..[no_lines, hits.length - 1].min]

  data = hits_less.each_with_index.map do |hit, i|
    { 'y' => i,
      'start' => hit.hsp_list.map(&:match_query_from).min,
      'stop' => hit.hsp_list.map(&:match_query_to).max,
      'color' => 'black',
      'dotted' => 'true' }
  end .flatten +
         hits_less.each_with_index.map do |hit, i|
           hit.hsp_list.map do |hsp|
             { 'y' => i,
               'start' => hsp.match_query_from,
               'stop' => hsp.match_query_to,
               'color' => 'orange' }
           end
         end .flatten

  Plot.new(data,
           :lines,
           'Gene Merge Validation: Query coord covered by blast hit' \
           ' (1 line/hit)',
           '',
           'Offset in Prediction',
           'Hit Number',
           hits_less.length)
end

#runObject

Validation test for gene merge Output: GeneMergeValidationOutput object



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# File 'lib/genevalidator/validation_gene_merge.rb', line 120

def run
  raise NotEnoughHitsError if hits.length < opt[:min_blast_hits]
  raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)

  start = Time.now

  pairs = hits.map do |hit|
    Pair.new(hit.hsp_list.map(&:match_query_from).min,
             hit.hsp_list.map(&:match_query_to).max)
  end
  xx_0 = pairs.map(&:x)
  yy_0 = pairs.map(&:y)

  # minimum start shoud be at 'boundary' residues
  xx = xx_0.map { |x| x < @boundary ? @boundary : x }

  # maximum end should be at length - 'boundary' residues
  yy = yy_0.map do |y|
    if y > @prediction.raw_sequence.length - @boundary
      @prediction.raw_sequence.length - @boundary
    else
      y
    end
  end

  line_slope = slope(xx, yy, (1..hits.length).map { |x| 1 / (x + 0.0) })
  ## YW - what is this weighting?

  unimodality = false
  if unimodality_test(xx, yy)
    unimodality = true
    lm_slope = 0.0
  else
    lm_slope = line_slope[1]
  end

  y_intercept = line_slope[0]

  @validation_report = GeneMergeValidationOutput.new(@short_header, @header,
                                                     @description, lm_slope,
                                                     unimodality)
  plot1 = if unimodality
            plot_2d_start_from
          else
            plot_2d_start_from(lm_slope, y_intercept)
          end

  @validation_report.plot_files.push(plot1)
  plot2 = plot_matched_regions
  @validation_report.plot_files.push(plot2)
  @validation_report.run_time = Time.now - start
  @validation_report
rescue NotEnoughHitsError
  @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                            @short_header, @header,
                                            @description)
rescue StandardError
  @validation_report = ValidationReport.new('Unexpected error', :error,
                                            @short_header, @header,
                                            @description)
  @validation_report.errors.push 'Unexpected Error'
end

#slope(xx, yy, weights = nil) ⇒ Object

Caclulates the slope of the regression line give a set of 2d coordonates of the start/stop offests of the hits Param xx: Array of integers yy : Array of integers weights: Array of integers Output: The ecuation of the regression line: [y slope]



261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# File 'lib/genevalidator/validation_gene_merge.rb', line 261

def slope(xx, yy, weights = nil)
  weights = Array.new(hits.length, 1) if weights.nil?

  # calculate the slope
  xx_weighted = xx.each_with_index.map { |x, i| x * weights[i] }
  yy_weighted = yy.each_with_index.map { |y, i| y * weights[i] }

  denominator = weights.reduce(0) { |a, e| a + e }

  x_mean = xx_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)
  y_mean = yy_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)

  numerator = (0...xx.length).reduce(0) do |a, e|
    a + (weights[e] * (xx[e] - x_mean) * (yy[e] - y_mean))
  end

  denominator = (0...xx.length).reduce(0) do |a, e|
    a + (weights[e] * ((xx[e] - x_mean)**2))
  end

  slope = numerator / (denominator + 0.0)
  y_intercept = y_mean - (slope * x_mean)

  [y_intercept, slope]
end

#slope_statsample(xx, yy) ⇒ Object

Caclulates the slope of the regression line give a set of 2d coordonates of the start/stop offests of the hits Param xx : Array of integers yy : Array of integers Output: The ecuation of the regression line: [y slope]



295
296
297
298
# File 'lib/genevalidator/validation_gene_merge.rb', line 295

def slope_statsample(xx, yy)
  sr = Statsample::Regression.simple(xx.to_scale, yy.to_scale)
  [sr.a, sr.b]
end

#unimodality_test(xx, yy) ⇒ Object

xx and yy are the projections of the 2-d data on the two axes



302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
# File 'lib/genevalidator/validation_gene_merge.rb', line 302

def unimodality_test(xx, yy)
  mean_x = xx.mean
  median_x = xx.median
  mode_x = xx.mode
  sd_x = xx.standard_deviation

  if sd_x == 0
    cond1_x = true
    cond2_x = true
    cond3_x = true
  else
    cond1_x = ((mean_x - median_x).abs / (sd_x + 0.0)) < Math.sqrt(0.6)
    cond2_x = ((mean_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
    cond3_x = ((median_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
  end

  mean_y = yy.mean
  median_y = yy.median
  mode_y = yy.mode
  sd_y = yy.standard_deviation

  if sd_y == 0
    cond1_y = true
    cond2_y = true
    cond3_y = true
  else
    cond1_y = ((mean_y - median_y).abs / (sd_y + 0.0)) < Math.sqrt(0.6)
    cond2_y = ((mean_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
    cond3_y = ((median_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
  end

  if cond1_x && cond2_x && cond3_x && cond1_y && cond2_y && cond3_y
    true
  else
    false
  end
end