Class: GeneValidator::Cluster

Inherits:
Object
  • Object
show all
Defined in:
lib/genevalidator/clusterization.rb

Overview

Stores the values belonging to one cluster. Used for clusterization among a vector of values.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lengths) ⇒ Cluster

Returns a new instance of Cluster.



158
159
160
# File 'lib/genevalidator/clusterization.rb', line 158

# Builds a cluster around the given collection of lengths.
# Params:
# +lengths+: stored as-is in @lengths (the object is kept, not copied)
def initialize(lengths)
  @lengths = lengths
end

Instance Attribute Details

#lengthsObject

A hash map containing (length, number of occurrences) pairs.



156
157
158
# File 'lib/genevalidator/clusterization.rb', line 156

# Reader for the lengths stored in this cluster.
# Output:
# the very object held in @lengths (not a copy)
def lengths
  @lengths
end

Instance Method Details

#add(cluster) ⇒ Object

Merges the current cluster with the one given as parameter. Params: cluster: the Cluster object to merge into the current one



267
268
269
270
271
# File 'lib/genevalidator/clusterization.rb', line 267

# Merges the given cluster into the current one.
# When both clusters contain the same length, the occurrence counts are
# summed so that no observation is lost by the merge.
# Params:
# +cluster+: object whose +lengths+ yields (length, count) pairs
# Output:
# the +lengths+ collection of the cluster that was merged in
def add(cluster)
  cluster.lengths.each do |length, count|
    # A length not seen so far starts from a count of zero.
    lengths[length] = lengths.fetch(length, 0) + count
  end
end

#densityObject

Returns the density of the cluster: how many values it contains



177
178
179
180
181
182
183
# File 'lib/genevalidator/clusterization.rb', line 177

# Density of the cluster: the total number of values it holds.
# Output:
# the sum of the occurrence counts over every entry of +lengths+
# (0 when there are no entries)
def density
  lengths.reduce(0) { |total, (_length, count)| total + count }
end

#deviation(clusters, queryLength) ⇒ Object

Returns the deviation of a value from the values in all clusters. Params: clusters: a list of Cluster objects; queryLength: the reference (query) length, used as a number. Output: real number.



255
256
257
258
259
260
261
262
# File 'lib/genevalidator/clusterization.rb', line 255

# Returns how far a query length lies from the mean of the current cluster,
# expressed in units of the standard deviation of the values of all clusters.
# Params:
# +clusters+: list of objects whose +lengths+ yields (length, count) pairs
# +queryLength+: the numeric value to compare against the cluster mean
# Output:
# the absolute distance to +mean+ divided by the standard deviation
def deviation(clusters, queryLength)
  # Expand every (length, count) pair into +count+ copies of +length+ so the
  # standard deviation sees one entry per observation.
  hits = clusters.flat_map do |c|
    c.lengths.flat_map { |length, count| Array.new(count, length) }
  end

  # The spread is computed in Ruby only; no external R session is needed.
  sd = standard_deviation(hits)
  (queryLength - mean).abs / sd
end

#distance(cluster, method = 0) ⇒ Object

Returns the Euclidean distance between the current cluster and the one given as parameter. Params: cluster: Cluster object; method: 0 or 1. method = 0: do not take duplicate values into consideration; method = 1: average linkage clusterization.



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/genevalidator/clusterization.rb', line 192

# Average distance between the values of the current cluster and those of
# another cluster, rounded to 4 decimals.
# Params:
# +cluster+: object whose +lengths+ yields (length, count) pairs
# +method+: 1 weights every pair of lengths by the product of their counts;
#           any other value gives every pair of distinct lengths weight 1
# Output:
# the weighted sum of absolute differences divided by the total weight
def distance(cluster, method = 0)
  weighted = (method == 1)
  total = 0
  norm = 0

  cluster.lengths.each do |other_length, other_count|
    lengths.each do |own_length, own_count|
      pair_weight = weighted ? other_count * own_count : 1
      total += pair_weight * (other_length - own_length).abs
      norm += pair_weight
    end
  end

  # Force float division to obtain the group average distance.
  (total / (norm + 0.0)).round(4)
end

#get_limitsObject

Returns the interval limits of the current cluster



285
286
287
# File 'lib/genevalidator/clusterization.rb', line 285

# Interval limits of the current cluster.
# Output:
# a two-element array [smallest length, largest length] taken from the first
# component of every entry of +lengths+
def get_limits
  observed = lengths.map { |length, _count| length }
  observed.minmax
end

#inside_cluster(value) ⇒ Object

Returns whether the value is inside the cluster Params: value: value to compare Output: :ok or :shorter or :longer



295
296
297
298
299
300
301
302
303
# File 'lib/genevalidator/clusterization.rb', line 295

# Tells where a value lies relative to the interval covered by the cluster.
# Params:
# +value+: value to compare against the limits returned by +get_limits+
# Output:
# :shorter when the value is below the lower limit,
# :longer when it is above the upper limit,
# :ok otherwise (both limits are inclusive)
def inside_cluster(value)
  left, right = get_limits

  # Explicit early returns: a bare `:sym if cond` statement that is not the
  # last expression of the method would simply be discarded.
  return :shorter if value < left
  return :longer if value > right

  :ok
end

#meanObject

Returns the weighted mean value of the cluster



164
165
166
167
168
169
170
171
172
173
# File 'lib/genevalidator/clusterization.rb', line 164

# Weighted mean of the lengths in the cluster, each length being weighted by
# its number of occurrences.
# Output:
# weighted sum divided by total weight, using the division of the operand
# types (integer inputs therefore give a truncated integer result)
def mean
  weighted_sum = lengths.reduce(0) { |acc, (length, n)| acc + length * n }
  total_weight = lengths.reduce(0) { |acc, (_length, n)| acc + n }
  weighted_sum / total_weight
end

Prints the current cluster



275
276
277
278
279
280
281
# File 'lib/genevalidator/clusterization.rb', line 275

# Writes a description of the current cluster to standard error: a header
# with its mean and density, one "length, count" line per entry in sorted
# order, and a closing separator line.
def print
  warn "Cluster: mean = #{mean}, density = #{density}"
  lengths.sort.each { |length, count| warn "#{length}, #{count}" }
  warn '--------------------------'
end

#standard_deviation(lengths = nil) ⇒ Object

Returns the standard deviation of a set of values Params: lengths: a vector of values (optional, by default it takes the values in the cluster) Output: Real number



235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/genevalidator/clusterization.rb', line 235

# Standard deviation of a set of values around the mean of the cluster.
# Params:
# +lengths+: a vector of values (optional; when nil, every stored length is
#            repeated as many times as it occurs in the cluster)
# Output:
# the square root of the summed squared differences to +mean+ divided by
# (number of values - 1); with a single value that quotient is 0.0 / 0
def standard_deviation(lengths = nil)
  values = if lengths.nil?
             @lengths.flat_map { |length, count| Array.new(count, length) }
           else
             lengths
           end

  center = mean
  squared_sum = values.reduce(0) do |acc, value|
    diff = center - value
    acc + diff * diff
  end

  Math.sqrt(squared_sum.to_f / (values.length - 1))
end

#wss(lengths = nil) ⇒ Object

Returns within cluster sum of squares



215
216
217
218
219
220
221
222
223
224
225
226
# File 'lib/genevalidator/clusterization.rb', line 215

# Within-cluster sum of squares.
# Params:
# +lengths+: a vector of values (optional; when nil, every stored length is
#            repeated as many times as it occurs in the cluster)
# Output:
# the sum of the squared differences between +mean+ and each value
def wss(lengths = nil)
  values = if lengths.nil?
             @lengths.flat_map { |length, count| Array.new(count, length) }
           else
             lengths
           end

  center = mean
  values.reduce(0) do |acc, value|
    diff = center - value
    acc + diff * diff
  end
end