Class: Fastout::Ranker

Inherits:
Object
  • Object
show all
Defined in:
lib/fastout/ranker.rb

Defined Under Namespace

Classes: Point

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data) ⇒ Ranker

takes a 2-d array, data, where the rows are data points and the columns are the attributes, values should all be numerical

  • data should not be empty or nil will be returned

  • also generates minimum and maximum values for each attribute for later use



73
74
75
76
77
78
79
# File 'lib/fastout/ranker.rb', line 73

def initialize data
  raise "data must have more than one attribute and more than one data point" unless data.size > 1 and data.first.size > 1
  @data = data
  @points = self.class.pointify data
  @minimums, @maximums = compute_minimums_and_maximums
  Point.next_id = 0
end

Instance Attribute Details

#dataObject (readonly)

Returns the value of attribute data.



63
64
65
# File 'lib/fastout/ranker.rb', line 63

def data
  @data
end

#maximumsObject (readonly)

Returns the value of attribute maximums.



63
64
65
# File 'lib/fastout/ranker.rb', line 63

def maximums
  @maximums
end

#minimumsObject (readonly)

Returns the value of attribute minimums.



63
64
65
# File 'lib/fastout/ranker.rb', line 63

def minimums
  @minimums
end

#pointsObject (readonly)

Returns the value of attribute points.



63
64
65
# File 'lib/fastout/ranker.rb', line 63

def points
  @points
end

Class Method Details

.pointify(data) ⇒ Object



65
66
67
# File 'lib/fastout/ranker.rb', line 65

def self.pointify data
  data.map {|attributes| Point.new *attributes }
end

Instance Method Details

#assign_points_to_bins!(bin_widths, bin_count) ⇒ Object

assign each of the data points to a bin based on the given bin_widths, returns a 2-d array in attribute-major order



200
201
202
203
204
205
206
# File 'lib/fastout/ranker.rb', line 200

def assign_points_to_bins! bin_widths, bin_count
  bin_widths.each_with_index do |bin_width, attribute_index|
    points.each do |point|
      point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
    end
  end
end

#bin_index(point, attribute_index, bin_width) ⇒ Object



208
209
210
211
212
213
214
215
216
# File 'lib/fastout/ranker.rb', line 208

def bin_index point, attribute_index, bin_width
  minimum = @minimums[attribute_index]
  maximum = @maximums[attribute_index]

  value = point[attribute_index]
  index = ((value - minimum) / bin_width).floor

  value == maximum ? index - 1 : index
end

#calculate_theta(sample, k, n, q) ⇒ Object

find and rank the points by their outlier score and determine theta (the number of points with an outlier score of n)



115
116
117
118
119
120
# File 'lib/fastout/ranker.rb', line 115

def calculate_theta sample, k, n, q
  s = ranked_outliers sample, k, q
  theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }

  [theta, s]
end

#cluster_neighbors(point, cluster, attribute_indexes, bin_widths) ⇒ Object

find all unclustered points that are neighbors of point on all selected attributes or neighbors in the neighborhood of point; find recursively until no additions can be made



169
170
171
# File 'lib/fastout/ranker.rb', line 169

def cluster_neighbors point, cluster, attribute_indexes, bin_widths
  recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
end

#compute_bin_count(q) ⇒ Object

compute the number of bins for a given q



240
241
242
243
# File 'lib/fastout/ranker.rb', line 240

def compute_bin_count q
  count = (@data.size / q.to_f).ceil
  count < 2 ? 2 : count
end

#compute_bin_widths(q, bin_count) ⇒ Object

determine the widths of the bins based on q



233
234
235
236
237
# File 'lib/fastout/ranker.rb', line 233

def compute_bin_widths q, bin_count
  (0...@data.first.size).map do |attribute_index|
    (@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
  end
end

#compute_minimums_and_maximumsObject



218
219
220
221
222
223
224
225
226
227
228
229
230
# File 'lib/fastout/ranker.rb', line 218

def compute_minimums_and_maximums
  minimums = @data.first.dup
  maximums = @data.first.dup

  @data.each do |attributes|
    attributes.each_with_index do |attribute, attribute_index|
      minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
      maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
    end
  end

  [minimums, maximums]
end

#optimized_ranking(sample, n, theta_target = 1) ⇒ Object

searches the parameter space to find the optimized values of k and q

  • theta_target is the maximum acceptable value of theta, default is 1

  • sample is the number of iterations to perform in estimating the parameters

  • n is the number of points to rank



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/fastout/ranker.rb', line 85

def optimized_ranking sample, n, theta_target=1
  k = 3
  q = 5
  max_q = n / 4
  step_q = 10
  last_theta = n
  theta, s = calculate_theta(sample, k, n, q)

  while (theta > theta_target or theta < last_theta or q < max_q) do
    return s if (theta <= theta_target)

    if (theta >= last_theta)
      # effectiveness declining so try next k
      k += 1
      q -= step_q
      last_theta = n
    else
      # try next q
      q += step_q
      last_theta = theta
    end

    theta, s = calculate_theta(sample, k, n, q)
  end

  s
end

#random_attribute_indexes(number) ⇒ Object

randomly choose number of attribute indexes



162
163
164
# File 'lib/fastout/ranker.rb', line 162

def random_attribute_indexes number
  (0...@data.first.size).sort_by { rand }[0..number]
end

#ranked_outliers(sample_size, k, q) ⇒ Object

chooses k random attributes with an average of q data points in each bin sample times to determine outliers



124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/fastout/ranker.rb', line 124

def ranked_outliers sample_size, k, q
  # determine number of bins and their widths
  bin_count =  compute_bin_count(q)
  bin_widths = compute_bin_widths(q, bin_count)

  # assign points to the attribute bins
  assign_points_to_bins! bin_widths, bin_count

  1.upto(sample_size) {
    score_points_from_a_random_set_of_attributes! k, bin_widths }

  points.sort_by(&:score).reverse
end

#recursively_cluster_neighbors(point, cluster, attribute_indexes, bin_widths, neighbors) ⇒ Object

recursive step of #cluster_neighbors



174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/fastout/ranker.rb', line 174

def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
  fruitful = false

  unclustered_points.each do |unclustered_point|
    next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) or
                unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)

    fruitful = true
    unclustered_point.cluster = cluster
    neighbors << unclustered_point
  end

  if fruitful
    recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
  else
    neighbors
  end
end

#score_points_from_a_random_set_of_attributes!(number_of_attributes_to_choose, all_bin_widths) ⇒ Object

pick a random set of attributes and compute the outlier score for each of the points



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/fastout/ranker.rb', line 140

def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
  cluster = 0
  attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
  bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }

  points.each do |point|
    next if point.clustered?

    point.cluster = (cluster += 1)
    neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths

    point.uncluster!  if neighbors.empty?
  end

  points.each do |point|
    next unless point.clustered?
    point.uncluster!
    point.score += 1
  end
end

#unclustered_pointsObject

find all of the points that don’t already belong to a cluster



194
195
196
# File 'lib/fastout/ranker.rb', line 194

def unclustered_points
  points.select {|point| not point.clustered? }
end