Class: MiniStat::Data

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/ministat/data.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(d) ⇒ Data

Returns a new instance of Data.



9
10
11
12
13
# File 'lib/ministat/data.rb', line 9

# Builds a data set from +d+, any collection that responds to +map+.
#
# Each element is converted with +to_f+ and the converted values are stored
# in ascending order, so the sorted flag starts out true. Every memoized
# statistic is reset through +clear_results+.
def initialize(d)
  floats  = d.map(&:to_f)
  @data   = floats.sort
  @sorted = true
  clear_results
end

Instance Attribute Details

#data ⇒ Object (readonly)

Returns the value of attribute data.



7
8
9
# File 'lib/ministat/data.rb', line 7

# Reader for the collection held in @data.
#
# Returns the stored object itself, not a copy.
def data
  @data
end

Instance Method Details

#<<(obj) ⇒ Object



15
16
17
18
19
20
21
22
# File 'lib/ministat/data.rb', line 15

# Appends one observation to the data set.
#
# The value is validated and converted with Kernel#Float, so the stored
# element is a Float just like the elements produced by the constructor.
# Anything Float() cannot convert (nil, non-numeric strings, ...) is rejected.
#
# Appending invalidates every memoized statistic and marks the data as
# unsorted so that the next median computation re-sorts it.
#
# @param obj [Numeric, String] the value to add
# @return [Object] +obj+, exactly as it was passed in
# @raise [ArgumentError] if +obj+ cannot be interpreted as a number
def <<(obj)
  value =
    begin
      Float(obj)
    rescue ArgumentError, TypeError
      raise ArgumentError, "#{obj} is not numeric"
    end

  @data << value
  # force computation!
  clear_results
  @sorted = false
  obj
end

#clear_results ⇒ Object



28
29
30
31
# File 'lib/ministat/data.rb', line 28

# Discards every memoized statistic so that it is recomputed on next use.
#
# Besides the quartile, spread and mean caches this also drops the state
# that feeds the mode computation: the frequency table (@hist) and the
# highest frequency seen so far (@max_freq). Leaving those behind would let
# a later recomputation count the same observations a second time.
#
# @hist is removed rather than set to nil so that a `defined?(@hist)` check
# reports the histogram as absent until it is rebuilt.
#
# @return [nil]
def clear_results
  @q1 = @q3 = @iqr = @outliers = @std_dev = @variance = nil
  @mode = @harmonic_mean = @geometric_mean = nil
  @mean = @max_freq = nil
  remove_instance_variable(:@hist) if instance_variable_defined?(:@hist)
  nil
end

#each(&block) ⇒ Object



24
25
26
# File 'lib/ministat/data.rb', line 24

# Iterates over the stored observations in their current order.
#
# With a block, every element of @data is yielded and @data's own +each+
# result is returned; without one, @data's enumerator is returned.
def each(&block)
  return @data.each unless block

  @data.each(&block)
end

#geometric_mean ⇒ Object

Geometric mean. Only applies to non-negative numbers, and relates to log-normal distribution.



135
136
137
138
139
140
141
142
143
144
# File 'lib/ministat/data.rb', line 135

# Geometric mean of the data set, memoized in @geometric_mean.
#
# The negative-data check runs on every call, before the cache is consulted.
# The value is computed as 2 raised to the arithmetic mean of the base-2
# logarithms; the direct form, (product of all values) ** (1.0 / size),
# overflowed for datasets with large numbers.
#
# Raises RuntimeError when any element is below zero.
def geometric_mean
  raise "Geometric mean only applies to non-negative data" if @data.any? { |value| value < 0 }

  @geometric_mean ||= begin
    log_values = @data.map { |value| Math.log2(value) }
    2 ** mean(log_values)
  end
end

#harmonic_mean ⇒ Object

Harmonic or subcontrary mean. Tends strongly toward the least elements of the dataset.



149
150
151
152
# File 'lib/ministat/data.rb', line 149

# Harmonic mean: the number of observations divided by the sum of their
# reciprocals. The result is memoized in @harmonic_mean.
def harmonic_mean
  return @harmonic_mean if @harmonic_mean

  reciprocal_total = @data.inject(0) { |total, value| total + (1.0 / value) }
  @harmonic_mean = @data.size.to_f / reciprocal_total
end

#hist ⇒ Object

Put the histogram into a string if we have it



156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/ministat/data.rb', line 156

# Renders the frequency table built by the mode computation as text, one
# line per distinct value: the value left-justified to the widest label,
# then " |", then +char+ repeated once per occurrence.
#
# Lines are ordered by numeric value. Sorting the stringified keys would
# place "10.0" before "2.0".
#
# TODO: iterate over a range rather than the keys of @hist -- a histogram
# produced out of the keys won't properly represent flat spots with no data.
#
# This code borrows liberally from the Ruby Cookbook, recipe 5.12 (ORA, 2006).
#
# @param char [String] the bar character (defaults to "#")
# @return [String, nil] the rendered histogram ("" for an empty table), or
#   nil when no frequency table is available
def hist(char = "#")
  return nil unless defined?(@hist) && @hist

  bins  = @hist.sort_by { |value, _count| value }
  width = bins.map { |value, _count| value.to_s.size }.max || 0

  bins.map do |value, count|
    "#{value.to_s.ljust(width)} |#{char * count}\n"
  end.join
end

#iqr ⇒ Object

Interquartile range, i.e., the spread of the middle 50% of the data (Q3 − Q1).



81
82
83
# File 'lib/ministat/data.rb', line 81

# Interquartile range: third quartile minus first quartile.
# The result is memoized in @iqr.
def iqr
  return @iqr if @iqr

  upper = q3
  lower = q1
  @iqr = upper - lower
end

#mean(data = @data) ⇒ Object

Computes the arithmetic mean (the most common kind of average).



96
97
98
# File 'lib/ministat/data.rb', line 96

# Arithmetic mean of +data+ (the stored data set by default).
#
# The sum is accumulated starting from 0.0 so that the division is always a
# floating-point division; with an Integer accumulator an all-Integer input
# such as [1, 2] would be truncated to 1.
#
# The result is also stored in @mean, whichever collection was passed in.
#
# @param data [#inject, #size] the values to average
# @return [Float]
# @raise [ZeroDivisionError] if +data+ is empty
def mean(data=@data)
  raise ZeroDivisionError, "cannot compute the mean of an empty data set" if data.size.zero?

  total = data.inject(0.0) { |sum, value| sum + value }
  @mean = total / data.size
end

#median(data = @data) ⇒ Object

Returns the median of data. Naive implementation – sorts the data first.



36
37
38
39
40
41
42
43
44
45
46
# File 'lib/ministat/data.rb', line 36

# Median of +data+ (the stored data set by default).
#
# When called on the stored data set, @data is sorted in place at most once
# and @sorted records that fact. Any other collection is sorted into a copy:
# the caller's array is left untouched and @sorted, which describes @data
# only, is not modified.
#
# @param data [Array] the values to take the median of
# @return [Numeric, nil] the middle element for an odd count, the average of
#   the two middle elements for an even count, or nil for an empty collection
def median(data=@data)
  if data.equal?(@data)
    unless @sorted
      @data.sort!
      @sorted = true
    end
    ordered = @data
  else
    ordered = data.sort
  end

  return nil if ordered.empty?

  mid = ordered.size / 2
  if ordered.size.even?
    (ordered[mid - 1] + ordered[mid]) / 2.0
  else
    ordered[mid]
  end
end

#mode ⇒ Object

Computes mode and generates a histogram (for free!). (We needed it anyway).



103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/ministat/data.rb', line 103

# Most frequent value in the data set, memoized in @mode.
#
# As a by-product this builds @hist (value => number of occurrences) and
# @max_freq (the highest count). Both are rebuilt from scratch whenever the
# mode has to be recomputed, so counts from an earlier pass over the data
# are never added to again.
#
# Ties go to the value that reaches the highest count first while walking
# @data in its current order.
#
# @return [Object, nil] the mode, or nil when the data set is empty
def mode
  return @mode if @mode

  @hist     = {}
  @max_freq = 0
  @data.each do |val|
    count = @hist[val] = @hist.fetch(val, 0) + 1
    @max_freq, @mode = count, val if count > @max_freq
  end
  @mode
end

#outliers ⇒ Object

Returns an array of outlying data points.



87
88
89
90
91
92
# File 'lib/ministat/data.rb', line 87

# Data points lying more than 1.5 * IQR below the first quartile or above
# the third quartile. The result is memoized in @outliers.
#
# The fences are computed once, before @data is walked. Evaluating the
# quartiles inside the loop can trigger an in-place sort of @data in the
# middle of the iteration, which makes the walk skip some elements and
# visit others twice.
#
# @return [Array] the outlying values ([] for an empty data set, in which
#   case the quartiles are not evaluated at all)
def outliers
  @outliers ||=
    if @data.empty?
      []
    else
      spread     = 1.5 * iqr
      low_fence  = q1 - spread
      high_fence = q3 + spread
      @data.select { |value| value < low_fence || value > high_fence }
    end
end

#partition(pivot, data = @data) ⇒ Object

Partition a set of numbers about pivot



50
51
52
53
54
55
56
57
58
# File 'lib/ministat/data.rb', line 50

# Splits +data+ (the stored data set by default) around +pivot+.
#
# Returns a hash with the elements strictly below the pivot under :low and
# those strictly above it under :high, each in original order. Elements
# that are neither below nor above the pivot appear in neither list.
def partition(pivot, data=@data)
  data.each_with_object({ :low => [], :high => [] }) do |value, halves|
    halves[:high] << value if value > pivot
    halves[:low]  << value if value < pivot
  end
end

#q1 ⇒ Object

First quartile.



69
70
71
# File 'lib/ministat/data.rb', line 69

# First quartile: the median of the values that partition() places below
# the median of the whole data set. The result is memoized in @q1.
def q1
  return @q1 if @q1

  pivot      = median(@data)
  lower_half = partition(pivot, @data)[:low]
  @q1 = median(lower_half)
end

#q3 ⇒ Object

Third quartile



75
76
77
# File 'lib/ministat/data.rb', line 75

# Third quartile: the median of the values that partition() places above
# the median of the whole data set. The result is memoized in @q3.
def q3
  return @q3 if @q3

  pivot      = median(@data)
  upper_half = partition(pivot, @data)[:high]
  @q3 = median(upper_half)
end

#std_dev ⇒ Object

Standard deviation. Square root of variance, measure of the spread of the data about the mean.



128
129
130
# File 'lib/ministat/data.rb', line 128

# Standard deviation: the square root of the variance.
# The result is memoized in @std_dev.
def std_dev
  return @std_dev if @std_dev

  spread_squared = variance
  @std_dev = Math.sqrt(spread_squared)
end

#to_s ⇒ Object

Returns a string with statistical info about a dataset.



174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/ministat/data.rb', line 174

# Builds a multi-line, human-readable summary of the data set.
#
# The string holds one "Label:value" line per statistic, in this order:
# the partition about the median, mean, geometric mean, harmonic mean,
# median, minimum, first quartile, third quartile, maximum, interquartile
# range, outliers, variance and standard deviation.
#
# The `<<-` heredoc only lets the terminator be indented; the four leading
# spaces of each line, and the trailing space some lines carry, are part of
# the returned string.
#
# Every value comes from calling the corresponding method during
# interpolation, so an exception raised by any of them propagates out of
# to_s.
def to_s
  <<-DATA_STR    
    Partition:#{partition(median).inspect} 
    Mean:#{mean}
    Geometric Mean:#{geometric_mean}
    Harmonic Mean:#{harmonic_mean}
    Median:#{median} 
    Min:#{data.min} 
    Q1:#{q1}
    Q3:#{q3}
    Max:#{data.max}
    IQR:#{iqr}
    Outliers:#{outliers.inspect}
    Variance:#{variance} 
    Std Dev:#{std_dev}
  DATA_STR
end

#variance ⇒ Object

Computes variance. Used to measure degree of spread in dataset.



120
121
122
123
# File 'lib/ministat/data.rb', line 120

# Sample variance: the sum of squared deviations from the mean divided by
# (number of observations - 1). The result is memoized in @variance.
#
# The mean is computed once up front; calling mean(@data) inside the
# accumulation block re-sums the whole data set for every element.
#
# The mean is not evaluated for an empty data set, where the accumulation
# yields 0 and the result is 0 / -1.
def variance
  @variance ||= begin
    center = @data.empty? ? nil : mean(@data)
    squared_total = @data.inject(0) { |total, value| total + (value - center) ** 2 }
    squared_total / (@data.size - 1)
  end
end