Class: Statsample::Vector

Inherits:
Object
  • Object
show all
Includes:
Enumerable, Writable
Defined in:
lib/statsample/vector.rb,
lib/statsample/graph/gdchart.rb,
lib/statsample/graph/svggraph.rb

Overview

Collection of values on one dimension. Works as a column on a Spreadsheet.

Usage

The fast way to create a vector uses Array.to_vector or Array.to_scale.

v=[1,2,3,4].to_vector(:scale)
v=[1,2,3,4].to_scale

Constant Summary collapse

DEFAULT_OPTIONS =

DEFAULT OPTIONS

{
  :missing_values=>[],
  :today_values=>['NOW','TODAY', :NOW, :TODAY],
  :labels=>{}
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Writable

#save

Constructor Details

#initialize(data = [], type = :nominal, opts = Hash.new) ⇒ Vector

Creates a new Vector object.

data

Array of data.

type

Level of measurement. See Vector#type

opts

Options

:missing_values

Array of missing values. See Vector#missing_values

:today_values

Array of ‘today’ values. See Vector#today_values

:labels

Labels for data values



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/statsample/vector.rb', line 62

# Creates a new Vector.
#
# data:: Array of raw values; anything that is not an Array raises a
#        RuntimeError ("Data should be an array").
# type:: Level of measurement; stored and then passed through the type= writer.
# opts:: Hash that may carry :missing_values, :today_values and :labels.
#        Keys that are absent fall back to DEFAULT_OPTIONS.
def initialize(data=[], type=:nominal, opts=Hash.new)
  raise "Data should be an array" unless data.is_a? Array
  @data=data
  @type=type
  # DEFAULT_OPTIONS holds mutable containers. Give every vector its own copy
  # of the defaults so that changing one vector's missing values or labels in
  # place cannot leak into every other vector built without explicit options.
  # Objects supplied by the caller are still used as given.
  defaults={}
  DEFAULT_OPTIONS.each {|key,value| defaults[key]=value.dup }
  opts=defaults.merge(opts)
  @missing_values=opts[:missing_values]
  @labels=opts[:labels]
  @today_values=opts[:today_values]
  @valid_data=[]
  @data_with_nils=[]
  @date_data_with_nils=[]
  @missing_data=[]
  @has_missing_data=nil
  @scale_data=nil
  set_valid_data_intern
  # Assign through the writer as well, after the valid-data caches are filled.
  self.type=type
end

Instance Attribute Details

#dataObject (readonly)

Original data.



36
37
38
# File 'lib/statsample/vector.rb', line 36

# Reader for @data, the original data array the vector was built from.
def data
  @data
end

#data_with_nilsObject (readonly)

Original data, with all missing values replaced by nils



46
47
48
# File 'lib/statsample/vector.rb', line 46

# Reader for @data_with_nils: the original data with every missing value
# replaced by nil.
def data_with_nils
  @data_with_nils
end

#date_data_with_nilsObject (readonly)

Date data, with all missing values replaced by nils



48
49
50
# File 'lib/statsample/vector.rb', line 48

# Reader for @date_data_with_nils: date data with every missing value
# replaced by nil.
def date_data_with_nils
  @date_data_with_nils
end

#gslObject (readonly)

GSL Object, only available with rbgsl extension and type==:scale



50
51
52
# File 'lib/statsample/vector.rb', line 50

# Reader for @gsl, the GSL object (documented as only available with the
# rbgsl extension and type == :scale).
def gsl
  @gsl
end

#labelsObject

Change label for specific values



52
53
54
# File 'lib/statsample/vector.rb', line 52

# Reader for @labels, the Hash mapping data values to their labels.
def labels
  @labels
end

#missing_dataObject (readonly)

Missing values array



44
45
46
# File 'lib/statsample/vector.rb', line 44

# Reader for @missing_data, the array of values found to be missing.
def missing_data
  @missing_data
end

#missing_valuesObject

Array of values considered as missing. Nil is a missing value, by default



40
41
42
# File 'lib/statsample/vector.rb', line 40

# Reader for @missing_values, the array of values considered as missing
# (nil is always treated as missing, see is_valid?).
def missing_values
  @missing_values
end

#today_valuesObject

Array of values considered as “Today”, with date type. “NOW”, “TODAY”, :NOW and :TODAY are ‘today’ values, by default



42
43
44
# File 'lib/statsample/vector.rb', line 42

# Reader for @today_values, the array of values considered as "today" for
# date vectors.
def today_values
  @today_values
end

#typeObject

Level of measurement. Could be :nominal, :ordinal or :scale



34
35
36
# File 'lib/statsample/vector.rb', line 34

# Reader for @type, the level of measurement (:nominal, :ordinal or :scale;
# other methods of this class also test for :date).
def type
  @type
end

#valid_dataObject (readonly)

Valid data. Equal to data, minus values assigned as missing values



38
39
40
# File 'lib/statsample/vector.rb', line 38

# Reader for @valid_data: the data minus the values treated as missing.
def valid_data
  @valid_data
end

Class Method Details

._load(data) ⇒ Object

:nodoc:



139
140
141
142
# File 'lib/statsample/vector.rb', line 139

# Marshal hook: rebuilds a Vector from the string written by #_dump.
# Only 'data', 'type', 'missing_values' and 'labels' are restored.
# NOTE(review): Marshal.load must only be fed trusted input.
def self._load(data) # :nodoc:
  stored = Marshal.load(data)
  restored_opts = {
    :missing_values => stored['missing_values'],
    :labels => stored['labels']
  }
  Vector.new(stored['data'], stored['type'], restored_opts)
end

Instance Method Details

#+(v) ⇒ Object

Vector sum.

  • If v is a scalar, add this value to all elements

  • If v is an Array or a Vector, it should be the same size as this vector; every item of this vector will be added to the item at the same position in the other vector



310
311
312
# File 'lib/statsample/vector.rb', line 310

# Vector sum: delegates to _vector_ari with the "+" operator, which handles
# both the scalar and the Array/Vector case.
def +(v)
_vector_ari("+",v)
end

#-(v) ⇒ Object

Vector subtraction.

  • If v is a scalar, subtract this value from all elements

  • If v is an Array or a Vector, it should be the same size as this vector; the item at the same position in the other vector will be subtracted from every item of this vector



320
321
322
# File 'lib/statsample/vector.rb', line 320

# Vector subtraction: delegates to _vector_ari with the "-" operator, which
# handles both the scalar and the Array/Vector case.
def -(v)
_vector_ari("-",v)
end

#==(v2) ⇒ Object

Vector equality. Two vector will be the same if their data, missing values, type, labels are equals

Raises:

  • (TypeError)


130
131
132
133
# File 'lib/statsample/vector.rb', line 130

# Vector equality: two vectors are equal when their data, missing values,
# type and labels are all equal.
#
# Raises TypeError when v2 is not exactly a Statsample::Vector.
def ==(v2)
  raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
  # Labels are *compared* here. The previous `@labels=v2.labels` assigned
  # instead, which overwrote this vector's labels and made vectors that differ
  # only in their labels look equal.
  @data==v2.data &&
    @missing_values==v2.missing_values &&
    @type==v2.type &&
    @labels==v2.labels
end

#[](i) ⇒ Object

Retrieves i element of data



270
271
272
# File 'lib/statsample/vector.rb', line 270

# Returns element i of the original data (@data), missing values included.
def [](i)
@data[i]
end

#[]=(i, v) ⇒ Object

Set i element of data. Note: Use set_valid_data if you include missing values



275
276
277
# File 'lib/statsample/vector.rb', line 275

# Sets element i of @data. The valid/missing caches are not refreshed here;
# call set_valid_data afterwards if they are needed.
def []=(i,v)
@data[i]=v
end

#_dump(i) ⇒ Object

:nodoc:



135
136
137
# File 'lib/statsample/vector.rb', line 135

# Marshal hook: serializes data, missing values, labels and type as a Hash
# with string keys. The argument i is not used; @today_values is not stored.
def _dump(i) # :nodoc:
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
end

#_frequenciesObject

:nodoc:



546
547
548
549
550
551
552
# File 'lib/statsample/vector.rb', line 546

# Pure-Ruby frequency table: returns a plain Hash mapping every distinct
# value of @valid_data to the number of times it occurs.
def _frequencies #:nodoc:
  tally = {}
  @valid_data.each do |value|
    tally[value] = (tally[value] || 0) + 1
  end
  tally
end

#_set_valid_data_internObject

:nodoc:



230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/statsample/vector.rb', line 230

# Pure-Ruby classification pass over @data. Valid items are appended to
# @valid_data, invalid ones to @missing_data, and @data_with_nils receives
# either the item or nil in its place. Finally records whether anything was
# missing in @has_missing_data.
def _set_valid_data_intern #:nodoc:
  @data.each do |item|
    usable = is_valid?(item)
    (usable ? @valid_data : @missing_data).push(item)
    @data_with_nils.push(usable ? item : nil)
  end
  @has_missing_data = !@missing_data.empty?
end

#_vector_ari(method, v) ⇒ Object

:nodoc:



334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# File 'lib/statsample/vector.rb', line 334

# Shared implementation of element-wise arithmetic (used by + and -).
#
# method:: operator name sent to each element, e.g. "+".
# v::      a Vector/Array of the same size as @data, or a scalar that
#          responds to the operator.
#
# Returns a new Statsample::Vector. Positions where either operand is
# unusable yield nil. Raises ArgumentError on size mismatch and TypeError when
# v is neither a collection nor something responding to the operator.
def _vector_ari(method,v) # :nodoc:
  if v.is_a?(Vector) || v.is_a?(Array)
    unless v.size == @data.size
      raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
    end
    results = (0...v.size).collect do |idx|
      # Vectors are checked with is_valid? on both sides; plain Arrays are
      # only checked for nil.
      usable = (v.is_a?(Vector) && v.is_valid?(v[idx]) && is_valid?(@data[idx])) ||
               (v.is_a?(Array) && !v[idx].nil? && !data[idx].nil?)
      usable ? @data[idx].send(method, v[idx]) : nil
    end
    Statsample::Vector.new(results)
  elsif v.respond_to?(method)
    Statsample::Vector.new(@data.collect { |x| x.nil? ? nil : x.send(method, v) })
  else
    raise TypeError,"You should pass a scalar or a array/vector"
  end
end

#add(v, update_valid = true) ⇒ Object

Add a value at the end of the vector. If the second argument is set to false, you should update the Vector using Vector.set_valid_data at the end of your insertion cycle



191
192
193
194
# File 'lib/statsample/vector.rb', line 191

# Appends v to the end of @data.
#
# When update_valid is true (the default) the valid/missing caches are
# refreshed through set_valid_data and its result is returned; otherwise nil
# is returned and the caller is expected to call set_valid_data later.
def add(v,update_valid=true)
  @data << v
  return nil unless update_valid
  set_valid_data
end

#box_cox_transformation(lambda) ⇒ Object

:nodoc:



113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/statsample/vector.rb', line 113

# Box-Cox transformation of the data. nil entries of @data_with_nils stay
# nil; every other value x becomes Math.log(x) when lambda is 0 and
# (x**lambda - 1) / lambda otherwise. Returns the result as a :scale vector.
#
# Raises RuntimeError unless the vector type is :scale.
def box_cox_transformation(lambda) # :nodoc:
  raise "Should be a scale" unless @type==:scale
  transformed = @data_with_nils.collect do |value|
    next nil if value.nil?
    lambda == 0 ? Math.log(value) : (value**lambda - 1).quo(lambda)
  end
  transformed.to_vector(:scale)
end

#can_be_date?Boolean

Return true if all data is Date, “today” values or nil

Returns:

  • (Boolean)


492
493
494
495
496
497
498
499
# File 'lib/statsample/vector.rb', line 492

# Returns false as soon as @data contains a String that is neither one of
# @today_values nor matches a yyyy-mm-dd / yyyy/mm/dd style pattern; returns
# true otherwise.
# NOTE(review): only Strings can ever be rejected here, so values such as
# Integers pass the check — confirm whether that is intended.
def can_be_date?
  offender = @data.find do |v|
    next false if v.nil? || v.is_a?(Date) || v.is_a?(Time)
    next false unless v.is_a?(String)
    !@today_values.include?(v) && !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/)
  end
  offender ? false : true
end

#can_be_scale?Boolean

Return true if all data is Numeric or nil

Returns:

  • (Boolean)


501
502
503
504
505
506
507
# File 'lib/statsample/vector.rb', line 501

# Returns true when every element of @data is nil, Numeric, or listed in
# @missing_values; false otherwise.
def can_be_scale?
  # none? tests the block result itself. The previous find-based version
  # tested the *element* it found, so an offending `false` in the data was
  # mistaken for "nothing found".
  @data.none? {|v| !v.nil? && !v.is_a?(Numeric) && !@missing_values.include?(v) }
end

#coefficient_of_variationObject Also known as: cov

Coefficient of variation. Calculated with the sample standard deviation



876
877
878
879
# File 'lib/statsample/vector.rb', line 876

# Coefficient of variation: the sample standard deviation divided by the
# mean. check_type :scale is invoked first (presumably rejecting non-scale
# vectors; it is defined outside this listing).
def coefficient_of_variation
  check_type :scale
  deviation = standard_deviation_sample
  deviation.quo(mean)
end

#count(x = false) ⇒ Object

Retrieves number of cases which comply condition. If block given, retrieves number of instances where block returns true. If other values given, retrieves the frequency for this value.



465
466
467
468
469
470
471
472
473
474
475
# File 'lib/statsample/vector.rb', line 465

# Counts cases.
#
# With a block: returns how many elements of @data make the block return a
# truthy value. Without a block: returns the frequency of value x among the
# valid data, or 0 when x does not occur.
def count(x=false)
  if block_given?
    @data.count {|item| yield item }
  else
    # Look the frequency table up once; it used to be built twice per call.
    occurrences = frequencies[x]
    occurrences.nil? ? 0 : occurrences
  end
end

#db_type(dbs = 'mysql') ⇒ Object

Returns the database type for the vector, according to its content



479
480
481
482
483
484
485
486
487
488
489
490
# File 'lib/statsample/vector.rb', line 479

# Returns a database column type for the vector, guessed from the string
# form of its values:
#
# * "DATE"          when any value looks like dd-dd-dddd or dddd-dd-dd,
# * "VARCHAR (255)" when any value contains a character outside 0-9 e . -
# * "DOUBLE"        when any value contains a dot,
# * "INTEGER"       otherwise.
#
# The dbs argument is accepted but not used by this implementation.
def db_type(dbs='mysql')
  # any? looks at the match result. The previous find-based checks looked at
  # the element returned, so a matching `false` value counted as no match.
  texts = @data.collect {|v| v.to_s }
  if texts.any? {|s| s=~/\d{2,2}-\d{2,2}-\d{4,4}/ || s=~/\d{4,4}-\d{2,2}-\d{2,2}/ }
    "DATE"
  elsif texts.any? {|s| s=~/[^0-9e.-]/ }
    "VARCHAR (255)"
  elsif texts.any? {|s| s=~/\./ }
    "DOUBLE"
  else
    "INTEGER"
  end
end

#dichotomize(low = nil) ⇒ Object

Dichotomize the vector with 0 and 1, based on the lowest value. If the parameter is defined, this value and lower will be 0, and higher values will be 1



161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/statsample/vector.rb', line 161

# Dichotomizes the vector into 0 and 1.
#
# low:: cut point; values greater than it become 1, all other non-nil values
#       become 0. Defaults to the smallest factor of the vector.
#
# nil entries of @data_with_nils stay nil. The result is converted with
# Array#to_scale.
def dichotomize(low=nil)
  # factors is only needed for the default cut point; the former unused local
  # computed it on every call, even when low was supplied.
  low||=factors.min
  @data_with_nils.collect{|x|
    if x.nil?
      nil
    elsif x>low
      1
    else
      0
    end
  }.to_scale
end

#dupObject

Creates a duplicate of the Vector. Note: data, missing_values and labels are duplicated, so changes on the original vector don’t propagate to copies.



82
83
84
# File 'lib/statsample/vector.rb', line 82

# Returns a new Vector of the same type built from shallow copies of the
# data, the missing values and the labels.
# NOTE(review): @today_values is not passed on, so the copy gets the default
# "today" values — confirm whether that is intended.
def dup
  copy_opts = {
    :missing_values => @missing_values.dup,
    :labels => @labels.dup
  }
  Vector.new(@data.dup, @type, copy_opts)
end

#dup_emptyObject

Returns an empty duplicate of the vector. Maintains the type, missing values and labels.



87
88
89
# File 'lib/statsample/vector.rb', line 87

# Returns a new, empty Vector that keeps this vector's type and receives
# shallow copies of its missing values and labels.
def dup_empty
  copy_opts = {
    :missing_values => @missing_values.dup,
    :labels => @labels.dup
  }
  Vector.new([], @type, copy_opts)
end

#eachObject

Iterate on each item. Equivalent to

@data.each{|x| yield x}


177
178
179
# File 'lib/statsample/vector.rb', line 177

# Yields every element of @data (missing values included) in order.
def each
  @data.each do |element|
    yield element
  end
end

#each_indexObject

Iterate on each item, retrieving index



182
183
184
185
186
# File 'lib/statsample/vector.rb', line 182

# Yields every index of @data, from 0 up to size - 1.
def each_index
  positions = (0...@data.size)
  positions.each do |position|
    yield position
  end
end

#factorsObject

Retrieves uniques values for data.



526
527
528
529
530
531
532
533
534
# File 'lib/statsample/vector.rb', line 526

# Returns the sorted unique values of the vector. The values are taken from
# @scale_data for :scale vectors, from @date_data_with_nils for :date
# vectors and from @valid_data for every other type.
def factors
  source = case @type
           when :scale then @scale_data
           when :date then @date_data_with_nils
           else @valid_data
           end
  source.uniq.sort
end

#frequenciesObject

:nodoc:



538
539
540
# File 'lib/statsample/vector.rb', line 538

# Returns the frequency table of @valid_data, as computed by
# Statsample::STATSAMPLE__.frequencies (defined outside this listing).
def frequencies
  Statsample::STATSAMPLE__.frequencies(@valid_data)
end

#gdchart_frequencies(file, width = 300, height = 150, chart_type = GDChart::BAR, options = {}) ⇒ Object

Creates a barchart using ruby-gdchart



23
24
25
26
27
28
29
30
31
# File 'lib/statsample/graph/gdchart.rb', line 23

# Draws the frequency table with Statsample::Util.chart_gdchart. Labels are
# the sorted values as strings, data the matching counts.
# Note that the 'ext_color' key is written into the options hash passed in.
def gdchart_frequencies(file, width=300, height=150, chart_type=GDChart::BAR, options={}) # :nodoc:
	pairs=self.frequencies.sort
	labels=pairs.collect{|value,_count| value.to_s }
	data=pairs.collect{|_value,count| count }
	options['ext_color']=[0xFF3399,0xFF9933,0xFFEE33,0x33FF33, 0x9966FF]
	Statsample::Util.chart_gdchart(file,width,height,chart_type, labels,options,1,data)
end

#gdchart_histogram(bins, file, width = 300, height = 150, chart_type = GDChart::BAR, options = {}) ⇒ Object

:nodoc:



32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/statsample/graph/gdchart.rb', line 32

# Draws a histogram with Statsample::Util.chart_gdchart. For each bin index
# from 0 to bins - 1 the bin count is used as data and the midpoint of the
# bin range (as a string) as label.
def gdchart_histogram(bins,file, width=300, height=150, chart_type=GDChart::BAR, options={}) # :nodoc:
    check_type :scale
    h=histogram(bins)
    labels=[]
    data=[]
    (0...bins).each do |bin|
      data << h[bin]
      range=h.get_range(bin)
      midpoint=(range[0]+range[1]) / 2.to_f
      labels << midpoint.to_s
    end
    Statsample::Util.chart_gdchart(file, width, height, chart_type, labels,options, 1,data)
end

#has_missing_data?Boolean

Returns true if data has one or more missing values

Returns:

  • (Boolean)


244
245
246
# File 'lib/statsample/vector.rb', line 244

# Reader for @has_missing_data, the flag recorded when the valid/missing
# caches were last rebuilt.
def has_missing_data?
@has_missing_data
end

#histogram(bins = 10) ⇒ Object

Create a GSL::Histogram With a fixnum, creates X bins within the range of data With an Array, each value will be a cut point



852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
# File 'lib/statsample/vector.rb', line 852

# Builds a GSL::Histogram and fills it with @gsl.
#
# bins:: an Array is passed straight to GSL::Histogram.alloc as cut points;
#        anything else is used as the number of bins over the range of the
#        valid data.
def histogram(bins=10)
  check_type :scale
  h = if bins.is_a?(Array)
        GSL::Histogram.alloc(bins)
      else
        # The upper limit of a bin has the form x < range, so the maximum is
        # nudged up slightly to keep the largest value inside the last bin.
        GSL::Histogram.alloc(bins,[@valid_data.min,@valid_data.max+0.0001])
      end
  h.increment(@gsl)
  h
end

#inspectObject



522
523
524
# File 'lib/statsample/vector.rb', line 522

# Same text as to_s.
def inspect
  to_s
end

#is_valid?(x) ⇒ Boolean

Return true if a value is valid (not nil and not included on missing values)

Returns:

  • (Boolean)


279
280
281
# File 'lib/statsample/vector.rb', line 279

# Returns true when x is neither nil nor listed in @missing_values.
def is_valid?(x)
  !x.nil? && !@missing_values.include?(x)
end

#kurtosisObject

:nodoc:



791
792
793
794
795
796
797
# File 'lib/statsample/vector.rb', line 791

# Excess kurtosis of @scale_data: the sum of fourth powers of the
# deviations from m, divided by n * sd(m)**4, minus 3.
#
# m:: centre of the deviations; defaults to the mean.
def kurtosis(m=nil)
    check_type :scale
    m||=mean
    fourth_powers=@scale_data.inject(0){|acc,x| acc+((x-m)**4)}
    denominator=(@scale_data.size)*sd(m)**4
    fourth_powers.quo(denominator)-3
end

#labeling(x) ⇒ Object

Retrieves label for value x. Retrieves x if no label defined.



249
250
251
# File 'lib/statsample/vector.rb', line 249

# Returns the label registered for x in @labels as a String, or x itself as
# a String when no label is defined.
def labeling(x)
  @labels.fetch(x, x).to_s
end

#maxObject

Maximum value



684
685
686
687
# File 'lib/statsample/vector.rb', line 684

# Largest value among @valid_data (nil when there is no valid data).
# check_type :ordinal is invoked first.
def max
  check_type :ordinal
  @valid_data.max
end

#meanObject

:nodoc:



733
734
735
736
# File 'lib/statsample/vector.rb', line 733

# Arithmetic mean: the sum converted to Float, divided by the number of
# valid cases. check_type :scale is invoked first.
def mean
  check_type :scale
  total = sum.to_f
  total.quo(n_valid)
end

#medianObject

Return the median (percentil 50)



669
670
671
672
673
674
675
676
677
# File 'lib/statsample/vector.rb', line 669

# Returns the median. When GSL is available and the vector is :scale the
# value comes from GSL::Stats over the sorted @scale_data; in every other
# case it is percentil(50).
def median
  check_type :ordinal
  return percentil(50) unless Statsample.has_gsl? && @type==:scale
  ordered=GSL::Vector.alloc(@scale_data.sort)
  GSL::Stats::median_from_sorted_data(ordered)
end

#minObject

Minimum value



679
680
681
682
# File 'lib/statsample/vector.rb', line 679

# Smallest value among @valid_data (nil when there is no valid data).
# check_type :ordinal is invoked first.
def min
  check_type :ordinal
  @valid_data.min
end

#modeObject

Returns the most frequent item.



581
582
583
# File 'lib/statsample/vector.rb', line 581

# Returns the most frequent item: the key of the frequency table entry with
# the highest count.
def mode
  top_entry = frequencies.max { |left, right| left[1] <=> right[1] }
  top_entry[0]
end

#n_validObject

The numbers of item with valid data.



585
586
587
# File 'lib/statsample/vector.rb', line 585

# Number of items with valid data, i.e. the size of @valid_data.
def n_valid
  @valid_data.size
end

#percentil(q) ⇒ Object

Return the value of the percentil q



647
648
649
650
651
652
653
654
655
656
# File 'lib/statsample/vector.rb', line 647

# Returns the value of percentile q (0..100) of the valid data.
#
# The position is n_valid * q / 100. A fractional position selects the
# element at its integer part; a whole position averages the two neighbouring
# elements. Returns nil when there is no valid data.
def percentil(q)
  check_type :ordinal
  sorted=@valid_data.sort
  # Nothing to rank: averaging nil elements used to raise a TypeError.
  return nil if sorted.empty?
  v= (n_valid * q).quo(100)
  if(v.to_i!=v)
    sorted[v.to_i]
  else
    # Clamp to the last element so that percentil(100) yields the maximum
    # instead of reading one position past the end of the array.
    last=sorted.size-1
    lower=[(v-0.5).to_i, last].min
    upper=[(v+0.5).to_i, last].min
    (sorted[lower].to_f + sorted[upper]).quo(2)
  end
end

#plot_frequenciesObject

Plot frequencies on a chart, using gnuplot



554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
# File 'lib/statsample/vector.rb', line 554

# Plots the frequency table as a gnuplot histogram.
# The gnuplot library is loaded on first use, inside the method.
def plot_frequencies
require 'gnuplot'
# x collects the sorted values (used as tick labels), y the matching counts.
x=[]
y=[]
self.frequencies.sort.each{|k,v|
    x.push(k)
    y.push(v) 
}
Gnuplot.open do |gp|
  Gnuplot::Plot.new( gp ) do |plot|
    plot.boxwidth("0.9 absolute")
    # The y axis runs from 0 to the highest count.
    plot.yrange("[0:#{y.max}]")
    plot.style("fill  solid 1.00 border -1")
    plot.set("xtics border in scale 1,0.5 nomirror rotate by -45  offset character 0, 0, 0")
    plot.style("histogram")
    plot.style("data histogram")
    # Build the explicit tick list: each value is quoted and paired with its
    # position, counted from 0.
    i=-1
    plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
    # The data set block is intentionally empty.
    plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
        end
    end
  end
    
end

#plot_histogram(bins = 10, options = "") ⇒ Object



867
868
869
870
# File 'lib/statsample/vector.rb', line 867

# Builds the histogram for bins and calls graph(options) on it, returning
# that call's result. check_type :scale is invoked first.
def plot_histogram(bins=10,options="")
    check_type :scale
    built=histogram(bins)
    built.graph(options)
end

#productObject

Product of all values on the sample



800
801
802
803
# File 'lib/statsample/vector.rb', line 800

# Product of all values in @scale_data (1 for an empty sample).
# check_type :scale is invoked first.
def product
    check_type :scale
    result=1
    @scale_data.each {|x| result=result*x }
    result
end

#proportion(v = 1) ⇒ Object

Proportion of a given value.



597
598
599
# File 'lib/statsample/vector.rb', line 597

# Proportion of valid cases equal to v (default 1): the frequency of v
# divided by the number of valid cases.
def proportion(v=1)
    # A value that never occurs has no entry in the frequency table; treat it
    # as zero occurrences instead of calling quo on nil.
    occurrences=frequencies[v] || 0
    occurrences.quo(@valid_data.size)
end

#proportion_confidence_interval_t(n_poblation, margin = 0.95, v = 1) ⇒ Object



628
629
630
# File 'lib/statsample/vector.rb', line 628

# Delegates to Statsample::proportion_confidence_interval_t, passing the
# proportion of v, the number of valid cases, the population size and the
# margin.
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
  share=proportion(v)
  sample_size=@valid_data.size
  Statsample::proportion_confidence_interval_t(share, sample_size, n_poblation, margin)
end

#proportion_confidence_interval_z(n_poblation, margin = 0.95, v = 1) ⇒ Object



631
632
633
# File 'lib/statsample/vector.rb', line 631

# Delegates to Statsample::proportion_confidence_interval_z, passing the
# proportion of v, the number of valid cases, the population size and the
# margin.
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
  share=proportion(v)
  sample_size=@valid_data.size
  Statsample::proportion_confidence_interval_z(share, sample_size, n_poblation, margin)
end

#proportionsObject

Returns a hash with the distribution of proportions of the sample.



590
591
592
593
594
595
# File 'lib/statsample/vector.rb', line 590

# Returns a Hash mapping every value of the frequency table to its share of
# the valid cases (count divided by n_valid).
def proportions
    shares={}
    frequencies.each do |entry|
        shares[entry[0]]=entry[1].quo(n_valid)
    end
    shares
end

#rangeObject

The range of the data (max - min)



723
724
725
726
# File 'lib/statsample/vector.rb', line 723

# The range of the data: largest minus smallest value of @scale_data.
# check_type :scale is invoked first.
def range
  check_type :scale
  highest=@scale_data.max
  lowest=@scale_data.min
  highest - lowest
end

#ranked(type = :ordinal) ⇒ Object

Returns a ranked vector.



658
659
660
661
662
663
664
665
666
667
# File 'lib/statsample/vector.rb', line 658

# Returns a vector of ranks, converted with Array#to_vector(type).
#
# Values are ranked in sorted order; tied values share the average of the
# rank positions they occupy. Items of @data without an entry in the
# frequency table get nil.
def ranked(type=:ordinal)
  check_type :ordinal
  already_ranked=0
  rank_of={}
  frequencies.sort.each do |entry|
    value, occurrences = entry[0], entry[1]
    # Average of the first and last rank position taken by this value.
    rank_of[value]=(already_ranked+1 + already_ranked+occurrences).quo(2)
    already_ranked+=occurrences
  end
  @data.collect {|item| rank_of[item] }.to_vector(type)
end

#recodeObject

Returns a new vector, with data modified by block. Equivalent to create a Vector after #collect on data



145
146
147
148
149
# File 'lib/statsample/vector.rb', line 145

# Returns a new vector of the same type whose data is the block's result
# for every element of @data. The receiver is left untouched.
def recode
  recoded=@data.collect do |item|
    yield item
  end
  recoded.to_vector(@type)
end

#recode!Object

Modifies current vector, with data modified by block. Equivalent to #collect! on @data



152
153
154
155
156
157
# File 'lib/statsample/vector.rb', line 152

# Replaces every element of @data in place with the block's result, then
# refreshes the valid/missing caches through set_valid_data.
def recode!
  @data.collect! do |item|
    yield item
  end
  set_valid_data
end

#sample_with_replacement(sample = 1) ⇒ Object

Returns an random sample of size n, with replacement, only with valid data.

In all the trials, every item has the same probability of being selected.



429
430
431
432
433
434
435
436
437
# File 'lib/statsample/vector.rb', line 429

# Returns an Array with a random sample of the valid data, drawn with
# replacement.
#
# For :scale vectors with GSL available the draw is done by a GSL random
# generator over @gsl; otherwise each item is picked with Kernel#rand.
def sample_with_replacement(sample=1)
  if @type==:scale && Statsample.has_gsl?
    r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
    r.sample(@gsl, sample).to_a
  else
    pool_size=@valid_data.size
    (0...sample).map { @valid_data[rand(pool_size)] }
  end
end

#sample_without_replacement(sample = 1) ⇒ Object

Returns an random sample of size n, without replacement, only with valid data.

Every element could only be selected once.

A sample of the same size of the vector is the vector itself.



445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
# File 'lib/statsample/vector.rb', line 445

# Returns an Array with a random sample of the valid data, drawn without
# replacement: every valid item can be selected at most once.
#
# For :scale vectors with GSL available the draw is done by a GSL random
# generator over @gsl. Otherwise distinct positions are drawn with
# Kernel#rand, and ArgumentError is raised when more items are requested
# than there are valid cases.
def sample_without_replacement(sample=1)
  if(@type!=:scale or !Statsample.has_gsl?)
    raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
    picked=[]
    size=@valid_data.size
    while picked.size<sample
      position=rand(size)
      picked.push(position) unless picked.include?(position)
    end
    # Positions were drawn over the valid data, so they must be resolved
    # against @valid_data. Reading them from @data returned missing values
    # whenever the vector contained any.
    picked.collect{|i| @valid_data[i] }
  else
    r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
    r.choose(@gsl, sample).to_a
  end
end

#set_valid_dataObject

Update valid_data, missing_data, data_with_nils and gsl at the end of an insertion.

Use after Vector.add(v,false) Usage:

v=Statsample::Vector.new
v.add(2,false)
v.add(4,false)
v.data
=> [2,4]
v.valid_data
=> []
v.set_valid_data
v.valid_data
=> [2,4]


210
211
212
213
214
215
216
217
218
219
# File 'lib/statsample/vector.rb', line 210

# Rebuilds the derived caches after @data changed: empties @valid_data,
# @missing_data, @data_with_nils and @date_data_with_nils in place, resets
# @gsl, re-runs set_valid_data_intern and then the type-specific refresh for
# :scale or :date vectors.
#
# Returns the result of set_date_data for :date vectors and nil otherwise.
def set_valid_data
  [@valid_data, @missing_data, @data_with_nils, @date_data_with_nils].each do |cache|
    cache.clear
  end
  @gsl=nil
  set_valid_data_intern
  set_scale_data if @type==:scale
  return nil unless @type==:date
  set_date_data
end

#set_valid_data_internObject

:nodoc:



222
223
224
# File 'lib/statsample/vector.rb', line 222

# Hands the vector to Statsample::STATSAMPLE__.set_valid_data_intern (defined
# outside this listing) to fill the valid/missing caches.
def set_valid_data_intern #:nodoc:
  Statsample::STATSAMPLE__.set_valid_data_intern(self)
end

#sizeObject Also known as: n

Size of total data



264
265
266
# File 'lib/statsample/vector.rb', line 264

# Size of the total data (@data), missing values included.
def size
@data.size
end

#skewObject

:nodoc:



784
785
786
787
788
789
# File 'lib/statsample/vector.rb', line 784

# Skewness of @scale_data: the sum of cubed deviations from m, divided by
# n * sd(m)**3.
#
# m:: centre of the deviations; defaults to the mean.
def skew(m=nil)
    check_type :scale
    m||=mean
    cubed_deviations=@scale_data.inject(0){|acc,x| acc+((x-m)**3)}
    denominator=(@scale_data.size)*sd(m)**3
    cubed_deviations.quo(denominator)
end

#split_by_separator(sep = Statsample::SPLIT_TOKEN) ⇒ Object

Returns a hash of Vectors, defined by the different values defined on the fields Example:

a=Vector.new(["a,b","c,d","a,b"])
a.split_by_separator
=>  {"a"=>Vector(type:nominal, n:3)[1,0,1],
     "b"=>Vector(type:nominal, n:3)[1,0,1],
     "c"=>Vector(type:nominal, n:3)[0,1,0],
     "d"=>Vector(type:nominal, n:3)[0,1,0]}


394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
# File 'lib/statsample/vector.rb', line 394

# Splits every item with splitted(sep) and returns a Hash with one :nominal
# Vector per distinct token. Each vector holds, per original item, 1 when the
# item contains the token, 0 when it does not, and nil when the item itself
# is nil.
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
  rows=splitted(sep)
  tokens=rows.flatten.uniq.compact
  result={}
  tokens.each do |token|
    flags=rows.collect do |row|
      if row.nil?
        nil
      else
        row.include?(token) ? 1 : 0
      end
    end
    result[token]=Vector.new(flags,:nominal)
  end
  result
end

#split_by_separator_freq(sep = Statsample::SPLIT_TOKEN) ⇒ Object



417
418
419
420
421
422
# File 'lib/statsample/vector.rb', line 417

# Returns a Hash mapping every token found by split_by_separator to the
# number of items that contain it. nil flags (missing items) count as 0.
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
  split_by_separator(sep).inject({}) {|a,v|
    # Seed the sum with 0. Without a seed the first flag was used as-is, so a
    # leading nil raised NoMethodError and an empty vector produced nil.
    a[v[0]]=v[1].inject(0) {|s,x| s+x.to_i}
    a
  }
end

#splitted(sep = Statsample::SPLIT_TOKEN) ⇒ Object

Return an array with the data splitted by a separator.

a=Vector.new(["a,b","c,d","a,b","d"])
a.splitted
  =>
[["a","b"],["c","d"],["a","b"],["d"]]


370
371
372
373
374
375
376
377
378
379
380
# File 'lib/statsample/vector.rb', line 370

# Returns an Array with one entry per item of @data: nil for nil items, the
# result of split(sep) for items that respond to split, and a one-element
# Array holding the item otherwise.
def splitted(sep=Statsample::SPLIT_TOKEN)
  @data.collect do |item|
    next nil if item.nil?
    item.respond_to?(:split) ? item.split(sep) : [item]
  end
end

#standard_deviation_population(m = nil) ⇒ Object Also known as: sdp

:nodoc:



763
764
765
766
# File 'lib/statsample/vector.rb', line 763

# Population standard deviation: the square root of variance_population(m).
# check_type :scale is invoked first.
def standard_deviation_population(m=nil)
  check_type :scale
  spread=variance_population(m)
  Math.sqrt(spread)
end

#standard_deviation_sample(m = nil) ⇒ Object Also known as: sds, sd

:nodoc:



777
778
779
780
781
782
# File 'lib/statsample/vector.rb', line 777

# Sample standard deviation: the square root of variance_sample(m).
#
# m:: centre passed on to variance_sample; defaults to the mean.
def standard_deviation_sample(m=nil)
    check_type :scale
    m=mean if m.nil? || m==false
    spread=variance_sample(m)
    Math.sqrt(spread)
end

#sumObject

:nodoc:



728
729
730
731
# File 'lib/statsample/vector.rb', line 728

# Sum of all values in @scale_data (0 for an empty sample).
# check_type :scale is invoked first.
def sum
  check_type :scale
  total=0
  @scale_data.each {|x| total=x+total }
  total
end

#sum_of_squared_deviationObject

Sum of squared deviation



748
749
750
751
# File 'lib/statsample/vector.rb', line 748

# Sum of squared deviation, computed as the sum of the squared values minus
# the squared sum divided by the number of valid cases.
def sum_of_squared_deviation
  check_type :scale
  squared_total=@scale_data.inject(0) {|acc,x| x.square+acc}
  correction=sum.square.quo(n_valid)
  squared_total - correction
end

#sum_of_squares(m = nil) ⇒ Object Also known as: ss

Sum of squares for the data around a value. By default, this value is the mean

ss= sum{(xi-m)^2}


741
742
743
744
745
# File 'lib/statsample/vector.rb', line 741

# Sum of squares of the data around m: sum{(xi - m)^2}.
#
# m:: centre of the deviations; defaults to the mean.
def sum_of_squares(m=nil)
  check_type :scale
  m=mean if m.nil? || m==false
  total=0
  @scale_data.each {|x| total=total+(x-m).square }
  total
end

#summary(out = "") ⇒ Object



600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
# File 'lib/statsample/vector.rb', line 600

# Appends a textual summary to out and returns it: number of valid cases,
# factors, mode and the distribution (one line per value, using its label
# when one is defined). :ordinal vectors also get the median, :scale vectors
# the mean and the standard deviation.
def summary(out="")
  out << sprintf("n valid:%d\n",n_valid)
  out << sprintf("factors:%s\n",factors.join(","))
  out << "mode:"+mode.to_s+"\n"
  out << "Distribution:\n"
  frequencies.sort.each do |value,occurrences|
    shown=labels.has_key?(value) ? labels[value] : value
    percentage=(occurrences.quo(n_valid))*100
    out << sprintf("%s : %s (%0.2f%%)\n",shown,occurrences,percentage)
  end
  out << "median:"+median.to_s+"\n" if @type==:ordinal
  if @type==:scale
    out << "mean:"+mean.to_s+"\n"
    out << "sd:"+sd.to_s+"\n"
  end
  out
end

#svggraph_boxplot(options = {}) ⇒ Object



54
55
56
57
58
59
60
61
# File 'lib/statsample/graph/svggraph.rb', line 54

# Builds a Statsample::Graph::SvgBoxplot for this vector and returns it.
#
# options:: merged over the defaults (title "Boxplot", a single 'vector'
#           field, title shown).
def svggraph_boxplot(options={})
  check_type :scale
  options={:graph_title=>"Boxplot", :fields=>['vector'], :show_graph_title=>true}.merge! options
  graph = Statsample::Graph::SvgBoxplot.new(options)
  # The former local that converted the valid data to a scale vector was
  # never used and has been dropped.
  # NOTE(review): the plot receives @data, which may still contain missing
  # values — confirm whether the valid data was meant here.
  graph.add_data(:title=>"vector", :data=>@data.to_a)
  graph
end

#svggraph_frequencies(file, width = 600, height = 300, chart_type = SVG::Graph::BarNoOp, options = {}) ⇒ Object

Creates a bar chart of the frequencies using the given SVG::Graph chart class (SVG::Graph::BarNoOp by default) and writes it to file



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/statsample/graph/svggraph.rb', line 10

# Renders the frequency table with the given chart class and writes the
# result out.
#
# file:: an object responding to write receives the rendered chart through
#        write; anything else is treated as a path (via to_s) and written in
#        binary mode with puts.
#
# The :height, :width and :fields keys are written into the options hash
# passed in.
def svggraph_frequencies(file, width=600, height=300, chart_type=SVG::Graph::BarNoOp, options={})
  pairs=self.frequencies.sort
  labels=pairs.collect {|value,_count| value.to_s }
  counts=pairs.collect {|_value,count| count }
  options[:height]=height
  options[:width]=width
  options[:fields]=labels
  graph = chart_type.new(options)
  graph.add_data(:data => counts, :title => "Frequencies")
  if file.respond_to? :write
    file.write(graph.burn)
  else
    File.open(file.to_s,"wb") do |f|
      f.puts(graph.burn)
    end
  end
end

#svggraph_histogram(bins, options = {}) ⇒ Object



34
35
36
37
38
39
40
# File 'lib/statsample/graph/svggraph.rb', line 34

# Builds a Statsample::Graph::SvgHistogram, assigns it the histogram for
# bins and returns it.
#
# options:: merged over the defaults (title "Histogram", title shown, normal
#           curve shown, :mean from mean and :sigma from sdp).
def svggraph_histogram(bins, options={})
  check_type :scale
  defaults={:graph_title=>"Histogram", :show_graph_title=>true,:show_normal=>true, :mean=>self.mean, :sigma=>sdp }
  options=defaults.merge(options)
  graph = Statsample::Graph::SvgHistogram.new(options)
  graph.histogram=histogram(bins)
  graph
end

#svggraph_lag_plot(options = {}) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
# File 'lib/statsample/graph/svggraph.rb', line 63

# Builds a lag plot: a Statsample::Graph::SvgScatterplot of every valid
# value ('x') against the valid value that precedes it ('x_minus_1').
# Returns the parsed graph.
def svggraph_lag_plot(options={})
  check_type :scale
  options={:graph_title=>"Lag Plot", :show_graph_title=>true}.merge! options
  n_values=@valid_data.size
  previous_values=@valid_data[0...(n_values-1)].to_vector(:scale)
  current_values=@valid_data[1...n_values].to_vector(:scale)
  ds={'x_minus_1'=>previous_values,'x'=>current_values}.to_dataset
  graph = Statsample::Graph::SvgScatterplot.new(ds,options)
  graph.set_x('x_minus_1')
  graph.parse
  graph
end

#svggraph_normalprobability_plot(options = {}) ⇒ Object

Returns a Normal Probability Plot Reference: www.itl.nist.gov/div898/handbook/eda/section3/normprpl.htm



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/statsample/graph/svggraph.rb', line 76

# Builds a normal probability plot: a Statsample::Graph::SvgScatterplot of
# the sorted valid data ('ordered_response') against
# Distribution::Normal.p_value of the normal order statistic median of each
# position ('normal_order_statistics_medians'). Returns the parsed graph.
#
# Extends the receiver with Statsample::Util, which supplies
# normal_order_statistic_medians.
def svggraph_normalprobability_plot(options={})
  extend Statsample::Util
  check_type :scale
  options={:graph_title=>"Normal Probability Plot", :show_graph_title=>true}.merge! options
  n=@valid_data.size
  theoretical=(1..n).collect do |i|
    Distribution::Normal.p_value(normal_order_statistic_medians(i,n))
  end
  vx=theoretical.to_vector(:scale)
  vy=@valid_data.sort.to_vector(:scale)
  ds={'normal_order_statistics_medians'=>vx, 'ordered_response'=>vy}.to_dataset
  graph = Statsample::Graph::SvgScatterplot.new(ds,options)
  graph.set_x('normal_order_statistics_medians')
  graph.parse
  graph
end

#svggraph_runsequence_plot(options = {}) ⇒ Object

Returns a Run-Sequence Plot Reference: www.itl.nist.gov/div898/handbook/eda/section3/runseqpl.htm



43
44
45
46
47
48
49
50
51
52
53
# File 'lib/statsample/graph/svggraph.rb', line 43

# Builds a run-sequence plot: a Statsample::Graph::SvgScatterplot of every
# item of @data ('value') against its 1-based position ('index'). Returns the
# parsed graph.
def svggraph_runsequence_plot(options={})
  check_type :scale
  defaults={:graph_title=>"Run-Sequence Plot", :show_graph_title=>true, :scale_x_integers => true, :add_popups=>true }
  options=defaults.merge(options)
  positions=(1..@data.size).to_a.to_vector(:scale)
  values=@data.to_vector(:scale)
  ds={'index'=>positions,'value'=>values}.to_dataset
  graph = Statsample::Graph::SvgScatterplot.new(ds,options)
  graph.set_x('index')
  graph.parse
  graph
end

#to_aObject Also known as: to_ary



300
301
302
# File 'lib/statsample/vector.rb', line 300

# Returns a shallow copy of @data, so changes to the returned Array do not
# affect the vector.
def to_a
@data.dup
end

#to_matrix(dir = :horizontal) ⇒ Object

Ugly name. Really, create a Vector for standard ‘matrix’ package. dir could be :horizontal or :vertical



514
515
516
517
518
519
520
521
# File 'lib/statsample/vector.rb', line 514

# Builds a Matrix (standard 'matrix' library) from @data.
#
# dir:: :horizontal gives a single-row matrix, :vertical a single-column
#       matrix; any other value returns nil.
def to_matrix(dir=:horizontal)
  if :horizontal == dir
    Matrix[@data]
  elsif :vertical == dir
    Matrix.columns([@data])
  end
end

#to_sObject



509
510
511
# File 'lib/statsample/vector.rb', line 509

# Text form of the vector: its type, its size and the comma separated data,
# with nil items shown as "nil".
def to_s
  rendered=@data.collect{|d| d.nil? ? "nil":d}.join(",")
  sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, rendered)
end

#variance_population(m = nil) ⇒ Object

:nodoc:



754
755
756
757
758
759
# File 'lib/statsample/vector.rb', line 754

# Population variance, computed as the mean of the squared values minus the
# square of m.
#
# m:: centre; defaults to the mean.
def variance_population(m=nil)
  check_type :scale
  m=mean if m.nil? || m==false
  squared_total=0
  @scale_data.each {|x| squared_total=x.square+squared_total }
  squared_total.quo(n_valid) - m.square
end

#variance_proportion(n_poblation, v = 1) ⇒ Object

Variance of p, according to population size



621
622
623
# File 'lib/statsample/vector.rb', line 621

# Delegates to Statsample::proportion_variance_sample, passing the
# proportion of v, the number of valid cases and the population size.
def variance_proportion(n_poblation, v=1)
  share=self.proportion(v)
  sample_size=@valid_data.size
  Statsample::proportion_variance_sample(share, sample_size, n_poblation)
end

#variance_sample(m = nil) ⇒ Object Also known as: variance

:nodoc:



769
770
771
772
773
# File 'lib/statsample/vector.rb', line 769

# Sample variance: sum_of_squares(m) divided by n_valid - 1.
#
# m:: centre passed on to sum_of_squares; defaults to the mean.
def variance_sample(m=nil)
  check_type :scale
  m=mean if m.nil? || m==false
  degrees=n_valid - 1
  sum_of_squares(m).quo(degrees)
end

#variance_total(n_poblation, v = 1) ⇒ Object

Variance of p, according to poblation size



625
626
627
# File 'lib/statsample/vector.rb', line 625

# Delegates to Statsample::total_variance_sample, passing the proportion of
# v, the number of valid cases and the population size.
def variance_total(n_poblation, v=1)
  share=self.proportion(v)
  sample_size=@valid_data.size
  Statsample::total_variance_sample(share, sample_size, n_poblation)
end

#vector_labeledObject

Returns a Vector with data with labels replaced by the label.



253
254
255
256
257
258
259
260
261
262
# File 'lib/statsample/vector.rb', line 253

# Returns a new Vector of the same type in which every value that has an
# entry in @labels is replaced by its label; other values are kept as-is.
def vector_labeled
  relabeled=@data.collect {|x| @labels.fetch(x, x) }
  Vector.new(relabeled,@type)
end

#vector_standarized(use_population = false) ⇒ Object Also known as: standarized

Returns a vector using the standardized values for data, with sd with denominator n-1



99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/statsample/vector.rb', line 99

# Returns a :scale vector with the standardized values of the data:
# (x - mean) / sd for every non-nil entry of @data_with_nils, nil otherwise.
#
# use_population:: when true the population standard deviation (sdp) is
#                  used, otherwise the sample one (sds).
#
# Raises RuntimeError unless the vector type is :scale.
def vector_standarized(use_population=false)
  raise "Should be a scale" unless @type==:scale
  centre=mean
  spread=use_population ? sdp : sds
  standardized=@data_with_nils.collect do |x|
    x.nil? ? nil : (x.to_f - centre).quo(spread)
  end
  standardized.to_vector(:scale)
end

#verifyObject

Reports all values that don’t comply with a condition. Returns a hash with the index of data and the invalid data.



325
326
327
328
329
330
331
332
333
# File 'lib/statsample/vector.rb', line 325

# Yields every item of @data and returns a Hash mapping the index of each
# item for which the block returned a falsy value to that item.
def verify
  failures={}
  @data.each_with_index do |item,position|
    failures[position]=item unless yield(item)
  end
  failures
end