Module: ColumnsMatcher::Statistics

Defined in:
lib/columns_matcher/statistics.rb

Class Method Summary collapse

Class Method Details

.entropy(values) ⇒ Object

values is an array of values. Returns the entropy statistic.



7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/columns_matcher/statistics.rb', line 7

# Computes the entropy statistic of +values+ by delegating to R.
#
# Opens a new Rserve::Simpler connection, loads the R "entropy" package,
# converts +values+ to a numeric array and then to an R vector expression
# through Converter, and evaluates `entropy(table(<vector>))` in R.
#
# values - an array of values.
#
# Returns whatever the R `entropy(...)` call evaluates to, as handed back
# by Rserve::Simpler#converse.
def self.entropy(values)
  connection = Rserve::Simpler.new
  connection.converse('library("entropy")')

  r_vector = Converter.convert_from_array_to_r_vector(
    Converter.convert_from_array_to_numeric_array(values)
  )

  # The result of the last R command is the method's return value.
  connection.converse("entropy(table(#{r_vector}))")
end

.euclidian_distance(first_vector, second_vector) ⇒ Object

first_vector and second_vector are two arrays of potentially different sizes. Returns the Euclidean distance between the two vectors.



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/columns_matcher/statistics.rb', line 41

# Computes the Euclidean distance between two numeric vectors.
#
# The vectors may have different sizes: the shorter one is treated as if it
# were padded with zeros up to the size of the longer one. (The previous
# implementation padded with nil instead of 0, so any size mismatch raised
# an error during subtraction.)
#
# first_vector  - an array of numbers.
# second_vector - an array of numbers.
#
# Returns the distance as a Float (0.0 when both vectors are empty).
def self.euclidian_distance(first_vector, second_vector)
  size = [first_vector.size, second_vector.size].max

  sum_of_squares = (0...size).reduce(0) do |sum, index|
    # fetch(index, 0) supplies the zero padding only for indexes past the end
    # of a vector; elements actually present are used unchanged.
    difference = first_vector.fetch(index, 0) - second_vector.fetch(index, 0)

    sum + difference**2
  end

  Math.sqrt(sum_of_squares)
end

.munkres_assignment_algorithm(cost_matrix) ⇒ Object

cost_matrix is a square matrix of costs. Returns the pairings that optimize costs, i.e. those with the lowest overall cost.



60
61
62
63
64
# File 'lib/columns_matcher/statistics.rb', line 60

# Solves the assignment problem for +cost_matrix+ by delegating to Munkres.
#
# cost_matrix - the matrix of costs handed to Munkres.new.
#
# Returns the result of Munkres#find_pairings for that matrix.
def self.munkres_assignment_algorithm(cost_matrix)
  Munkres.new(cost_matrix).find_pairings
end

.mutual_information(first_values, second_values) ⇒ Object

first_values and second_values are arrays of values. Returns the mutual information statistic.



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/columns_matcher/statistics.rb', line 23

# Computes the mutual information statistic of two value arrays by
# delegating to R.
#
# Opens a new Rserve::Simpler connection, loads the R "entropy" package,
# converts both arrays to numeric arrays and then to R vector expressions
# through Converter, and evaluates `mi.plugin(rbind(<first>, <second>))`.
#
# first_values  - an array of values.
# second_values - an array of values.
#
# Returns whatever the R `mi.plugin(...)` call evaluates to, as handed back
# by Rserve::Simpler#converse.
def self.mutual_information(first_values, second_values)
  connection = Rserve::Simpler.new
  connection.converse('library("entropy")')

  # Both numeric conversions run before either R-vector conversion.
  numeric_rows = [first_values, second_values].map do |values|
    Converter.convert_from_array_to_numeric_array(values)
  end
  first_row, second_row = numeric_rows.map do |row|
    Converter.convert_from_array_to_r_vector(row)
  end

  connection.converse("mi.plugin(rbind(#{first_row}, #{second_row}))")
end