Module: TextDataTools::Column

Defined in:
lib/text-data-tools.rb

Overview

Tools for extracting data from text files where the data appears in columns with or without headers for each column.

Defined Under Namespace

Classes: DataFile, NotFoundError

Class Method Summary collapse

Class Method Details

.column_index_from_headers(line, column_header, header_match) ⇒ Object

Raises:

  • (ArgumentError)


151
152
153
154
155
156
157
158
159
# File 'lib/text-data-tools.rb', line 151

# Find the position of the one header item that matches column_header.
#
# line          - the header line, split into items with line.scan(header_match)
# column_header - a Regexp, or a String that is escaped and matched literally
#                 (anywhere inside a header item, since the match is unanchored)
# header_match  - Regexp that matches individual header items
#
# Returns the zero-based index of the matching header item.
# Raises ArgumentError if no header item matches, or if more than one does.
def self.column_index_from_headers(line, column_header, header_match)
	titles = line.scan(header_match)
	pattern = nil
	hits = titles.each_index.select do |i|
		# Built on first use so nothing is compiled when there are no header items.
		pattern ||= column_header.kind_of?(Regexp) ? column_header : Regexp.new(Regexp.escape(column_header))
		titles[i] =~ pattern
	end
	if hits.empty?
		raise ArgumentError.new("column_header: #{column_header.inspect} does not match any columns in #{titles.inspect}")
	elsif hits.size > 1
		raise ArgumentError.new("column_header: #{column_header.inspect} matches more than 1 column in #{titles.inspect}")
	end
	hits.first
end

.get_1d_array(filename, has_header_line, column_header, match = /\S+/, header_match = /\S+/, skip_blank = true) ⇒ Object

Return a one-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column, or an integer
giving the zero-based column number.

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/text-data-tools.rb', line 34

# Read one column of a text file into a flat array of strings.
#
# filename        - path of the file to read
# has_header_line - when true, the first line is consumed as the header line
# column_header   - Integer (zero-based column number), or a String/Regexp that
#                   is resolved to a column number via column_index_from_headers
# match           - Regexp that matches the data items on each line
# header_match    - Regexp that matches the items on the header line
# skip_blank      - when true, lines with no non-whitespace character are
#                   skipped; when false such lines contribute nil
#
# Returns an Array with one entry per data line (nil where the line has no
# item in the requested column).
# Raises ArgumentError if column_header is not a String, Regexp or Integer,
# and RuntimeError if a String/Regexp header is given without a header line.
def self.get_1d_array(filename, has_header_line, column_header, match=/\S+/, header_match=/\S+/, skip_blank=true)
	raise ArgumentError.new("column_header header should be a string, regex or integer") unless [String, Regexp, Integer].find{|cls| column_header.kind_of? cls}
	array = []
	File.open(filename) do |file|
		headers = file.gets if has_header_line
		if [String, Regexp].find{|cls| column_header.kind_of? cls}
			raise "Header search given but has_header_line = false" unless has_header_line
			column_header = column_index_from_headers(headers, column_header, header_match)
		end
		while line = file.gets
			# A line is blank when it holds no non-whitespace character; this also
			# covers "\r\n", whitespace-only lines and a final line without newline,
			# which a plain comparison with "\n" would let through as nil entries.
			next if skip_blank && line !~ /\S/
			values = line.scan(match)
			array.push(values[column_header])
		end
	end
	array
end

.get_1d_array_float(*args) ⇒ Object

Calls get_1d_array and converts all data elements to floats



55
56
57
# File 'lib/text-data-tools.rb', line 55

# Same arguments as get_1d_array; every element of its result is converted
# with to_f before being returned.
def self.get_1d_array_float(*args)
	strings = get_1d_array(*args)
	strings.map(&:to_f)
end

.get_1d_array_integer(*args) ⇒ Object



58
59
60
# File 'lib/text-data-tools.rb', line 58

# Same arguments as get_1d_array; every element of its result is converted
# with to_i before being returned.
def self.get_1d_array_integer(*args)
	strings = get_1d_array(*args)
	strings.map(&:to_i)
end

.get_2d_array(filename, has_header_line, column_header, index_header = nil, match = /\S+/, header_match = /\S+/) ⇒ Object

Return a two-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column, or an integer
giving the zero-based column number.

It is assumed that the two-dimensional array is in one column.
If index_header is nil, data is assumed to be separated by blank lines.
E.g.
		1.2
		4.2
		7.2

		8.2
		4.2
		2.2
If index_header is an integer or string or regexp, it selects a column
in the manner of column_header, and the data is divided by values of this
column.
E.g. 
		1  5.5
		1  3.2
		1  2.6
		2  3.2
		2  2.2
		2  6.3

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/text-data-tools.rb', line 93

# Read one column of a text file into an array of arrays of strings.
#
# filename        - path of the file to read
# has_header_line - when true, the first line is consumed as the header line
# column_header   - Integer (zero-based column number), or a String/Regexp that
#                   is resolved to a column number via column_index_from_headers
# index_header    - nil, or a column selector of the same kinds as column_header
# match           - Regexp that matches the data items on each line
# header_match    - Regexp that matches the items on the header line
#
# Grouping:
# * index_header nil: blank lines before the first data line are ignored; every
#   later blank line starts a new group (so consecutive or trailing blank lines
#   yield empty groups).
# * index_header given: blank lines are ignored and a new group starts whenever
#   the value in the index column differs from that of the previous data line.
#
# Returns an Array of groups, each an Array of the selected column's items.
# Raises ArgumentError if column_header or index_header has an unsupported type,
# and RuntimeError if a String/Regexp header is given without a header line.
def self.get_2d_array(filename, has_header_line, column_header, index_header=nil, match=/\S+/, header_match=/\S+/)
	raise ArgumentError.new("column_header header should be a string, regex or integer") unless [String, Regexp, Integer].find{|cls| column_header.kind_of? cls}
	# Validate index_header itself (the check previously re-tested column_header).
	raise ArgumentError.new("index_header should be a string, regex, integer or nil") unless [String, Regexp, Integer, NilClass].find{|cls| index_header.kind_of? cls}
	array = []
	File.open(filename) do |file|
		headers = file.gets if has_header_line
		if [String, Regexp].find{|cls| column_header.kind_of? cls}
			raise "Header search given but has_header_line = false" unless has_header_line
			column_header = column_index_from_headers(headers, column_header, header_match)
		end
		if [String, Regexp].find{|cls| index_header.kind_of? cls}
			raise "Header search given but has_header_line = false" unless has_header_line
			index_header = column_index_from_headers(headers, index_header, header_match)
		end
		group_key = nil
		while line = file.gets
			if line !~ /\S/
				# Blank line: a group separator only in blank-line mode, and only
				# once some data has been seen.
				array.push([]) if index_header.nil? && !array.empty?
				next
			end
			values = line.scan(match)
			if array.empty?
				# First data line opens the first group.
				array.push([])
				group_key = values[index_header] unless index_header.nil?
			elsif !index_header.nil? && values[index_header] != group_key
				# Index column changed value: open the next group.
				array.push([])
				group_key = values[index_header]
			end
			array.last.push(values[column_header])
		end
	end
	array
end

.get_2d_array_float(*args) ⇒ Object

Calls get_2d_array and converts all data elements to floats



141
142
143
# File 'lib/text-data-tools.rb', line 141

# Same arguments as get_2d_array; every element of every group in its result
# is converted with to_f before being returned.
def self.get_2d_array_float(*args)
	groups = get_2d_array(*args)
	groups.map { |row| row.map(&:to_f) }
end

.get_2d_array_integer(*args) ⇒ Object



144
145
146
# File 'lib/text-data-tools.rb', line 144

# Same arguments as get_2d_array; every element of every group in its result
# is converted with to_i before being returned.
def self.get_2d_array_integer(*args)
	groups = get_2d_array(*args)
	groups.map { |row| row.map(&:to_i) }
end