Module: TextDataTools::Column

Defined in:
lib/text-data-tools.rb

Overview

Tools for extracting data from text files where the data appears in columns with or without headers for each column.

Defined Under Namespace

Classes: DataFile, NotFoundError

Class Method Summary collapse

Class Method Details

.column_index_from_headers(line, column_header, header_match) ⇒ Object

Raises:

  • (ArgumentError)


151
152
153
154
155
156
157
158
159
# File 'lib/text-data-tools.rb', line 151

# Find the one header on +line+ that matches +column_header+ and return
# its zero-based position among the headers.
#
# The headers are obtained with <tt>line.scan(header_match)</tt>.  A
# Regexp +column_header+ is used as given; anything else is escaped and
# compiled into a Regexp, so a String matches literally anywhere inside
# a header.
#
# Raises ArgumentError when no header matches, or when more than one does.
def self.column_index_from_headers(line, column_header, header_match)
  headers = line.scan(header_match)
  hits = []
  headers.each_with_index do |head, position|
    # Built per header (as before) so an empty header list never touches
    # column_header at all.
    pattern = column_header.kind_of?(Regexp) ? column_header : Regexp.new(Regexp.escape(column_header))
    hits << position if head =~ pattern
  end
  raise ArgumentError, "column_header: #{column_header.inspect} does not match any columns in #{headers.inspect}" if hits.empty?
  raise ArgumentError, "column_header: #{column_header.inspect} matches more than 1 column in #{headers.inspect}" if hits.size > 1
  hits.first
end

.get_1d_array(filename, has_header_line, column_header, match = /\S+/, header_match = /\S+/, skip_blank = true) ⇒ Object

Return a one-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column,  or an integer
giving the zero-based  column number.

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/text-data-tools.rb', line 34

# Return a one-dimensional array holding one column of the text file
# +filename+.  Every element is a String exactly as matched in the file
# (or nil for a line that has no item at the requested position).
#
# filename::        path of the file to read.
# has_header_line:: when true, the first line is read as the header line
#                   and is not treated as data.
# column_header::   an Integer giving the zero-based column number, or a
#                   String / Regexp that is resolved to a column number
#                   from the header line via column_index_from_headers.
# match::           Regexp used with String#scan to split each data line
#                   into items.
# header_match::    Regexp used to split the header line into headers.
# skip_blank::      when true, lines that are empty or contain only
#                   whitespace are ignored.
#
# Raises ArgumentError if +column_header+ is not a String, Regexp or
# Integer, and RuntimeError if a String/Regexp +column_header+ is given
# while +has_header_line+ is false.
def self.get_1d_array(filename, has_header_line, column_header, match=/\S+/, header_match=/\S+/, skip_blank=true)
  unless [String, Regexp, Integer].any? { |cls| column_header.kind_of?(cls) }
    raise ArgumentError, "column_header header should be a string, regex or integer"
  end
  array = []
  File.open(filename) do |file|
    headers = file.gets if has_header_line
    unless column_header.kind_of?(Integer)
      raise "Header search given but has_header_line = false" unless has_header_line
      column_header = column_index_from_headers(headers, column_header, header_match)
    end
    file.each_line do |line|
      # A blank line is any line without a non-whitespace character, not
      # just a bare "\n": this also covers "\r\n" and stray spaces/tabs,
      # which would otherwise contribute a nil element.
      next if skip_blank && line =~ /\A\s*\z/
      array.push line.scan(match)[column_header]
    end
  end
  array
end

.get_1d_array_float(*args) ⇒ Object

Calls get_1d_array and converts all data elements to floats



55
56
57
# File 'lib/text-data-tools.rb', line 55

# Same as get_1d_array (all arguments are forwarded unchanged), with
# every element of the result converted using +to_f+.
def self.get_1d_array_float(*args)
  strings = get_1d_array(*args)
  strings.map(&:to_f)
end

.get_1d_array_integer(*args) ⇒ Object



58
59
60
# File 'lib/text-data-tools.rb', line 58

# Same as get_1d_array (all arguments are forwarded unchanged), with
# every element of the result converted using +to_i+.
def self.get_1d_array_integer(*args)
  strings = get_1d_array(*args)
  strings.map(&:to_i)
end

.get_2d_array(filename, has_header_line, column_header, index_header = nil, match = /\S+/, header_match = /\S+/) ⇒ Object

Return a two-dimensional array containing data from the file filename,

which may or may not have a line of column headers,
in the column column_header, where column_header may be either a string
or a regex which matches the title of the column,  or an integer
giving the zero-based  column number.

It is assumed that the two-dimensional array is stored in a single column.
If index_header is nil, data is assumed to be separated by blank lines.
E.g.
    1.2
    4.2
    7.2

    8.2
    4.2
    2.2
If index_header is an integer or string or regexp, it selects a column
in the manner of column_header, and the data is divided by values of this
column.
E.g. 
    1  5.5
    1  3.2
    1  2.6
    2  3.2

    2  2.2
    2  6.3

Match is a regexp that matches data items, and header_match is a regexp that
matches items in the headers.

All data is returned as strings

Raises:

  • (ArgumentError)


93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/text-data-tools.rb', line 93

# Return a two-dimensional array (an array of rows, each an array of
# Strings) built from a single column of the text file +filename+.
#
# filename::        path of the file to read.
# has_header_line:: when true, the first line is read as the header line
#                   and is not treated as data.
# column_header::   an Integer giving the zero-based number of the data
#                   column, or a String / Regexp resolved to a column
#                   number from the header line via
#                   column_index_from_headers.
# index_header::    nil, or a column selector of the same kinds as
#                   +column_header+.
#                   * nil: a blank line ends the current row.  Blank
#                     lines before the first data line are ignored;
#                     every later blank line opens a new (possibly
#                     empty) row.
#                   * otherwise: blank lines are ignored and a new row
#                     is started whenever the value in the index column
#                     differs from that of the previous data line.
# match::           Regexp used with String#scan to split each data line
#                   into items.
# header_match::    Regexp used to split the header line into headers.
#
# Raises ArgumentError if +column_header+ is not a String, Regexp or
# Integer, or if +index_header+ is not a String, Regexp, Integer or nil.
# Raises RuntimeError if a String/Regexp selector is given while
# +has_header_line+ is false.
def self.get_2d_array(filename, has_header_line, column_header, index_header=nil, match=/\S+/, header_match=/\S+/)
  unless [String, Regexp, Integer].any? { |cls| column_header.kind_of?(cls) }
    raise ArgumentError, "column_header header should be a string, regex or integer"
  end
  # This check must look at index_header (it previously re-tested
  # column_header, so an invalid index_header was never rejected).
  unless [String, Regexp, Integer, NilClass].any? { |cls| index_header.kind_of?(cls) }
    raise ArgumentError, "index_header should be a string, regex, integer or nil"
  end
  array = []
  File.open(filename) do |file|
    headers = file.gets if has_header_line
    if [String, Regexp].any? { |cls| column_header.kind_of?(cls) }
      raise "Header search given but has_header_line = false" unless has_header_line
      column_header = column_index_from_headers(headers, column_header, header_match)
    end
    if [String, Regexp].any? { |cls| index_header.kind_of?(cls) }
      raise "Header search given but has_header_line = false" unless has_header_line
      index_header = column_index_from_headers(headers, index_header, header_match)
    end
    index_value = false
    file.each_line do |line|
      blank = line =~ /\A\s*\z/
      if index_header.nil?
        if blank
          # Leading blank lines are dropped; any other blank line
          # closes the current row by opening a fresh one.
          array.push [] unless array.empty?
          next
        end
        # First data line of the file: open the first row.  (This was
        # written as an assignment, `array.size = 0`, which raised
        # NoMethodError on every data line.)
        array.push [] if array.empty?
      elsif blank
        next
      end
      values = line.scan(match)
      unless index_header.nil?
        key = values[index_header]
        # Open a new row for the first data line and on every change
        # of the index column's value.
        if array.empty? || key != index_value
          array.push []
          index_value = key
        end
      end
      # The row being filled is always the most recently opened one.
      array.last.push values[column_header]
    end
  end
  array
end

.get_2d_array_float(*args) ⇒ Object

Calls get_2d_array and converts all data elements to floats



141
142
143
# File 'lib/text-data-tools.rb', line 141

# Same as get_2d_array (all arguments are forwarded unchanged), with
# every element of every row converted using +to_f+.
def self.get_2d_array_float(*args)
  rows = get_2d_array(*args)
  rows.map { |row| row.map(&:to_f) }
end

.get_2d_array_integer(*args) ⇒ Object



144
145
146
# File 'lib/text-data-tools.rb', line 144

# Same as get_2d_array (all arguments are forwarded unchanged), with
# every element of every row converted using +to_i+.
def self.get_2d_array_integer(*args)
  rows = get_2d_array(*args)
  rows.map { |row| row.map(&:to_i) }
end