Class: Rover::DataFrame
- Inherits:
-
Object
- Object
- Rover::DataFrame
- Defined in:
- lib/rover/data_frame.rb
Instance Method Summary collapse
- #+(other) ⇒ Object
-
#==(other) ⇒ Object
don’t check types.
- #[](where) ⇒ Object
- #[]=(k, v) ⇒ Object
-
#any? ⇒ Boolean
Should this check for columns as well?
- #clear ⇒ Object
-
#concat(other) ⇒ Object
In-place, like Array#concat. TODO: make more performant.
- #delete(key) ⇒ Object
- #dup ⇒ Object
-
#each_row ⇒ Object
return each row as a hash.
-
#empty? ⇒ Boolean
Should this check for columns as well?
- #except(*keys) ⇒ Object
- #except!(*keys) ⇒ Object
- #first(n = nil) ⇒ Object
- #group(columns) ⇒ Object
- #head(n = 5) ⇒ Object
- #include?(key) ⇒ Boolean
-
#initialize(*args) ⇒ DataFrame
constructor
A new instance of DataFrame.
-
#inner_join(other, on: nil) ⇒ Object
see join for options.
-
#inspect ⇒ Object
(also: #to_s)
TODO handle long text better.
- #keys ⇒ Object (also: #names, #vector_names)
- #last(n = nil) ⇒ Object
-
#left_join(other, on: nil) ⇒ Object
see join for options.
- #merge(other) ⇒ Object
- #merge!(other) ⇒ Object
-
#one_hot(drop: false) ⇒ Object
TODO raise error when collision.
- #sample(*args, **kwargs) ⇒ Object
- #shape ⇒ Object
- #size ⇒ Object (also: #length, #count)
- #sort_by(&block) ⇒ Object
- #sort_by! ⇒ Object
- #tail(n = 5) ⇒ Object
- #to_a ⇒ Object
- #to_csv ⇒ Object
- #to_h ⇒ Object
-
#to_html ⇒ Object
for IRuby.
- #to_numo ⇒ Object
- #types ⇒ Object
-
#vectors ⇒ Object
dup to prevent direct modification of keys.
Constructor Details
#initialize(*args) ⇒ DataFrame
Returns a new instance of DataFrame.
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/rover/data_frame.rb', line 3
# Builds a data frame from one of the supported sources:
#
# * another DataFrame       - its column vectors are reused as-is
# * a Hash                  - key => column; values without #to_a are treated
#                             as scalars and expanded via to_vector
# * an Array of Hashes      - one hash per row; keys missing from a row read
#                             as nil through Hash#[]
# * an ActiveRecord relation or model class (only when ActiveRecord is loaded)
#
# Raises ArgumentError for any other source, for Array elements that are not
# hashes, and when the resulting columns differ in size.
def initialize(*args)
  # NOTE(review): the rendered source lost the name of the second value
  # returned by process_args; it is indexed with :types below, so it is
  # restored here as `options` - confirm against lib/rover/data_frame.rb.
  data, options = process_args(args)

  @vectors = {}
  types = options[:types] || {}

  if data.is_a?(DataFrame)
    data.vectors.each do |k, v|
      @vectors[k] = v
    end
  elsif data.is_a?(Hash)
    data.to_h.each do |k, v|
      @vectors[k] =
        if v.respond_to?(:to_a)
          Vector.new(v, type: types[k])
        else
          v
        end
    end

    # handle scalars: expand them to the size of the first real vector,
    # or to a single row when every value is a scalar
    size = @vectors.values.find { |v| v.is_a?(Vector) }&.size || 1
    @vectors.each_key do |k|
      @vectors[k] = to_vector(@vectors[k], size: size, type: types[k])
    end
  elsif data.is_a?(Array)
    raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }

    # pivot rows into columns, keeping keys in order of first appearance
    data.flat_map(&:keys).uniq.each do |k|
      @vectors[k] = to_vector(data.map { |d| d[k] }, type: types[k])
    end
  elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
    result = data.connection.select_all(data.all.to_sql)
    result.columns.each_with_index do |k, i|
      @vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
    end
  else
    raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
  end

  # check keys
  @vectors.each_key do |k|
    check_key(k)
  end

  # check sizes
  sizes = @vectors.values.map(&:size).uniq
  if sizes.size > 1
    raise ArgumentError, "Different sizes: #{sizes}"
  end
end
Instance Method Details
#+(other) ⇒ Object
312 313 314 |
# File 'lib/rover/data_frame.rb', line 312
# Non-destructive counterpart of #concat: works on a copy made with #dup and
# returns whatever #concat returns for that copy.
def +(other)
  copy = dup
  copy.concat(other)
end
#==(other) ⇒ Object
don’t check types
353 354 355 356 357 |
# File 'lib/rover/data_frame.rb', line 353
# Equality on row count, column names (including their order) and per-column
# comparison via `self[k] == other[k]`.
# don't check types
#
# Returns false for anything that is not a DataFrame, so `df == nil` answers
# false instead of raising NoMethodError on `other.size`.
def ==(other)
  return false unless other.is_a?(DataFrame)

  size == other.size &&
    keys == other.keys &&
    keys.all? { |k| self[k] == other[k] }
end
#[](where) ⇒ Object
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/rover/data_frame.rb', line 64
# Indexing with three modes, chosen by the kind of `where`:
#
# * row selection - a Vector whose Numo data is Numo::Bit, a Numeric, a Range,
#   or an Array made only of Integers (an empty Array qualifies, because
#   all? is true for no elements); each column is indexed with `where` and
#   the results are wrapped in a new DataFrame
# * any other Array - treated as a list of column keys; returns a new
#   DataFrame filled through #[]=
# * anything else - treated as a single column key; returns the stored column
#   (nil when the key is absent, as with Hash#[])
def [](where)
  selects_rows =
    (where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)) ||
    where.is_a?(Numeric) ||
    where.is_a?(Range) ||
    (where.is_a?(Array) && where.all? { |v| v.is_a?(Integer) })

  if selects_rows
    DataFrame.new(@vectors.map { |k, v| [k, v[where]] }.to_h)
  elsif where.is_a?(Array)
    # multiple columns
    where.each_with_object(DataFrame.new) { |k, df| df[k] = @vectors[k] }
  else
    # single column
    @vectors[where]
  end
end
#[]=(k, v) ⇒ Object
100 101 102 103 104 105 |
# File 'lib/rover/data_frame.rb', line 100
# Adds or replaces column `k`.
#
# The key is validated with check_key and the value converted with
# to_vector (passing the current row count). Once the frame has at least one
# column, a converted value of a different size raises ArgumentError.
def []=(k, v)
  check_key(k)
  column = to_vector(v, size: size)
  if @vectors.any? && column.size != size
    raise ArgumentError, "Size mismatch: expected #{size}, got #{column.size}"
  end
  @vectors[k] = column
end
#any? ⇒ Boolean
should this check for columns as well?
114 115 116 |
# File 'lib/rover/data_frame.rb', line 114
# True when #size is greater than zero. Only the row count is consulted;
# whether columns exist is not checked separately.
def any?
  size.positive?
end
#clear ⇒ Object
123 124 125 |
# File 'lib/rover/data_frame.rb', line 123
# Drops every column by emptying the internal column hash in place.
# The return value is that (now empty) hash, as returned by Hash#clear.
def clear
  @vectors.clear
end
#concat(other) ⇒ Object
In-place, like Array#concat. TODO: make more performant.
318 319 320 321 322 323 324 325 326 327 328 329 |
# File 'lib/rover/data_frame.rb', line 318
# Appends the rows of `other` to this frame in place and returns self.
#
# Columns present here but missing from `other` are padded with nil for
# each of other's rows; columns only `other` has are added with nil for
# each pre-existing row. Raises ArgumentError unless `other` is a DataFrame.
def concat(other)
  raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)

  # remember the row count before any column is replaced
  rows_before = self.size

  vectors.each do |k, v|
    appended = other[k] ? other[k].to_a : [nil] * other.size
    @vectors[k] = Vector.new(v.to_a + appended)
  end

  (other.vector_names - vector_names).each do |k|
    @vectors[k] = Vector.new([nil] * rows_before + other[k].to_a)
  end

  self
end
#delete(key) ⇒ Object
137 138 139 |
# File 'lib/rover/data_frame.rb', line 137
# Removes the column stored under `key` and returns it
# (nil when there is no such column, as with Hash#delete).
def delete(key)
  @vectors.delete(key)
end
#dup ⇒ Object
304 305 306 307 308 309 310 |
# File 'lib/rover/data_frame.rb', line 304
# Returns a new DataFrame holding the same columns, assigned one by one
# through #[]= in the current column order.
def dup
  @vectors.each_with_object(DataFrame.new) do |(k, v), copy|
    copy[k] = v
  end
end
#each_row ⇒ Object
return each row as a hash
85 86 87 88 89 |
# File 'lib/rover/data_frame.rb', line 85
# return each row as a hash
#
# Yields one Hash per row, mapping every column key to that column's value
# at the row index. Without a block an Enumerator is returned (the original
# raised LocalJumpError in that case), so callers can chain
# `each_row.map { ... }` and friends.
def each_row
  return enum_for(:each_row) { size } unless block_given?

  size.times do |i|
    yield @vectors.map { |k, v| [k, v[i]] }.to_h
  end
end
#empty? ⇒ Boolean
should this check for columns as well?
119 120 121 |
# File 'lib/rover/data_frame.rb', line 119
# True when #size is zero. Only the row count is consulted; whether columns
# exist is not checked separately.
def empty?
  size.zero?
end
#except(*keys) ⇒ Object
141 142 143 |
# File 'lib/rover/data_frame.rb', line 141
# Non-destructive counterpart of #except!: applies it to a copy made with
# #dup and returns that copy.
def except(*keys)
  copy = dup
  copy.except!(*keys)
end
#except!(*keys) ⇒ Object
145 146 147 148 149 150 |
# File 'lib/rover/data_frame.rb', line 145
# Deletes each of the given columns in place (via #delete) and returns self.
def except!(*keys)
  keys.each { |key| delete(key) }
  self
end
#first(n = nil) ⇒ Object
164 165 166 167 168 169 170 |
# File 'lib/rover/data_frame.rb', line 164
# Builds a new DataFrame whose columns are `column.first(n)` for every
# column of this frame. `n` is passed through unchanged (nil by default);
# how nil is interpreted is up to the column's own #first.
def first(n = nil)
  DataFrame.new(@vectors.transform_values { |v| v.first(n) })
end
#group(columns) ⇒ Object
293 294 295 |
# File 'lib/rover/data_frame.rb', line 293
# Returns a Group constructed from this frame and the given columns.
def group(columns)
  Group.new(self, columns)
end
#head(n = 5) ⇒ Object
156 157 158 |
# File 'lib/rover/data_frame.rb', line 156
# Delegates to #first, with `n` defaulting to 5 instead of nil.
def head(n = 5)
  first(n)
end
#include?(key) ⇒ Boolean
152 153 154 |
# File 'lib/rover/data_frame.rb', line 152
# True when a column is stored under `key`.
def include?(key)
  @vectors.key?(key)
end
#inner_join(other, on: nil) ⇒ Object
see join for options
343 344 345 |
# File 'lib/rover/data_frame.rb', line 343
# see join for options
# Delegates to join with how: "inner", forwarding `other` and `on`.
def inner_join(other, on: nil)
  join(other, how: "inner", on: on)
end
#inspect ⇒ Object Also known as: to_s
TODO handle long text better
243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 |
# File 'lib/rover/data_frame.rb', line 243
# TODO handle long text better
#
# Renders a right-aligned text preview: a header row of column names, up to
# five data rows, and a "..." row when the frame has more than five rows.
# Columns are at least 3 characters wide and separated by two spaces. When
# adding a column would push the current group of lines past 120 characters,
# a new group (header + rows) is started below, separated by an empty line.
# A frame without columns renders as "#<Rover::DataFrame>".
def inspect
  return "#<Rover::DataFrame>" if keys.empty?

  gap = 2
  shown_rows = [size, 5].min
  truncated = size > 5

  lines = []
  group_start = 0

  @vectors.each do |k, v|
    cells = v.first(5).to_a
    width = [3, ([k] + cells).map { |cell| cell.to_s.size }.max].max

    # lines[-2] is the last non-separator line of the current group; every
    # line in a group holds the same cells, so it measures the group's width
    if lines.empty? || lines[-2].map { |l| l.size + gap }.sum + width > 120
      group_start = lines.size
      lines << []                          # header
      shown_rows.times { lines << [] }     # data rows
      lines << [] if truncated             # "..." row
      lines << []                          # blank separator
    end

    lines[group_start] << format("%#{width}s", k.to_s)
    cells.each_with_index do |cell, i|
      lines[group_start + 1 + i] << format("%#{width}s", cell.to_s)
    end
    # header + five data rows precede the "..." row, hence offset 6
    lines[group_start + 6] << format("%#{width}s", "...") if truncated
  end

  # drop the separator that trails the last group
  lines.pop
  lines.map { |l| l.join(" " * gap) }.join("\n")
end
#keys ⇒ Object Also known as: names, vector_names
131 132 133 |
# File 'lib/rover/data_frame.rb', line 131
# Returns the column keys as a new Array, in column order.
def keys
  @vectors.keys
end
#last(n = nil) ⇒ Object
172 173 174 175 176 177 178 |
# File 'lib/rover/data_frame.rb', line 172
# Builds a new DataFrame whose columns are `column.last(n)` for every column
# of this frame. `n` is passed through unchanged (nil by default); how nil
# is interpreted is up to the column's own #last.
def last(n = nil)
  DataFrame.new(@vectors.transform_values { |v| v.last(n) })
end
#left_join(other, on: nil) ⇒ Object
see join for options
348 349 350 |
# File 'lib/rover/data_frame.rb', line 348
# see join for options
# Delegates to join with how: "left", forwarding `other` and `on`.
def left_join(other, on: nil)
  join(other, how: "left", on: on)
end
#merge(other) ⇒ Object
331 332 333 |
# File 'lib/rover/data_frame.rb', line 331
# Non-destructive counterpart of #merge!: applies it to a copy made with
# #dup and returns that copy.
def merge(other)
  copy = dup
  copy.merge!(other)
end
#merge!(other) ⇒ Object
335 336 337 338 339 340 |
# File 'lib/rover/data_frame.rb', line 335
# Copies every column of `other` into this frame through #[]=, so columns
# with the same key are replaced. Returns self.
def merge!(other)
  other.vectors.each { |k, v| self[k] = v }
  self
end
#one_hot(drop: false) ⇒ Object
TODO raise error when collision
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
# File 'lib/rover/data_frame.rb', line 207
# TODO raise error when collision
#
# Returns a new DataFrame in which every column whose Numo data is a
# Numo::RObject is replaced by the result of that column's own
# one_hot(drop:, prefix: "<key>_"); all other columns are copied unchanged.
#
# An ArgumentError with the message "All elements must be strings" raised
# along the way is re-raised with a message that fits the frame-level call;
# any other ArgumentError propagates untouched.
def one_hot(drop: false)
  df = DataFrame.new
  vectors.each do |k, v|
    if v.to_numo.is_a?(Numo::RObject)
      df.merge!(v.one_hot(drop: drop, prefix: "#{k}_"))
    else
      df[k] = v
    end
  end
  df
rescue ArgumentError => e
  # compare the message text, not the exception object itself
  if e.message == "All elements must be strings"
    # better error message
    raise ArgumentError, "All elements must be numeric or strings"
  end
  raise
end
#sample(*args, **kwargs) ⇒ Object
180 181 182 183 184 |
# File 'lib/rover/data_frame.rb', line 180
# Picks row indexes with Array#sample (all arguments are forwarded to it)
# from 0...size and returns the result of indexing the frame with them.
def sample(*args, **kwargs)
  # TODO make more efficient
  row_numbers = size.times.to_a
  self[row_numbers.sample(*args, **kwargs)]
end
#shape ⇒ Object
127 128 129 |
# File 'lib/rover/data_frame.rb', line 127
# Returns a two-element Array: [row count (#size), number of columns].
def shape
  row_count = size
  column_count = @vectors.size
  [row_count, column_count]
end
#size ⇒ Object Also known as: length, count
107 108 109 |
# File 'lib/rover/data_frame.rb', line 107
# Row count, taken from the size of the first column; 0 when the frame has
# no columns.
def size
  first_column = @vectors.values.first
  first_column&.size || 0
end
#sort_by(&block) ⇒ Object
289 290 291 |
# File 'lib/rover/data_frame.rb', line 289
# Non-destructive counterpart of #sort_by!: applies it, with the given
# block, to a copy made with #dup and returns that copy.
def sort_by(&block)
  copy = dup
  copy.sort_by!(&block)
end
#sort_by! ⇒ Object
277 278 279 280 281 282 283 284 285 286 287 |
# File 'lib/rover/data_frame.rb', line 277
# Reorders the rows in place and returns self.
#
# The block receives each row as a Hash (column key => value) and returns
# the sort key. The resulting row order is then applied to every column by
# indexing its Numo data with `at` and storing the result through #[]=.
def sort_by!
  order = (0...size).sort_by do |i|
    yield @vectors.map { |k, v| [k, v[i]] }.to_h
  end

  @vectors.each do |k, v|
    self[k] = v.to_numo.at(order)
  end

  self
end
#tail(n = 5) ⇒ Object
160 161 162 |
# File 'lib/rover/data_frame.rb', line 160
# Delegates to #last, with `n` defaulting to 5 instead of nil.
def tail(n = 5)
  last(n)
end
#to_a ⇒ Object
186 187 188 189 190 191 192 |
# File 'lib/rover/data_frame.rb', line 186
# Returns an Array with one entry per row, collected from #each_row.
def to_a
  rows = []
  each_row { |row| rows << row }
  rows
end
#to_csv ⇒ Object
225 226 227 228 229 230 231 232 233 234 |
# File 'lib/rover/data_frame.rb', line 225
# Serializes the frame to a CSV string: a header line with the column keys
# followed by one line per row, with values read from each column's Numo
# data. The csv library is required on first use.
def to_csv
  require "csv"

  CSV.generate do |csv|
    csv << keys
    columns = vectors.values.map(&:to_numo)
    size.times { |i| csv << columns.map { |column| column[i] } }
  end
end
#to_h ⇒ Object
194 195 196 197 198 199 200 |
# File 'lib/rover/data_frame.rb', line 194
# Returns a new Hash mapping each column key to that column's #to_a.
def to_h
  @vectors.transform_values(&:to_a)
end
#to_html ⇒ Object
for IRuby
237 238 239 240 |
# File 'lib/rover/data_frame.rb', line 237
# for IRuby
# Requires iruby on first use and returns IRuby::HTML.table applied to #to_h.
def to_html
  require "iruby"

  columns = to_h
  IRuby::HTML.table(columns)
end
#to_numo ⇒ Object
202 203 204 |
# File 'lib/rover/data_frame.rb', line 202
# Passes the Numo data of every column, in column order, to
# Numo::NArray.column_stack and returns its result.
def to_numo
  columns = vectors.values.map(&:to_numo)
  Numo::NArray.column_stack(columns)
end
#types ⇒ Object
96 97 98 |
# File 'lib/rover/data_frame.rb', line 96
# Returns a new Hash mapping each column key to that column's #type.
def types
  @vectors.transform_values(&:type)
end
#vectors ⇒ Object
dup to prevent direct modification of keys
92 93 94 |
# File 'lib/rover/data_frame.rb', line 92
# dup to prevent direct modification of keys
# Returns a shallow copy of the internal column hash: adding or removing
# keys on the result leaves the frame alone, while the column objects
# themselves are shared.
def vectors
  @vectors.dup
end