Class: Rust::DataFrame

Inherits:

RustDatatype

Object
RustDatatype
Rust::DataFrame

show all

Defined in: lib/rust/core/types/dataframe.rb

Class Method Summary collapse

Instance Method Summary collapse

#[](rows, cols = nil) ⇒ Object
#add_column(name, values = nil) ⇒ Object
#add_row(row) ⇒ Object (also: #<<)
#aggregate(by, **aggregators) ⇒ Object
#bind_columns(dataframe) ⇒ Object (also: #cbind)
#bind_columns!(dataframe) ⇒ Object (also: #cbind!)
#bind_rows(dataframe) ⇒ Object (also: #rbind)
#bind_rows!(dataframe) ⇒ Object (also: #rbind!)
#clone ⇒ Object
#column(name) ⇒ Object (also: #|)
#column_names ⇒ Object (also: #colnames)
#columns ⇒ Object
#delete_column(column) ⇒ Object
#delete_row(i) ⇒ Object
#each ⇒ Object
#each_with_index ⇒ Object
#fast_each ⇒ Object
#fast_each_with_index ⇒ Object
#fast_row(i) ⇒ Object
#has_row? ⇒ Boolean
#head(n = 10) ⇒ Object
#initialize(labels_or_data) ⇒ DataFrame constructor

Returns a new instance of DataFrame.
#inspect ⇒ Object
#load_in_r_as(variable_name) ⇒ Object
#merge(other, by, first_alias = "x", second_alias = "y") ⇒ Object
#rename_column!(old_name, new_name) ⇒ Object
#row(i) ⇒ Object
#rows ⇒ Object
#select_columns(cols = nil) ⇒ Object (also: #select_cols)
#select_rows ⇒ Object
#shuffle(*args) ⇒ Object
#sort_by(column) ⇒ Object
#sort_by!(by) ⇒ Object
#transform_column!(column) ⇒ Object
#uniq_by(by) ⇒ Object
#uniq_by!(by) ⇒ Object

Methods inherited from RustDatatype

#r_hash, #r_mirror, #r_mirror_to

Constructor Details

#initialize(labels_or_data) ⇒ `DataFrame`

Returns a new instance of DataFrame.

# File 'lib/rust/core/types/dataframe.rb', line 22

# Builds a data frame from a list of column labels or from a hash of columns.
#
# @param labels_or_data [Array, Hash] an Array of labels (every column starts
#   empty) or a Hash mapping each label to its values (each values object is
#   cloned). Labels are stored as strings in both cases.
# @raise [TypeError] if the argument is neither an Array nor a Hash
def initialize(labels_or_data)
    @data = {}

    case labels_or_data
    when Array
        @labels = labels_or_data.map { |l| l.to_s }
        @labels.each { |label| @data[label] = [] }
    when Hash
        @labels = labels_or_data.keys.map { |l| l.to_s }

        labels_or_data.each do |key, value|
            @data[key.to_s] = value.clone
        end
    else
        # Previously @labels was left nil here, so the failure only surfaced
        # later as an unrelated NoMethodError.
        raise TypeError, "Expected an Array of labels or a Hash of columns"
    end
end

Class Method Details

.can_pull?(type, klass) ⇒ `Boolean`

Returns:

(Boolean)



5
6
7

# File 'lib/rust/core/types/dataframe.rb', line 5

# Tells whether a value described by +klass+ can be pulled as a DataFrame.
#
# @param type not used by this implementation
# @param klass [Object] a class name or a (possibly nested) list of class
#   names; presumably the R class attribute of the variable -- confirm with caller
# @return [Boolean] true when "data.frame" is one of the names
def self.can_pull?(type, klass)
    class_names = [klass].flatten
    class_names.include?("data.frame")
end

.pull_priority ⇒ `Object`



9
10
11

# File 'lib/rust/core/types/dataframe.rb', line 9

# Priority of this datatype when pulling values.
# NOTE(review): how the priority is compared with other datatypes is not
# visible here -- confirm against RustDatatype.
#
# @return [Integer] always 1
def self.pull_priority
    return 1
end

.pull_variable(variable, type, klass) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 13

# Builds a DataFrame by querying Rust for the column names of +variable+ and
# then for each column individually.
#
# @param variable [String] name interpolated into the queries sent to Rust
# @param type not used by this implementation
# @param klass not used by this implementation
# @return [DataFrame] a frame with one column per returned column name
def self.pull_variable(variable, type, klass)
    columns = {}
    Rust["colnames(#{variable})"].each do |name|
        columns[name] = Rust["#{variable}$\"#{name}\""]
    end
    DataFrame.new(columns)
end

Instance Method Details

#[](rows, cols = nil) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 67

# Selects a subset of rows and/or columns.
#
# @param rows [Range, Array, nil] row indices to keep; any other value is
#   ignored and all rows are kept
# @param cols [Array, nil] column names to keep (converted to strings); any
#   other value is ignored and all columns are kept
# @return [Object] the result of select_rows/select_columns, or self when no
#   selection was applied
# @raise [RuntimeError] if both rows and cols are nil/false
def [](rows, cols=nil)
    raise "You must specify either rows or columns to select" unless rows || cols

    selection = self

    case rows
    when Range, Array
        selection = selection.select_rows { |_row, index| rows.include?(index) }
    end

    if cols.is_a?(Array)
        selection = selection.select_columns(cols.map(&:to_s))
    end

    selection
end

#add_column(name, values = nil) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 204

# Appends a new column, either from explicit values or computed per row.
#
# @param name [#to_s] label of the new column; stored as a String, like the
#   labels set up by #initialize
# @param values [#size, #clone, nil] the column values; when nil a block is
#   required
# @yield [row] called once per row when +values+ is nil; the row hash already
#   contains the new column (nil for the row being computed); the block's
#   result becomes that row's value
# @raise [RuntimeError] if the column exists, if neither values nor a block
#   are given, or if the number of values differs from the number of rows
def add_column(name, values=nil)
    # A Symbol name used to slip past the duplicate check and create a
    # Symbol-keyed column that add_row (which sorts string keys) cannot handle.
    name = name.to_s

    raise "Column already exists" if @labels.include?(name)
    raise "Values or block required" if !values && !block_given?
    raise "Number of values not matching" if values && values.size != self.rows

    @labels << name
    if values
        @data[name] = values.clone
    else
        @data[name] = []
        self.each_with_index do |row, i|
            @data[name][i] = yield row
        end
    end
end

#add_row(row) ⇒ `Object` Also known as: <<

# File 'lib/rust/core/types/dataframe.rb', line 181

# Appends one row to the data frame.
#
# @param row [Array, Hash] an Array with one value per column (in label
#   order) or a Hash whose keys, converted to strings, match the column names
# @return [true]
# @raise [RuntimeError] if the array size or the hash keys do not match
# @raise [TypeError] if +row+ is neither an Array nor a Hash
def add_row(row)
    case row
    when Array
        raise "Expected an array of size #{@data.size}" unless row.size == @data.size

        @labels.each_with_index { |label, position| @data[label] << row[position] }
    when Hash
        given_keys = row.keys.map(&:to_s).sort
        raise "Expected a hash with the following keys: #{@data.keys}" unless given_keys == @data.keys.sort

        row.each_pair { |key, value| @data[key.to_s] << value }
    else
        raise TypeError, "Expected an Array or a Hash"
    end

    true
end

#aggregate(by, **aggregators) ⇒ `Object`

Raises:

(TypeError)

# File 'lib/rust/core/types/dataframe.rb', line 379

# Groups the rows by the value of column +by+ and reduces every other column
# of each group to a single value.
#
# @param by [String] name of the grouping column
# @param aggregators [Hash{Symbol=>Proc}] per-column reducers; each receives
#   the group's values for that column
# @yield [values] default reducer for columns without a dedicated aggregator
# @return [Rust::DataFrame] one row per distinct value of +by+, in the order
#   produced by sort_by
# @raise [TypeError] if +by+ is not a String or an aggregator is not a Proc
# @raise [RuntimeError] if no block is given
def aggregate(by, **aggregators)
    raise TypeError, "Expected a string" unless by.is_a?(String)
    raise TypeError, "All the aggregators should be procs" unless aggregators.values.all? { |v| v.is_a?(Proc) }
    raise "Expected a block for default aggregator" unless block_given?

    aggregators = aggregators.map { |label, callable| [label.to_s, callable] }.to_h

    names = self.column_names
    sorted = self.sort_by(by)

    # Equal keys are adjacent after sorting, so a new group starts whenever
    # the key changes.
    groups = []
    group = nil
    current_value = nil
    sorted.column(by).each_with_index do |value, index|
        # `group.nil?` covers the first row even when its key is nil; the old
        # `current_value != value` test alone never opened a group in that case.
        if group.nil? || current_value != value
            current_value = value
            group = Rust::DataFrame.new(names)
            groups << group
        end
        group << sorted.fast_row(index)
    end
    # With no rows, `groups` stays empty (the old code pushed a nil group here
    # and crashed while aggregating it).

    result = Rust::DataFrame.new(names)
    groups.each do |members|
        aggregated_row = {}
        aggregated_row[by] = members.column(by)[0]
        (names - [by]).each do |column|
            values = members.column(column)
            aggregated_row[column] = aggregators[column] ? aggregators[column].call(values) : yield(values)
        end

        result << aggregated_row
    end

    result
end

#bind_columns(dataframe) ⇒ `Object` Also known as: cbind

# File 'lib/rust/core/types/dataframe.rb', line 482

# Non-destructive variant of bind_columns!: works on a clone of this frame.
#
# @param dataframe the frame whose columns are appended
# @return [Object] the clone, after bind_columns! was applied to it
def bind_columns(dataframe)
    self.clone.tap { |copy| copy.bind_columns!(dataframe) }
end

#bind_columns!(dataframe) ⇒ `Object` Also known as: cbind!

Raises:

(TypeError)

# File 'lib/rust/core/types/dataframe.rb', line 462

# Appends every column of +dataframe+ to this frame, in place.
#
# @param dataframe [DataFrame] frame with the same number of rows and no
#   column name in common with this one
# @return [true]
# @raise [TypeError] if +dataframe+ is not a DataFrame
# @raise [RuntimeError] if the row counts differ or a column name is shared
def bind_columns!(dataframe)
    raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
    raise "The number of rows are not compatible" unless self.rows == dataframe.rows

    shared = self.column_names & dataframe.column_names
    raise "The dataset would override some columns" unless shared.empty?

    dataframe.column_names.each { |label| self.add_column(label, dataframe.column(label)) }

    true
end

#bind_rows(dataframe) ⇒ `Object` Also known as: rbind

# File 'lib/rust/core/types/dataframe.rb', line 475

# Non-destructive variant of bind_rows!: works on a clone of this frame.
#
# @param dataframe the frame whose rows are appended
# @return [Object] the clone, after bind_rows! was applied to it
def bind_rows(dataframe)
    self.clone.tap { |copy| copy.bind_rows!(dataframe) }
end

#bind_rows!(dataframe) ⇒ `Object` Also known as: rbind!

Raises:

(TypeError)

# File 'lib/rust/core/types/dataframe.rb', line 450

# Appends every row of +dataframe+ to this frame, in place.
#
# @param dataframe [DataFrame] frame with exactly the same column names
#   (order does not matter)
# @return [true]
# @raise [TypeError] if +dataframe+ is not a DataFrame
# @raise [RuntimeError] if either frame has a column the other lacks
def bind_rows!(dataframe)
    raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)

    missing = self.column_names - dataframe.column_names
    extra   = dataframe.column_names - self.column_names
    # The previous check only verified that none of our columns was missing,
    # so a frame with extra columns passed here although the message reports
    # both directions.
    raise "The columns are not compatible: #{missing} - #{extra}" unless missing.empty? && extra.empty?

    dataframe.each do |row|
        self << row
    end

    true
end

#clone ⇒ `Object`



489
490
491

# File 'lib/rust/core/types/dataframe.rb', line 489

# Returns a new DataFrame built from this frame's columns.
#
# The columns are handed over in @labels order. @data's own key order can
# differ from @labels (rename_column! re-inserts the renamed key at the end
# of @data), and building the copy straight from @data silently reordered
# the columns of the copy.
#
# @return [DataFrame] a new frame with the same columns, in label order
def clone
    ordered = @labels.map { |label| label.to_s }.select { |label| @data.key?(label) }
    # Keep any column that is stored but not listed in @labels, as the
    # previous implementation (which copied @data wholesale) did.
    leftovers = @data.keys - ordered

    DataFrame.new((ordered + leftovers).map { |label| [label, @data[label]] }.to_h)
end

#column(name) ⇒ `Object` Also known as: |



82
83
84

# File 'lib/rust/core/types/dataframe.rb', line 82

# Looks up the values stored for a column.
#
# @param name the key under which the column is stored in @data
# @return [Object, nil] the stored values object itself (not a copy), or nil
#   when there is no such key
def column(name)
    @data[name]
end

#column_names ⇒ `Object` Also known as: colnames



168
169
170

# File 'lib/rust/core/types/dataframe.rb', line 168

# Lists the column labels as strings.
#
# @return [Array<String>] a new array holding each label converted with to_s
def column_names
    @labels.map(&:to_s)
end

#columns ⇒ `Object`



177
178
179

# File 'lib/rust/core/types/dataframe.rb', line 177

# Counts the columns.
#
# @return [Integer] the number of labels
def columns
    @labels.length
end

#delete_column(column) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 129

# Removes a column from both the label list and the stored data.
#
# @param column the label/key to remove (no conversion is applied)
# @return [Object, nil] the values that were stored under the key, or nil
def delete_column(column)
    @labels.delete(column)
    removed_values = @data.delete(column)
    removed_values
end

#delete_row(i) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 134

# Removes the element at index +i+ from every stored column.
#
# @param i [Integer] index passed to delete_at on each column
# @return [Hash] the @data hash
def delete_row(i)
    @data.each_value { |values| values.delete_at(i) }
end

#each ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 220

# Iterates over the rows, dropping the index supplied by each_with_index.
#
# @yield [row] each element yielded by each_with_index
# @return [self]
def each
    self.each_with_index { |row, _index| yield row }

    self
end

#each_with_index ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 236

# Iterates over the rows, yielding each one as a hash together with its index.
#
# @yield [row, i] a fresh Hash mapping every label to that row's value, and
#   the zero-based row index
# @return [self]
def each_with_index
    self.rows.times do |index|
        row = @labels.map { |label| [label, @data[label][index]] }.to_h

        yield row, index
    end

    self
end

#fast_each ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 228

# Iterates over the rows as yielded by fast_each_with_index, dropping the index.
#
# @yield [row] each element yielded by fast_each_with_index
# @return [self]
def fast_each
    self.fast_each_with_index { |row, _index| yield row }

    self
end

#fast_each_with_index ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 249

# Iterates over the rows, yielding each one as an array together with its index.
#
# @yield [row, i] a fresh Array with the row's values in label order, and the
#   zero-based row index
# @return [self]
def fast_each_with_index
    self.rows.times do |index|
        values = @labels.map { |label| @data[label][index] }

        yield values, index
    end

    self
end

#fast_row(i) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 45

# Returns one row as an array of values in label order.
#
# @param i [Integer] zero-based row index
# @return [Array, nil] the row's values, or nil when +i+ is negative or not
#   below the number of rows
def fast_row(i)
    return nil if i < 0 || i >= self.rows

    @labels.map { |label| @data[label][i] }
end

#has_row? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/rust/core/types/dataframe.rb', line 107

# Tells whether the block is satisfied by at least one row.
#
# @yield [row, i] each row with its index, as produced by each_with_index
# @return [Boolean] true as soon as the block returns a truthy value (later
#   rows are not visited), false when it never does
def has_row?
    self.each_with_index do |row, index|
        matched = yield row, index
        return true if matched
    end

    false
end

#head(n = 10) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 305

# Returns a new DataFrame holding the first +n+ rows.
#
# @param n [Integer] maximum number of rows to copy (default 10)
# @return [DataFrame] a new frame with the same column names and at most
#   +n+ rows
def head(n=10)
    result = DataFrame.new(self.column_names)
    self.each_with_index do |row, i|
        # Rows come in index order, so nothing after the n-th can qualify;
        # stop instead of building a hash for every remaining row.
        break if i >= n

        result << row
    end
    return result
end

#inspect ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 284

def inspect
    separator = " | "
    col_widths = self.column_names.map { |colname| [colname, ([colname.length] + @data[colname].map { |e| e.inspect.length }).max] }.to_h
    col_widths[:rowscol] = (self.rows - 1).inspect.length + 3
    
    result = ""
    result << "-" * (col_widths.values.sum + ((col_widths.size - 1) * separator.length)) + "\n"
    result << (" " * col_widths[:rowscol]) + self.column_names.map { |colname| (" " * (col_widths[colname] - colname.length)) + colname }.join(separator) + "\n"
    result << "-" * (col_widths.values.sum + ((col_widths.size - 1) * separator.length)) + "\n"
    self.each_with_index do |row, i|
        index_part = "[" + (" " * (col_widths[:rowscol] - i.inspect.length - 3)) + "#{i}] "
        row_part   = row.map { |colname, value| (" " * (col_widths[colname] - value.inspect.length)) + value.inspect }.join(separator)
        
        result << index_part + row_part + "\n"
    end
    
    result << "-" * (col_widths.values.sum + ((col_widths.size - 1) * separator.length))
    
    return result
end

#load_in_r_as(variable_name) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 262

# Builds a list of statements that recreate this data frame under
# +variable_name+ and hands them to Rust._eval_big.
#
# The statements are: an empty data.frame() assignment, one row assignment
# per row (row indices start at 1), and, for every column that is a Factor,
# a factor(...) conversion using the column's levels as labels.
#
# @param variable_name [String] name interpolated into every statement
# @return whatever Rust._eval_big returns
def load_in_r_as(variable_name)
    statements = ["#{variable_name} <- data.frame()"]

    r_index = 1
    self.each do |row|
        target = "#{variable_name}[#{r_index.to_R}, #{row.keys.to_R}]"
        statements << "#{target} <- #{row.values.to_R}"

        r_index += 1
    end

    self.column_names.each do |name|
        values = self.column(name)
        next unless values.is_a?(Factor)

        cell = "#{variable_name}[,#{name.to_R}]"
        statements << "#{cell} <- factor(#{cell}, labels=#{values.levels.to_R})"
    end

    Rust._eval_big(statements)
end

#merge(other, by, first_alias = "x", second_alias = "y") ⇒ `Object`

Raises:

(TypeError)

# File 'lib/rust/core/types/dataframe.rb', line 313

# Joins this frame with +other+ on the columns listed in +by+.
#
# For every row of +other+ whose key (its values for the +by+ columns) also
# occurs in this frame, one output row is produced holding the key columns,
# this frame's remaining columns and +other+'s remaining columns. When a key
# occurs several times in this frame, the last occurrence is used. Rows are
# emitted in +other+'s order.
#
# @param other [DataFrame] the frame to join with
# @param by [Array<String>] names of the key columns, present in both frames
# @param first_alias [String] prefix ("<alias>.") for this frame's non-key
#   columns; an empty string means no prefix
# @param second_alias [String] prefix for +other+'s non-key columns
# @return [DataFrame] the joined frame
# @raise [TypeError] if +other+ is not a DataFrame or +by+ is not an Array of Strings
# @raise [RuntimeError] if a key column is missing, if both aliases are equal
#   and non-empty, or if both are empty and non-key column names overlap
def merge(other, by, first_alias = "x", second_alias = "y")
    raise TypeError, "Expected Rust::DataFrame" unless other.is_a?(DataFrame)
    raise TypeError, "Expected list of strings" if !by.is_a?(Array) || !by.all? { |e| e.is_a?(String) }
    raise "This dataset should have all the columns in #{by}" unless (by & self.column_names).size == by.size
    raise "The passed dataset should have all the columns in #{by}" unless (by & other.column_names).size == by.size

    own_columns     = self.column_names - by
    foreign_columns = other.column_names - by

    if first_alias == second_alias
        raise "The aliases can not have the same value" unless first_alias == ""

        intersection = own_columns & foreign_columns
        raise "Cannot merge because the following columns would overlap: #{intersection}" if intersection.size > 0
    end

    # Index of the (last) row of this frame for each key.
    row_index_by_key = {}
    self.each_with_index do |row, index|
        row_index_by_key[by.map { |colname| row[colname] }] = index
    end

    own_prefix     = first_alias.length > 0 ? first_alias + "." : first_alias
    foreign_prefix = second_alias.length > 0 ? second_alias + "." : second_alias

    own_targets     = own_columns.map { |colname| "#{own_prefix}#{colname}" }
    foreign_targets = foreign_columns.map { |colname| "#{foreign_prefix}#{colname}" }

    result = DataFrame.new(by + (own_targets + foreign_targets))
    other.each do |other_row|
        match = row_index_by_key[by.map { |colname| other_row[colname] }]
        next unless match

        own_row = self.row(match)

        merged = {}
        by.each { |colname| merged[colname] = own_row[colname] }
        own_columns.each_with_index { |colname, k| merged[own_targets[k]] = own_row[colname] }
        foreign_columns.each_with_index { |colname, k| merged[foreign_targets[k]] = other_row[colname] }

        result << merged
    end

    result
end

#rename_column!(old_name, new_name) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 87

# Renames a column in place, keeping its position.
#
# @param old_name [#to_s] current name of the column
# @param new_name [#to_s] new name; stored as a String
# @return [String] the new label
# @raise [RuntimeError] if there is no column +old_name+ or a column
#   +new_name+ already exists
def rename_column!(old_name, new_name)
    position = @labels.index(old_name) || @labels.index(old_name.to_s)
    raise "This DataFrame does not contain a column named #{old_name}" unless position

    # The data key was already stored as a String while the label kept the raw
    # object, so renaming to a Symbol left the frame with a label that did
    # not match any stored column.
    new_label = new_name.to_s
    raise "This DataFrame already contains a column named #{new_name}" if @labels.include?(new_label) || @labels.include?(new_name)

    old_label = @labels[position]

    # Rename the key where it stands instead of delete + insert, which moved
    # the column to the end of @data and out of step with @labels.
    renamed = {}
    @data.each { |label, values| renamed[label == old_label ? new_label : label] = values }
    @data = renamed

    @labels[position] = new_label
end

#row(i) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 37

# Returns one row as a hash.
#
# @param i [Integer] zero-based row index
# @return [Hash, nil] a new Hash mapping every key of @data to its i-th
#   value, or nil when +i+ is negative or not below the number of rows
def row(i)
    return nil if i < 0 || i >= self.rows

    @data.transform_values { |values| values[i] }
end

#rows ⇒ `Object`



173
174
175

# File 'lib/rust/core/types/dataframe.rb', line 173

# Counts the rows, measured as the size of the first stored column.
#
# @return [Integer] the number of rows; 0 when the frame has no columns
#   (previously this raised NoMethodError on nil)
def rows
    first_column = @data.values[0]
    first_column ? first_column.size : 0
end

#select_columns(cols = nil) ⇒ `Object` Also known as: select_cols

# File 'lib/rust/core/types/dataframe.rb', line 114

# Returns a clone of this frame keeping only some of its columns.
#
# @param cols [#include?, nil] the labels to keep; when nil the block decides
# @yield [label] called for each label when +cols+ is nil; a truthy result
#   keeps the column
# @return [Object] the clone, with every other column removed through
#   delete_column
# @raise [RuntimeError] if neither +cols+ nor a block is given
def select_columns(cols=nil)
    raise "You must specify either the columns you want to select or a selection block" if !cols && !block_given?

    result = self.clone
    discarded = @labels.reject { |label| cols ? cols.include?(label) : yield(label) }
    discarded.each { |label| result.delete_column(label) }
    result
end

#select_rows ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 99

# Returns a new DataFrame holding the rows for which the block is truthy.
#
# @yield [row, i] each row with its index, as produced by each_with_index
# @return [DataFrame] a new frame with the same column names
def select_rows
    selected = DataFrame.new(self.column_names)
    self.each_with_index do |row, index|
        next unless yield(row, index)

        selected << row
    end
    selected
end

#shuffle(*args) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 53

# Returns a new DataFrame with the same rows in random order.
#
# @param args positional arguments forwarded to Array#shuffle!
# @param options keyword arguments forwarded to Array#shuffle! (for example
#   +random:+). They are captured separately because, since Ruby 3.0, a hash
#   collected by +*args+ is no longer passed on as keywords.
# @return [DataFrame] a new frame with the same labels and the shuffled rows
def shuffle(*args, **options)
    result = DataFrame.new(@labels)

    buffer = []
    self.each do |row|
        buffer << row
    end
    buffer.shuffle!(*args, **options).each do |row|
        result << row
    end

    return result
end

#sort_by(column) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 419

# Non-destructive variant of sort_by!: works on a clone of this frame.
#
# @param column the column to sort by, passed on to sort_by!
# @return [Object] the clone, after sort_by! was applied to it
def sort_by(column)
    self.clone.tap { |copy| copy.sort_by!(column) }
end

#sort_by!(by) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 425

# Sorts the rows in place by the values of column +by+ (ascending, using the
# values' own ordering). Rows with equal keys keep their relative order.
#
# The row permutation is computed once and applied to every column. The
# previous implementation located each row's target slot with a linear
# search in a sorted copy (quadratic in the number of rows) and marked used
# slots by overwriting them with the NilClass constant.
#
# @param by the key of the column to sort by
# @return [Array] the sorted +by+ column (the same Array object as before,
#   reordered in place)
def sort_by!(by)
    keys = @data[by]

    # Pairing each key with its index breaks ties by original position,
    # which keeps the sort stable.
    order = (0...keys.size).sort_by { |i| [keys[i], i] }

    @data.each_key do |name|
        next if name == by

        values = @data[name]
        @data[name] = order.map { |i| values[i] }
    end

    keys.replace(order.map { |i| keys[i] })
end

#transform_column!(column) ⇒ `Object`



95
96
97

# File 'lib/rust/core/types/dataframe.rb', line 95

# Replaces, in place, every value of a column with the block's result.
#
# @param column the key of the column in @data
# @yield [value] each current value; the result is stored in its place
# @return [Array] the transformed column (the stored Array itself)
def transform_column!(column)
    values = @data[column]
    values.map! { |value| yield value }
end

#uniq_by(by) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 140

# Non-destructive variant of uniq_by!: works on a clone of this frame.
#
# @param by the key columns, passed on to uniq_by!
# @return [Object] the clone, after uniq_by! was applied to it
def uniq_by(by)
    self.clone.tap { |copy| copy.uniq_by!(by) }
end

#uniq_by!(by) ⇒ `Object`

# File 'lib/rust/core/types/dataframe.rb', line 146

# Removes, in place, every row whose values for the +by+ columns were already
# seen in an earlier row (the first occurrence is kept).
#
# @param by [Array, Object] the key column names; a single name is also
#   accepted (previously only an Array worked, unlike sort_by and aggregate,
#   which take a single column name)
# @return [self]
def uniq_by!(by)
    key_columns = by.is_a?(Array) ? by : [by]

    seen = {}
    to_delete = []
    self.each_with_index do |row, i|
        key = key_columns.map { |colname| row[colname] }
        if seen.key?(key)
            # Every earlier deletion shifts the following rows up by one, so
            # store the index the row will have when its turn comes.
            to_delete << (i - to_delete.size)
        else
            seen[key] = true
        end
    end

    to_delete.each do |i|
        self.delete_row(i)
    end

    return self
end

Class: Rust::DataFrame

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from RustDatatype

Constructor Details

#initialize(labels_or_data) ⇒ DataFrame

Class Method Details

.can_pull?(type, klass) ⇒ Boolean

.pull_priority ⇒ Object

.pull_variable(variable, type, klass) ⇒ Object

Instance Method Details

#[](rows, cols = nil) ⇒ Object

#add_column(name, values = nil) ⇒ Object

#add_row(row) ⇒ Object Also known as: <<

#aggregate(by, **aggregators) ⇒ Object

#bind_columns(dataframe) ⇒ Object Also known as: cbind

#bind_columns!(dataframe) ⇒ Object Also known as: cbind!

#bind_rows(dataframe) ⇒ Object Also known as: rbind

#bind_rows!(dataframe) ⇒ Object Also known as: rbind!

#clone ⇒ Object

#column(name) ⇒ Object Also known as: |

#column_names ⇒ Object Also known as: colnames

#columns ⇒ Object

#delete_column(column) ⇒ Object

#delete_row(i) ⇒ Object

#each ⇒ Object

#each_with_index ⇒ Object

#fast_each ⇒ Object

#fast_each_with_index ⇒ Object

#fast_row(i) ⇒ Object

#has_row? ⇒ Boolean

#head(n = 10) ⇒ Object

#inspect ⇒ Object

#load_in_r_as(variable_name) ⇒ Object

#merge(other, by, first_alias = "x", second_alias = "y") ⇒ Object

#rename_column!(old_name, new_name) ⇒ Object

#row(i) ⇒ Object

#rows ⇒ Object

#select_columns(cols = nil) ⇒ Object Also known as: select_cols

#select_rows ⇒ Object

#shuffle(*args) ⇒ Object

#sort_by(column) ⇒ Object

#sort_by!(by) ⇒ Object

#transform_column!(column) ⇒ Object

#uniq_by(by) ⇒ Object

#uniq_by!(by) ⇒ Object

#initialize(labels_or_data) ⇒ `DataFrame`

.can_pull?(type, klass) ⇒ `Boolean`

.pull_priority ⇒ `Object`

.pull_variable(variable, type, klass) ⇒ `Object`

#[](rows, cols = nil) ⇒ `Object`

#add_column(name, values = nil) ⇒ `Object`

#add_row(row) ⇒ `Object` Also known as: <<

#aggregate(by, **aggregators) ⇒ `Object`

#bind_columns(dataframe) ⇒ `Object` Also known as: cbind

#bind_columns!(dataframe) ⇒ `Object` Also known as: cbind!

#bind_rows(dataframe) ⇒ `Object` Also known as: rbind

#bind_rows!(dataframe) ⇒ `Object` Also known as: rbind!

#clone ⇒ `Object`

#column(name) ⇒ `Object` Also known as: |

#column_names ⇒ `Object` Also known as: colnames

#columns ⇒ `Object`

#delete_column(column) ⇒ `Object`

#delete_row(i) ⇒ `Object`

#each ⇒ `Object`

#each_with_index ⇒ `Object`

#fast_each ⇒ `Object`

#fast_each_with_index ⇒ `Object`

#fast_row(i) ⇒ `Object`

#has_row? ⇒ `Boolean`

#head(n = 10) ⇒ `Object`

#inspect ⇒ `Object`

#load_in_r_as(variable_name) ⇒ `Object`

#merge(other, by, first_alias = "x", second_alias = "y") ⇒ `Object`

#rename_column!(old_name, new_name) ⇒ `Object`

#row(i) ⇒ `Object`

#rows ⇒ `Object`

#select_columns(cols = nil) ⇒ `Object` Also known as: select_cols

#select_rows ⇒ `Object`

#shuffle(*args) ⇒ `Object`

#sort_by(column) ⇒ `Object`

#sort_by!(by) ⇒ `Object`

#transform_column!(column) ⇒ `Object`

#uniq_by(by) ⇒ `Object`

#uniq_by!(by) ⇒ `Object`