Class: Polars::DataFrame

Inherits:
Object
  • Object
show all
Includes:
Plot
Defined in:
lib/polars/data_frame.rb

Overview

Two-dimensional data structure representing data as a table with rows and columns.

Instance Method Summary collapse

Constructor Details

#initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false) ⇒ DataFrame

Create a new DataFrame.



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/polars/data_frame.rb', line 21

def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
  schema ||= columns

  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
    raise ArgumentError, "Use read_database instead"
  end

  if data.nil?
    self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
  elsif data.is_a?(Hash)
    data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
    self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
  elsif data.is_a?(::Array)
    self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
  elsif data.is_a?(Series)
    self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
  else
    raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
  end
end

Instance Method Details

#!=(other) ⇒ DataFrame

Not equal.



447
448
449
# File 'lib/polars/data_frame.rb', line 447

def !=(other)
  _comp(other, "neq")
end

#%(other) ⇒ DataFrame

Returns the modulo.



530
531
532
533
534
535
536
537
# File 'lib/polars/data_frame.rb', line 530

def %(other)
  if other.is_a?(DataFrame)
    return _from_rbdf(_df.rem_df(other._df))
  end

  other = _prepare_other_arg(other)
  _from_rbdf(_df.rem(other._s))
end

#*(other) ⇒ DataFrame

Performs multiplication.



482
483
484
485
486
487
488
489
# File 'lib/polars/data_frame.rb', line 482

def *(other)
  if other.is_a?(DataFrame)
    return _from_rbdf(_df.mul_df(other._df))
  end

  other = _prepare_other_arg(other)
  _from_rbdf(_df.mul(other._s))
end

#+(other) ⇒ DataFrame

Performs addition.



506
507
508
509
510
511
512
513
# File 'lib/polars/data_frame.rb', line 506

def +(other)
  if other.is_a?(DataFrame)
    return _from_rbdf(_df.add_df(other._df))
  end

  other = _prepare_other_arg(other)
  _from_rbdf(_df.add(other._s))
end

#-(other) ⇒ DataFrame

Performs subtraction.



518
519
520
521
522
523
524
525
# File 'lib/polars/data_frame.rb', line 518

def -(other)
  if other.is_a?(DataFrame)
    return _from_rbdf(_df.sub_df(other._df))
  end

  other = _prepare_other_arg(other)
  _from_rbdf(_df.sub(other._s))
end

#/(other) ⇒ DataFrame

Performs division.



494
495
496
497
498
499
500
501
# File 'lib/polars/data_frame.rb', line 494

def /(other)
  if other.is_a?(DataFrame)
    return _from_rbdf(_df.div_df(other._df))
  end

  other = _prepare_other_arg(other)
  _from_rbdf(_df.div(other._s))
end

#<(other) ⇒ DataFrame

Less than.



461
462
463
# File 'lib/polars/data_frame.rb', line 461

def <(other)
  _comp(other, "lt")
end

#<=(other) ⇒ DataFrame

Less than or equal.



475
476
477
# File 'lib/polars/data_frame.rb', line 475

def <=(other)
  _comp(other, "lt_eq")
end

#==(other) ⇒ DataFrame

Equal.



440
441
442
# File 'lib/polars/data_frame.rb', line 440

def ==(other)
  _comp(other, "eq")
end

#>(other) ⇒ DataFrame

Greater than.



454
455
456
# File 'lib/polars/data_frame.rb', line 454

def >(other)
  _comp(other, "gt")
end

#>=(other) ⇒ DataFrame

Greater than or equal.



468
469
470
# File 'lib/polars/data_frame.rb', line 468

def >=(other)
  _comp(other, "gt_eq")
end

#[](*args) ⇒ Object

Returns subset of the DataFrame.

Raises:

  • (ArgumentError)


571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
# File 'lib/polars/data_frame.rb', line 571

def [](*args)
  if args.size == 2
    row_selection, col_selection = args

    # df[.., unknown]
    if row_selection.is_a?(Range)

      # multiple slices
      # df[.., ..]
      if col_selection.is_a?(Range)
        raise Todo
      end
    end

    # df[2, ..] (select row as df)
    if row_selection.is_a?(Integer)
      if col_selection.is_a?(::Array)
        df = self[0.., col_selection]
        return df.slice(row_selection, 1)
      end
      # df[2, "a"]
      if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
        return self[col_selection][row_selection]
      end
    end

    # column selection can be "a" and ["a", "b"]
    if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
      col_selection = [col_selection]
    end

    # df[.., 1]
    if col_selection.is_a?(Integer)
      series = to_series(col_selection)
      return series[row_selection]
    end

    if col_selection.is_a?(::Array)
      # df[.., [1, 2]]
      if Utils.is_int_sequence(col_selection)
        series_list = col_selection.map { |i| to_series(i) }
        df = self.class.new(series_list)
        return df[row_selection]
      end
    end

    df = self[col_selection]
    return df[row_selection]
  elsif args.size == 1
    item = args[0]

    # select single column
    # df["foo"]
    if item.is_a?(::String) || item.is_a?(Symbol)
      return Utils.wrap_s(_df.column(item.to_s))
    end

    # df[idx]
    if item.is_a?(Integer)
      return slice(_pos_idx(item, 0), 1)
    end

    # df[..]
    if item.is_a?(Range)
      return Slice.new(self).apply(item)
    end

    if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
      # select multiple columns
      # df[["foo", "bar"]]
      return _from_rbdf(_df.select(item.map(&:to_s)))
    end

    if Utils.is_int_sequence(item)
      item = Series.new("", item)
    end

    if item.is_a?(Series)
      dtype = item.dtype
      if dtype == String
        return _from_rbdf(_df.select(item))
      elsif dtype == UInt32
        return _from_rbdf(_df.take_with_series(item._s))
      elsif [UInt8, UInt16, UInt64, Int8, Int16, Int32, Int64].include?(dtype)
        return _from_rbdf(
          _df.take_with_series(_pos_idxs(item, 0)._s)
        )
      end
    end
  end

  # Ruby-specific
  if item.is_a?(Expr) || item.is_a?(Series)
    return filter(item)
  end

  raise ArgumentError, "Cannot get item of type: #{item.class.name}"
end

#[]=(*key, value) ⇒ Object

Set item.



673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
# File 'lib/polars/data_frame.rb', line 673

def []=(*key, value)
  if key.length == 1
    key = key.first
  elsif key.length != 2
    raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
  end

  if Utils.strlike?(key)
    if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
      value = Series.new(value)
    elsif !value.is_a?(Series)
      value = Polars.lit(value)
    end
    self._df = with_column(value.alias(key.to_s))._df
  elsif key.is_a?(::Array)
    row_selection, col_selection = key

    if Utils.strlike?(col_selection)
      s = self[col_selection]
    elsif col_selection.is_a?(Integer)
      raise Todo
    else
      raise ArgumentError, "column selection not understood: #{col_selection}"
    end

    s[row_selection] = value

    if col_selection.is_a?(Integer)
      replace_column(col_selection, s)
    elsif Utils.strlike?(col_selection)
      replace(col_selection, s)
    end
  else
    raise Todo
  end
end

#apply(return_dtype: nil, inference_size: 256, &f) ⇒ Object

Note:

The frame-level apply cannot track column names (as the UDF is a black-box that may arbitrarily drop, rearrange, transform, or add new columns); if you want to apply a UDF such that column names are preserved, you should use the expression-level apply syntax instead.

Apply a custom/user-defined function (UDF) over the rows of the DataFrame.

The UDF will receive each row as a tuple of values: udf(row).

Implementing logic using a Ruby function is almost always significantly slower and more memory intensive than implementing the same logic using the native expression API because:

  • The native expression engine runs in Rust; UDFs run in Ruby.
  • Use of Ruby UDFs forces the DataFrame to be materialized in memory.
  • Polars-native expressions can be parallelised (UDFs cannot).
  • Polars-native expressions can be logically optimised (UDFs cannot).

Wherever possible you should strongly prefer the native expression API to achieve the best performance.

Examples:

df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})

Return a DataFrame by mapping each row to a tuple:

df.apply { |t| [t[0] * 2, t[1] * 3] }
# =>
# shape: (3, 2)
# ┌──────────┬──────────┐
# │ column_0 ┆ column_1 │
# │ ---      ┆ ---      │
# │ i64      ┆ i64      │
# ╞══════════╪══════════╡
# │ 2        ┆ -3       │
# │ 4        ┆ 15       │
# │ 6        ┆ 24       │
# └──────────┴──────────┘

Return a Series by mapping each row to a scalar:

df.apply { |t| t[0] * 2 + t[1] }
# =>
# shape: (3, 1)
# ┌───────┐
# │ apply │
# │ ---   │
# │ i64   │
# ╞═══════╡
# │ 1     │
# │ 9     │
# │ 14    │
# └───────┘


2591
2592
2593
2594
2595
2596
2597
2598
# File 'lib/polars/data_frame.rb', line 2591

def apply(return_dtype: nil, inference_size: 256, &f)
  out, is_df = _df.apply(f, return_dtype, inference_size)
  if is_df
    _from_rbdf(out)
  else
    _from_rbdf(Utils.wrap_s(out).to_frame._df)
  end
end

#clearedDataFrame

Create an empty copy of the current DataFrame.

Returns a DataFrame with identical schema but no data.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [nil, 2, 3, 4],
    "b" => [0.5, nil, 2.5, 13],
    "c" => [true, true, false, nil]
  }
)
df.cleared
# =>
# shape: (0, 3)
# ┌─────┬─────┬──────┐
# │ a   ┆ b   ┆ c    │
# │ --- ┆ --- ┆ ---  │
# │ i64 ┆ f64 ┆ bool │
# ╞═════╪═════╪══════╡
# └─────┴─────┴──────┘


2879
2880
2881
# File 'lib/polars/data_frame.rb', line 2879

def cleared
  height > 0 ? head(0) : clone
end

#columnsArray

Get column names.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.columns
# => ["foo", "bar", "ham"]


364
365
366
# File 'lib/polars/data_frame.rb', line 364

def columns
  _df.columns
end

#columns=(columns) ⇒ Object

Change the column names of the DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.columns = ["apple", "banana", "orange"]
df
# =>
# shape: (3, 3)
# ┌───────┬────────┬────────┐
# │ apple ┆ banana ┆ orange │
# │ ---   ┆ ---    ┆ ---    │
# │ i64   ┆ i64    ┆ str    │
# ╞═══════╪════════╪════════╡
# │ 1     ┆ 6      ┆ a      │
# │ 2     ┆ 7      ┆ b      │
# │ 3     ┆ 8      ┆ c      │
# └───────┴────────┴────────┘


397
398
399
# File 'lib/polars/data_frame.rb', line 397

def columns=(columns)
  _df.set_column_names(columns)
end

#delete(name) ⇒ Series

Drop in place if exists.



2852
2853
2854
# File 'lib/polars/data_frame.rb', line 2852

def delete(name)
  drop_in_place(name) if include?(name)
end

#describeDataFrame

Summary statistics for a DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1.0, 2.8, 3.0],
    "b" => [4, 5, nil],
    "c" => [true, false, true],
    "d" => [nil, "b", "c"],
    "e" => ["usd", "eur", nil]
  }
)
df.describe
# =>
# shape: (7, 6)
# ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┐
# │ describe   ┆ a        ┆ b        ┆ c        ┆ d    ┆ e    │
# │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---  │
# │ str        ┆ f64      ┆ f64      ┆ f64      ┆ str  ┆ str  │
# ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╡
# │ count      ┆ 3.0      ┆ 3.0      ┆ 3.0      ┆ 3    ┆ 3    │
# │ null_count ┆ 0.0      ┆ 1.0      ┆ 0.0      ┆ 1    ┆ 1    │
# │ mean       ┆ 2.266667 ┆ 4.5      ┆ 0.666667 ┆ null ┆ null │
# │ std        ┆ 1.101514 ┆ 0.707107 ┆ 0.57735  ┆ null ┆ null │
# │ min        ┆ 1.0      ┆ 4.0      ┆ 0.0      ┆ b    ┆ eur  │
# │ max        ┆ 3.0      ┆ 5.0      ┆ 1.0      ┆ c    ┆ usd  │
# │ median     ┆ 2.8      ┆ 4.5      ┆ 1.0      ┆ null ┆ null │
# └────────────┴──────────┴──────────┴──────────┴──────┴──────┘


1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
# File 'lib/polars/data_frame.rb', line 1341

def describe
  describe_cast = lambda do |stat|
    columns = []
    self.columns.each_with_index do |s, i|
      if self[s].is_numeric || self[s].is_boolean
        columns << stat[0.., i].cast(:f64)
      else
        # for dates, strings, etc, we cast to string so that all
        # statistics can be shown
        columns << stat[0.., i].cast(:str)
      end
    end
    self.class.new(columns)
  end

  summary = _from_rbdf(
    Polars.concat(
      [
        describe_cast.(
          self.class.new(columns.to_h { |c| [c, [height]] })
        ),
        describe_cast.(null_count),
        describe_cast.(mean),
        describe_cast.(std),
        describe_cast.(min),
        describe_cast.(max),
        describe_cast.(median)
      ]
    )._df
  )
  summary.insert_column(
    0,
    Polars::Series.new(
      "describe",
      ["count", "null_count", "mean", "std", "min", "max", "median"],
    )
  )
  summary
end

#drop(columns) ⇒ DataFrame

Remove column from DataFrame and return as new.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6.0, 7.0, 8.0],
    "ham" => ["a", "b", "c"]
  }
)
df.drop("ham")
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ foo ┆ bar │
# │ --- ┆ --- │
# │ i64 ┆ f64 │
# ╞═════╪═════╡
# │ 1   ┆ 6.0 │
# │ 2   ┆ 7.0 │
# │ 3   ┆ 8.0 │
# └─────┴─────┘


2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
# File 'lib/polars/data_frame.rb', line 2806

def drop(columns)
  if columns.is_a?(::Array)
    df = clone
    columns.each do |n|
      df._df.drop_in_place(n)
    end
    df
  else
    _from_rbdf(_df.drop(columns))
  end
end

#drop_in_place(name) ⇒ Series

Drop in place.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.drop_in_place("ham")
# =>
# shape: (3,)
# Series: 'ham' [str]
# [
#         "a"
#         "b"
#         "c"
# ]


2842
2843
2844
# File 'lib/polars/data_frame.rb', line 2842

def drop_in_place(name)
  Utils.wrap_s(_df.drop_in_place(name))
end

#drop_nulls(subset: nil) ⇒ DataFrame

Return a new DataFrame where the null values are dropped.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, nil, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.drop_nulls
# =>
# shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 6   ┆ a   │
# │ 3   ┆ 8   ┆ c   │
# └─────┴─────┴─────┘


1722
1723
1724
1725
1726
1727
# File 'lib/polars/data_frame.rb', line 1722

def drop_nulls(subset: nil)
  if subset.is_a?(::String)
    subset = [subset]
  end
  _from_rbdf(_df.drop_nulls(subset))
end

#dtypesArray

Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6.0, 7.0, 8.0],
    "ham" => ["a", "b", "c"]
  }
)
df.dtypes
# => [Polars::Int64, Polars::Float64, Polars::String]


415
416
417
# File 'lib/polars/data_frame.rb', line 415

def dtypes
  _df.dtypes
end

#each(&block) ⇒ Object

Returns an enumerator.



564
565
566
# File 'lib/polars/data_frame.rb', line 564

def each(&block)
  get_columns.each(&block)
end

#each_row(named: true, buffer_size: 500, &block) ⇒ Object

Returns an iterator over the DataFrame of rows of Ruby-native values.



4573
4574
4575
# File 'lib/polars/data_frame.rb', line 4573

def each_row(named: true, buffer_size: 500, &block)
  iter_rows(named: named, buffer_size: buffer_size, &block)
end

#equals(other, null_equal: true) ⇒ Boolean Also known as: frame_equal

Check if DataFrame is equal to other.

Examples:

df1 = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6.0, 7.0, 8.0],
    "ham" => ["a", "b", "c"]
  }
)
df2 = Polars::DataFrame.new(
  {
    "foo" => [3, 2, 1],
    "bar" => [8.0, 7.0, 6.0],
    "ham" => ["c", "b", "a"]
  }
)
df1.equals(df1)
# => true
df1.equals(df2)
# => false


1534
1535
1536
# File 'lib/polars/data_frame.rb', line 1534

def equals(other, null_equal: true)
  _df.equals(other._df, null_equal)
end

#estimated_size(unit = "b") ⇒ Numeric

Return an estimation of the total (heap) allocated size of the DataFrame.

Estimated size is given in the specified unit (bytes by default).

This estimation is the sum of the size of its buffers, validity, including nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the sum of the sizes computed from this function. In particular, StructArray's size is an upper bound.

When an array is sliced, its allocated size remains constant because the buffer unchanged. However, this function will yield a smaller number. This is because this function returns the visible size of the buffer, not its total capacity.

FFI buffers are included in this estimation.

Examples:

df = Polars::DataFrame.new(
  {
    "x" => 1_000_000.times.to_a.reverse,
    "y" => 1_000_000.times.map { |v| v / 1000.0 },
    "z" => 1_000_000.times.map(&:to_s)
  },
  columns: {"x" => :u32, "y" => :f64, "z" => :str}
)
df.estimated_size
# => 25888898
df.estimated_size("mb")
# => 24.689577102661133


1088
1089
1090
1091
# File 'lib/polars/data_frame.rb', line 1088

def estimated_size(unit = "b")
  sz = _df.estimated_size
  Utils.scale_bytes(sz, to: unit)
end

#explode(columns) ⇒ DataFrame

Explode DataFrame to long format by exploding a column with Lists.

Examples:

df = Polars::DataFrame.new(
  {
    "letters" => ["a", "a", "b", "c"],
    "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
  }
)
df.explode("numbers")
# =>
# shape: (8, 2)
# ┌─────────┬─────────┐
# │ letters ┆ numbers │
# │ ---     ┆ ---     │
# │ str     ┆ i64     │
# ╞═════════╪═════════╡
# │ a       ┆ 1       │
# │ a       ┆ 2       │
# │ a       ┆ 3       │
# │ b       ┆ 4       │
# │ b       ┆ 5       │
# │ c       ┆ 6       │
# │ c       ┆ 7       │
# │ c       ┆ 8       │
# └─────────┴─────────┘


3068
3069
3070
# File 'lib/polars/data_frame.rb', line 3068

def explode(columns)
  lazy.explode(columns).collect(no_optimization: true)
end

#extend(other) ⇒ DataFrame

Extend the memory backed by this DataFrame with the values from other.

Different from vstack which adds the chunks from other to the chunks of this DataFrame extend appends the data from other to the underlying memory locations and thus may cause a reallocation.

If this does not cause a reallocation, the resulting data structure will not have any extra chunks and thus will yield faster queries.

Prefer extend over vstack when you want to do a query after a single append. For instance during online operations where you add n rows and rerun a query.

Prefer vstack over extend when you want to append many times before doing a query. For instance when you read in multiple files and when to store them in a single DataFrame. In the latter case, finish the sequence of vstack operations with a rechunk.

Examples:

df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]})
df1.extend(df2)
# =>
# shape: (6, 2)
# ┌─────┬─────┐
# │ foo ┆ bar │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 4   │
# │ 2   ┆ 5   │
# │ 3   ┆ 6   │
# │ 10  ┆ 40  │
# │ 20  ┆ 50  │
# │ 30  ┆ 60  │
# └─────┴─────┘


2774
2775
2776
2777
# File 'lib/polars/data_frame.rb', line 2774

def extend(other)
  _df.extend(other._df)
  self
end

#fill_nan(fill_value) ⇒ DataFrame

Note:

Note that floating point NaNs (Not a Number) are not missing values! To replace missing values, use fill_null.

Fill floating point NaN values by an Expression evaluation.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1.5, 2, Float::NAN, 4],
    "b" => [0.5, 4, Float::NAN, 13]
  }
)
df.fill_nan(99)
# =>
# shape: (4, 2)
# ┌──────┬──────┐
# │ a    ┆ b    │
# │ ---  ┆ ---  │
# │ f64  ┆ f64  │
# ╞══════╪══════╡
# │ 1.5  ┆ 0.5  │
# │ 2.0  ┆ 4.0  │
# │ 99.0 ┆ 99.0 │
# │ 4.0  ┆ 13.0 │
# └──────┴──────┘


3033
3034
3035
# File 'lib/polars/data_frame.rb', line 3033

def fill_nan(fill_value)
  lazy.fill_nan(fill_value).collect(no_optimization: true)
end

#fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: true) ⇒ DataFrame

Fill null values using the specified value or strategy.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, nil, 4],
    "b" => [0.5, 4, nil, 13]
  }
)
df.fill_null(99)
# =>
# shape: (4, 2)
# ┌─────┬──────┐
# │ a   ┆ b    │
# │ --- ┆ ---  │
# │ i64 ┆ f64  │
# ╞═════╪══════╡
# │ 1   ┆ 0.5  │
# │ 2   ┆ 4.0  │
# │ 99  ┆ 99.0 │
# │ 4   ┆ 13.0 │
# └─────┴──────┘
df.fill_null(strategy: "forward")
# =>
# shape: (4, 2)
# ┌─────┬──────┐
# │ a   ┆ b    │
# │ --- ┆ ---  │
# │ i64 ┆ f64  │
# ╞═════╪══════╡
# │ 1   ┆ 0.5  │
# │ 2   ┆ 4.0  │
# │ 2   ┆ 4.0  │
# │ 4   ┆ 13.0 │
# └─────┴──────┘
df.fill_null(strategy: "max")
# =>
# shape: (4, 2)
# ┌─────┬──────┐
# │ a   ┆ b    │
# │ --- ┆ ---  │
# │ i64 ┆ f64  │
# ╞═════╪══════╡
# │ 1   ┆ 0.5  │
# │ 2   ┆ 4.0  │
# │ 4   ┆ 13.0 │
# │ 4   ┆ 13.0 │
# └─────┴──────┘
df.fill_null(strategy: "zero")
# =>
# shape: (4, 2)
# ┌─────┬──────┐
# │ a   ┆ b    │
# │ --- ┆ ---  │
# │ i64 ┆ f64  │
# ╞═════╪══════╡
# │ 1   ┆ 0.5  │
# │ 2   ┆ 4.0  │
# │ 0   ┆ 0.0  │
# │ 4   ┆ 13.0 │
# └─────┴──────┘


2993
2994
2995
2996
2997
2998
2999
3000
# File 'lib/polars/data_frame.rb', line 2993

def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: true)
  _from_rbdf(
    lazy
      .fill_null(value, strategy: strategy, limit: limit, matches_supertype: matches_supertype)
      .collect(no_optimization: true)
      ._df
  )
end

#filter(predicate) ⇒ DataFrame

Filter the rows in the DataFrame based on a predicate expression.

Examples:

Filter on one condition:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.filter(Polars.col("foo") < 3)
# =>
# shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 6   ┆ a   │
# │ 2   ┆ 7   ┆ b   │
# └─────┴─────┴─────┘

Filter on multiple conditions:

df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a"))
# =>
# shape: (1, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 6   ┆ a   │
# └─────┴─────┴─────┘


1307
1308
1309
# File 'lib/polars/data_frame.rb', line 1307

def filter(predicate)
  lazy.filter(predicate).collect
end

#fold(&operation) ⇒ Series

Apply a horizontal reduction on a DataFrame.

This can be used to effectively determine aggregations on a row level, and can be applied to any DataType that can be supercasted (casted to a similar parent type).

An example of the supercast rules when applying an arithmetic operation on two DataTypes are for instance:

i8 + str = str f32 + i64 = f32 f32 + f64 = f64

Examples:

A horizontal sum operation:

df = Polars::DataFrame.new(
  {
    "a" => [2, 1, 3],
    "b" => [1, 2, 3],
    "c" => [1.0, 2.0, 3.0]
  }
)
df.fold { |s1, s2| s1 + s2 }
# =>
# shape: (3,)
# Series: 'a' [f64]
# [
#         4.0
#         5.0
#         9.0
# ]

A horizontal minimum operation:

df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
# =>
# shape: (3,)
# Series: 'a' [f64]
# [
#         1.0
#         1.0
#         3.0
# ]

A horizontal string concatenation:

df = Polars::DataFrame.new(
  {
    "a" => ["foo", "bar", 2],
    "b" => [1, 2, 3],
    "c" => [1.0, 2.0, 3.0]
  }
)
df.fold { |s1, s2| s1 + s2 }
# =>
# shape: (3,)
# Series: 'a' [str]
# [
#         "foo11.0"
#         "bar22.0"
#         null
# ]

A horizontal boolean or, similar to a row-wise .any():

df = Polars::DataFrame.new(
  {
    "a" => [false, false, true],
    "b" => [false, true, false]
  }
)
df.fold { |s1, s2| s1 | s2 }
# =>
# shape: (3,)
# Series: 'a' [bool]
# [
#         false
#         true
#         true
# ]


4382
4383
4384
4385
4386
4387
4388
4389
# File 'lib/polars/data_frame.rb', line 4382

def fold(&operation)
  acc = to_series(0)

  1.upto(width - 1) do |i|
    acc = operation.call(acc, to_series(i))
  end
  acc
end

#gather_every(n, offset = 0) ⇒ DataFrame Also known as: take_every

Take every nth row in the DataFrame and return as a new DataFrame.

Examples:

s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
s.gather_every(2)
# =>
# shape: (2, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 5   │
# │ 3   ┆ 7   │
# └─────┴─────┘


4610
4611
4612
# File 'lib/polars/data_frame.rb', line 4610

def gather_every(n, offset = 0)
  select(Utils.col("*").gather_every(n, offset))
end

#get_column(name) ⇒ Series

Get a single column as Series by name.

Examples:

df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
df.get_column("foo")
# =>
# shape: (3,)
# Series: 'foo' [i64]
# [
#         1
#         2
#         3
# ]


2910
2911
2912
# File 'lib/polars/data_frame.rb', line 2910

def get_column(name)
  self[name]
end

#get_column_index(name) ⇒ Series Also known as: find_idx_by_name

Find the index of a column by name.

Examples:

df = Polars::DataFrame.new(
  {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
)
df.get_column_index("ham")
# => 2


1394
1395
1396
# File 'lib/polars/data_frame.rb', line 1394

def get_column_index(name)
  _df.get_column_index(name)
end

#get_columnsArray

Get the DataFrame as a Array of Series.



2888
2889
2890
# File 'lib/polars/data_frame.rb', line 2888

def get_columns
  _df.get_columns.map { |s| Utils.wrap_s(s) }
end

#group_by(by, maintain_order: false) ⇒ GroupBy Also known as: groupby, group

Start a group by operation.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => ["a", "b", "a", "b", "b", "c"],
    "b" => [1, 2, 3, 4, 5, 6],
    "c" => [6, 5, 4, 3, 2, 1]
  }
)
df.group_by("a").agg(Polars.col("b").sum).sort("a")
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ str ┆ i64 │
# ╞═════╪═════╡
# │ a   ┆ 4   │
# │ b   ┆ 11  │
# │ c   ┆ 6   │
# └─────┴─────┘


1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
# File 'lib/polars/data_frame.rb', line 1832

def group_by(by, maintain_order: false)
  if !Utils.bool?(maintain_order)
    raise TypeError, "invalid input for group_by arg `maintain_order`: #{maintain_order}."
  end
  GroupBy.new(
    self,
    by,
    maintain_order: maintain_order
  )
end

#group_by_dynamic(index_column, every:, period: nil, offset: nil, truncate: true, include_boundaries: false, closed: "left", by: nil, start_by: "window") ⇒ DataFrame Also known as: groupby_dynamic

Group based on a time value (or index value of type :i32, :i64).

Time windows are calculated and rows are assigned to windows. Different from a normal group by is that a row can be member of multiple groups. The time/index window could be seen as a rolling window, with a window size determined by dates/times/values instead of slots in the DataFrame.

A window is defined by:

  • every: interval of the window
  • period: length of the window
  • offset: offset of the window

The every, period and offset arguments are created with the following string language:

  • 1ns (1 nanosecond)
  • 1us (1 microsecond)
  • 1ms (1 millisecond)
  • 1s (1 second)
  • 1m (1 minute)
  • 1h (1 hour)
  • 1d (1 day)
  • 1w (1 week)
  • 1mo (1 calendar month)
  • 1y (1 calendar year)
  • 1i (1 index count)

Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds

In case of a group_by_dynamic on an integer column, the windows are defined by:

  • "1i" # length 1
  • "10i" # length 10

Examples:

df = Polars::DataFrame.new(
  {
    "time" => Polars.date_range(
      DateTime.new(2021, 12, 16),
      DateTime.new(2021, 12, 16, 3),
      "30m"
    ),
    "n" => 0..6
  }
)
# =>
# shape: (7, 2)
# ┌─────────────────────┬─────┐
# │ time                ┆ n   │
# │ ---                 ┆ --- │
# │ datetime[μs]        ┆ i64 │
# ╞═════════════════════╪═════╡
# │ 2021-12-16 00:00:00 ┆ 0   │
# │ 2021-12-16 00:30:00 ┆ 1   │
# │ 2021-12-16 01:00:00 ┆ 2   │
# │ 2021-12-16 01:30:00 ┆ 3   │
# │ 2021-12-16 02:00:00 ┆ 4   │
# │ 2021-12-16 02:30:00 ┆ 5   │
# │ 2021-12-16 03:00:00 ┆ 6   │
# └─────────────────────┴─────┘

Group by windows of 1 hour starting at 2021-12-16 00:00:00.

df.group_by_dynamic("time", every: "1h", closed: "right").agg(
  [
    Polars.col("time").min.alias("time_min"),
    Polars.col("time").max.alias("time_max")
  ]
)
# =>
# shape: (4, 3)
# ┌─────────────────────┬─────────────────────┬─────────────────────┐
# │ time                ┆ time_min            ┆ time_max            │
# │ ---                 ┆ ---                 ┆ ---                 │
# │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        │
# ╞═════════════════════╪═════════════════════╪═════════════════════╡
# │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
# │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
# │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
# │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
# └─────────────────────┴─────────────────────┴─────────────────────┘

The window boundaries can also be added to the aggregation result.

df.group_by_dynamic(
  "time", every: "1h", include_boundaries: true, closed: "right"
).agg([Polars.col("time").count.alias("time_count")])
# =>
# shape: (4, 4)
# ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
# │ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
# │ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
# │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
# ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
# │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
# │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2          │
# │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
# │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
# └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘

When closed="left", should not include right end of interval.

df.group_by_dynamic("time", every: "1h", closed: "left").agg(
  [
    Polars.col("time").count.alias("time_count"),
    Polars.col("time").alias("time_agg_list")
  ]
)
# =>
# shape: (4, 3)
# ┌─────────────────────┬────────────┬───────────────────────────────────┐
# │ time                ┆ time_count ┆ time_agg_list                     │
# │ ---                 ┆ ---        ┆ ---                               │
# │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                │
# ╞═════════════════════╪════════════╪═══════════════════════════════════╡
# │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16… │
# │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16… │
# │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16… │
# │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]             │
# └─────────────────────┴────────────┴───────────────────────────────────┘

When closed="both" the time values at the window boundaries belong to 2 groups.

df.group_by_dynamic("time", every: "1h", closed: "both").agg(
  [Polars.col("time").count.alias("time_count")]
)
# =>
# shape: (5, 2)
# ┌─────────────────────┬────────────┐
# │ time                ┆ time_count │
# │ ---                 ┆ ---        │
# │ datetime[μs]        ┆ u32        │
# ╞═════════════════════╪════════════╡
# │ 2021-12-15 23:00:00 ┆ 1          │
# │ 2021-12-16 00:00:00 ┆ 3          │
# │ 2021-12-16 01:00:00 ┆ 3          │
# │ 2021-12-16 02:00:00 ┆ 3          │
# │ 2021-12-16 03:00:00 ┆ 1          │
# └─────────────────────┴────────────┘

Dynamic group bys can also be combined with grouping on normal keys.

df = Polars::DataFrame.new(
  {
    "time" => Polars.date_range(
      DateTime.new(2021, 12, 16),
      DateTime.new(2021, 12, 16, 3),
      "30m"
    ),
    "groups" => ["a", "a", "a", "b", "b", "a", "a"]
  }
)
df.group_by_dynamic(
  "time",
  every: "1h",
  closed: "both",
  by: "groups",
  include_boundaries: true
).agg([Polars.col("time").count.alias("time_count")])
# =>
# shape: (7, 5)
# ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
# │ groups ┆ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
# │ ---    ┆ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
# │ str    ┆ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
# ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
# │ a      ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
# │ a      ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3          │
# │ a      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1          │
# │ a      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
# │ a      ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1          │
# │ b      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
# │ b      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1          │
# └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘

Dynamic group by on an index column.

df = Polars::DataFrame.new(
  {
    "idx" => Polars.arange(0, 6, eager: true),
    "A" => ["A", "A", "B", "B", "B", "C"]
  }
)
df.group_by_dynamic(
  "idx",
  every: "2i",
  period: "3i",
  include_boundaries: true,
  closed: "right"
).agg(Polars.col("A").alias("A_agg_list"))
# =>
# shape: (3, 4)
# ┌─────────────────┬─────────────────┬─────┬─────────────────┐
# │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list      │
# │ ---             ┆ ---             ┆ --- ┆ ---             │
# │ i64             ┆ i64             ┆ i64 ┆ list[str]       │
# ╞═════════════════╪═════════════════╪═════╪═════════════════╡
# │ 0               ┆ 3               ┆ 0   ┆ ["A", "B", "B"] │
# │ 2               ┆ 5               ┆ 2   ┆ ["B", "B", "C"] │
# │ 4               ┆ 7               ┆ 4   ┆ ["C"]           │
# └─────────────────┴─────────────────┴─────┴─────────────────┘


2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
# File 'lib/polars/data_frame.rb', line 2173

def group_by_dynamic(
  index_column,
  every:,
  period: nil,
  offset: nil,
  truncate: true,
  include_boundaries: false,
  closed: "left",
  by: nil,
  start_by: "window"
)
  DynamicGroupBy.new(
    self,
    index_column,
    every,
    period,
    offset,
    truncate,
    include_boundaries,
    closed,
    by,
    start_by
  )
end

#group_by_rolling(index_column:, period:, offset: nil, closed: "right", by: nil, check_sorted: true) ⇒ RollingGroupBy Also known as: groupby_rolling

Create rolling groups based on a time column.

Also works for index values of type :i32 or :i64.

Different from a dynamic_group_by the windows are now determined by the individual values and are not of constant intervals. For constant intervals use group_by_dynamic

The period and offset arguments are created either from a timedelta, or by using the following string language:

  • 1ns (1 nanosecond)
  • 1us (1 microsecond)
  • 1ms (1 millisecond)
  • 1s (1 second)
  • 1m (1 minute)
  • 1h (1 hour)
  • 1d (1 day)
  • 1w (1 week)
  • 1mo (1 calendar month)
  • 1y (1 calendar year)
  • 1i (1 index count)

Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds

In case of a group_by_rolling on an integer column, the windows are defined by:

  • "1i" # length 1
  • "10i" # length 10

Examples:

dates = [
  "2020-01-01 13:45:48",
  "2020-01-01 16:42:13",
  "2020-01-01 16:45:09",
  "2020-01-02 18:12:48",
  "2020-01-03 19:45:32",
  "2020-01-08 23:16:43"
]
df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
  Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
)
df.group_by_rolling(index_column: "dt", period: "2d").agg(
  [
    Polars.sum("a").alias("sum_a"),
    Polars.min("a").alias("min_a"),
    Polars.max("a").alias("max_a")
  ]
)
# =>
# shape: (6, 4)
# ┌─────────────────────┬───────┬───────┬───────┐
# │ dt                  ┆ sum_a ┆ min_a ┆ max_a │
# │ ---                 ┆ ---   ┆ ---   ┆ ---   │
# │ datetime[μs]        ┆ i64   ┆ i64   ┆ i64   │
# ╞═════════════════════╪═══════╪═══════╪═══════╡
# │ 2020-01-01 13:45:48 ┆ 3     ┆ 3     ┆ 3     │
# │ 2020-01-01 16:42:13 ┆ 10    ┆ 3     ┆ 7     │
# │ 2020-01-01 16:45:09 ┆ 15    ┆ 3     ┆ 7     │
# │ 2020-01-02 18:12:48 ┆ 24    ┆ 3     ┆ 9     │
# │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
# │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
# └─────────────────────┴───────┴───────┴───────┘


1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
# File 'lib/polars/data_frame.rb', line 1935

def group_by_rolling(
  index_column:,
  period:,
  offset: nil,
  closed: "right",
  by: nil,
  check_sorted: true
)
  RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
end

#hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil) ⇒ Series

Hash and combine the rows in this DataFrame.

The hash value is of type :u64.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, nil, 3, 4],
    "ham" => ["a", "b", nil, "d"]
  }
)
df.hash_rows(seed: 42)
# =>
# shape: (4,)
# Series: '' [u64]
# [
#         4238614331852490969
#         17976148875586754089
#         4702262519505526977
#         18144177983981041107
# ]


4647
4648
4649
4650
4651
4652
4653
# File 'lib/polars/data_frame.rb', line 4647

def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
  k0 = seed
  k1 = seed_1.nil? ? seed : seed_1
  k2 = seed_2.nil? ? seed : seed_2
  k3 = seed_3.nil? ? seed : seed_3
  Utils.wrap_s(_df.hash_rows(k0, k1, k2, k3))
end

#head(n = 5) ⇒ DataFrame

Get the first n rows.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3, 4, 5],
    "bar" => [6, 7, 8, 9, 10],
    "ham" => ["a", "b", "c", "d", "e"]
  }
)
df.head(3)
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 6   ┆ a   │
# │ 2   ┆ 7   ┆ b   │
# │ 3   ┆ 8   ┆ c   │
# └─────┴─────┴─────┘


1661
1662
1663
# File 'lib/polars/data_frame.rb', line 1661

def head(n = 5)
  _from_rbdf(_df.head(n))
end

#heightInteger Also known as: count, length, size

Get the height of the DataFrame.

Examples:

df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
df.height
# => 5


331
332
333
# File 'lib/polars/data_frame.rb', line 331

def height
  _df.height
end

#hstack(columns, in_place: false) ⇒ DataFrame

Return a new DataFrame grown horizontally by stacking multiple Series to it.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
x = Polars::Series.new("apple", [10, 20, 30])
df.hstack([x])
# =>
# shape: (3, 4)
# ┌─────┬─────┬─────┬───────┐
# │ foo ┆ bar ┆ ham ┆ apple │
# │ --- ┆ --- ┆ --- ┆ ---   │
# │ i64 ┆ i64 ┆ str ┆ i64   │
# ╞═════╪═════╪═════╪═══════╡
# │ 1   ┆ 6   ┆ a   ┆ 10    │
# │ 2   ┆ 7   ┆ b   ┆ 20    │
# │ 3   ┆ 8   ┆ c   ┆ 30    │
# └─────┴─────┴─────┴───────┘


2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
# File 'lib/polars/data_frame.rb', line 2676

def hstack(columns, in_place: false)
  if !columns.is_a?(::Array)
    columns = columns.get_columns
  end
  if in_place
    _df.hstack_mut(columns.map(&:_s))
    self
  else
    _from_rbdf(_df.hstack(columns.map(&:_s)))
  end
end

#include?(name) ⇒ Boolean

Check if DataFrame includes column.



557
558
559
# File 'lib/polars/data_frame.rb', line 557

def include?(name)
  columns.include?(name)
end

#insert_column(index, series) ⇒ DataFrame Also known as: insert_at_idx

Insert a Series at a certain column index. This operation is in place.

Examples:

df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
s = Polars::Series.new("baz", [97, 98, 99])
df.insert_column(1, s)
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ baz ┆ bar │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 97  ┆ 4   │
# │ 2   ┆ 98  ┆ 5   │
# │ 3   ┆ 99  ┆ 6   │
# └─────┴─────┴─────┘
df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3, 4],
    "b" => [0.5, 4, 10, 13],
    "c" => [true, true, false, true]
  }
)
s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
df.insert_column(3, s)
# =>
# shape: (4, 4)
# ┌─────┬──────┬───────┬──────┐
# │ a   ┆ b    ┆ c     ┆ d    │
# │ --- ┆ ---  ┆ ---   ┆ ---  │
# │ i64 ┆ f64  ┆ bool  ┆ f64  │
# ╞═════╪══════╪═══════╪══════╡
# │ 1   ┆ 0.5  ┆ true  ┆ -2.5 │
# │ 2   ┆ 4.0  ┆ true  ┆ 15.0 │
# │ 3   ┆ 10.0 ┆ false ┆ 20.5 │
# │ 4   ┆ 13.0 ┆ true  ┆ 0.0  │
# └─────┴──────┴───────┴──────┘


1260
1261
1262
1263
1264
1265
1266
# File 'lib/polars/data_frame.rb', line 1260

def insert_column(index, series)
  if index < 0
    index = columns.length + index
  end
  _df.insert_column(index, series._s)
  self
end

#interpolateDataFrame

Interpolate intermediate values. The interpolation method is linear.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, nil, 9, 10],
    "bar" => [6, 7, 9, nil],
    "baz" => [1, nil, nil, 9]
  }
)
df.interpolate
# =>
# shape: (4, 3)
# ┌──────┬──────┬──────────┐
# │ foo  ┆ bar  ┆ baz      │
# │ ---  ┆ ---  ┆ ---      │
# │ f64  ┆ f64  ┆ f64      │
# ╞══════╪══════╪══════════╡
# │ 1.0  ┆ 6.0  ┆ 1.0      │
# │ 5.0  ┆ 7.0  ┆ 3.666667 │
# │ 9.0  ┆ 9.0  ┆ 6.333333 │
# │ 10.0 ┆ null ┆ 9.0      │
# └──────┴──────┴──────────┘


4680
4681
4682
# File 'lib/polars/data_frame.rb', line 4680

def interpolate
  select(Utils.col("*").interpolate)
end

#is_duplicatedSeries

Get a mask of all duplicated rows in this DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3, 1],
    "b" => ["x", "y", "z", "x"],
  }
)
df.is_duplicated
# =>
# shape: (4,)
# Series: '' [bool]
# [
#         true
#         false
#         false
#         true
# ]


3556
3557
3558
# File 'lib/polars/data_frame.rb', line 3556

def is_duplicated
  Utils.wrap_s(_df.is_duplicated)
end

#is_emptyBoolean Also known as: empty?

Check if the dataframe is empty.

Examples:

df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
df.is_empty
# => false
df.filter(Polars.col("foo") > 99).is_empty
# => true


4694
4695
4696
# File 'lib/polars/data_frame.rb', line 4694

def is_empty
  height == 0
end

#is_uniqueSeries

Get a mask of all unique rows in this DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3, 1],
    "b" => ["x", "y", "z", "x"]
  }
)
df.is_unique
# =>
# shape: (4,)
# Series: '' [bool]
# [
#         false
#         true
#         true
#         false
# ]


3581
3582
3583
# File 'lib/polars/data_frame.rb', line 3581

def is_unique
  Utils.wrap_s(_df.is_unique)
end

#itemObject

Return the dataframe as a scalar.

Equivalent to df[0,0], with a check that the shape is (1,1).

Examples:

df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]})
result = df.select((Polars.col("a") * Polars.col("b")).sum)
result.item
# => 32


721
722
723
724
725
726
# File 'lib/polars/data_frame.rb', line 721

def item
  if shape != [1, 1]
    raise ArgumentError, "Can only call .item if the dataframe is of shape (1,1), dataframe is of shape #{shape}"
  end
  self[0, 0]
end

#iter_rows(named: false, buffer_size: 500, &block) ⇒ Object

Returns an iterator over the DataFrame of rows of Ruby-native values.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 3, 5],
    "b" => [2, 4, 6]
  }
)
df.iter_rows.map { |row| row[0] }
# => [1, 3, 5]
df.iter_rows(named: true).map { |row| row["b"] }
# => [2, 4, 6]


4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
# File 'lib/polars/data_frame.rb', line 4526

def iter_rows(named: false, buffer_size: 500, &block)
  return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?

  # load into the local namespace for a modest performance boost in the hot loops
  columns = columns()

  # note: buffering rows results in a 2-4x speedup over individual calls
  # to ".row(i)", so it should only be disabled in extremely specific cases.
  if buffer_size
    offset = 0
    while offset < height
      zerocopy_slice = slice(offset, buffer_size)
      rows_chunk = zerocopy_slice.rows(named: false)
      if named
        rows_chunk.each do |row|
          yield columns.zip(row).to_h
        end
      else
        rows_chunk.each(&block)
      end
      offset += buffer_size
    end
  elsif named
    height.times do |i|
      yield columns.zip(row(i)).to_h
    end
  else
    height.times do |i|
      yield row(i)
    end
  end
end

#join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right") ⇒ DataFrame

Join in SQL-like fashion.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6.0, 7.0, 8.0],
    "ham" => ["a", "b", "c"]
  }
)
other_df = Polars::DataFrame.new(
  {
    "apple" => ["x", "y", "z"],
    "ham" => ["a", "b", "d"]
  }
)
df.join(other_df, on: "ham")
# =>
# shape: (2, 4)
# ┌─────┬─────┬─────┬───────┐
# │ foo ┆ bar ┆ ham ┆ apple │
# │ --- ┆ --- ┆ --- ┆ ---   │
# │ i64 ┆ f64 ┆ str ┆ str   │
# ╞═════╪═════╪═════╪═══════╡
# │ 1   ┆ 6.0 ┆ a   ┆ x     │
# │ 2   ┆ 7.0 ┆ b   ┆ y     │
# └─────┴─────┴─────┴───────┘
df.join(other_df, on: "ham", how: "outer")
# =>
# shape: (4, 5)
# ┌──────┬──────┬──────┬───────┬───────────┐
# │ foo  ┆ bar  ┆ ham  ┆ apple ┆ ham_right │
# │ ---  ┆ ---  ┆ ---  ┆ ---   ┆ ---       │
# │ i64  ┆ f64  ┆ str  ┆ str   ┆ str       │
# ╞══════╪══════╪══════╪═══════╪═══════════╡
# │ 1    ┆ 6.0  ┆ a    ┆ x     ┆ a         │
# │ 2    ┆ 7.0  ┆ b    ┆ y     ┆ b         │
# │ null ┆ null ┆ null ┆ z     ┆ d         │
# │ 3    ┆ 8.0  ┆ c    ┆ null  ┆ null      │
# └──────┴──────┴──────┴───────┴───────────┘
df.join(other_df, on: "ham", how: "left")
# =>
# shape: (3, 4)
# ┌─────┬─────┬─────┬───────┐
# │ foo ┆ bar ┆ ham ┆ apple │
# │ --- ┆ --- ┆ --- ┆ ---   │
# │ i64 ┆ f64 ┆ str ┆ str   │
# ╞═════╪═════╪═════╪═══════╡
# │ 1   ┆ 6.0 ┆ a   ┆ x     │
# │ 2   ┆ 7.0 ┆ b   ┆ y     │
# │ 3   ┆ 8.0 ┆ c   ┆ null  │
# └─────┴─────┴─────┴───────┘
df.join(other_df, on: "ham", how: "semi")
# =>
# shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ f64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 6.0 ┆ a   │
# │ 2   ┆ 7.0 ┆ b   │
# └─────┴─────┴─────┘
df.join(other_df, on: "ham", how: "anti")
# =>
# shape: (1, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ f64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 3   ┆ 8.0 ┆ c   │
# └─────┴─────┴─────┘


2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
# File 'lib/polars/data_frame.rb', line 2518

def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
  lazy
    .join(
      other.lazy,
      left_on: left_on,
      right_on: right_on,
      on: on,
      how: how,
      suffix: suffix,
    )
    .collect(no_optimization: true)
end

#join_asof(other, left_on: nil, right_on: nil, on: nil, by_left: nil, by_right: nil, by: nil, strategy: "backward", suffix: "_right", tolerance: nil, allow_parallel: true, force_parallel: false) ⇒ DataFrame

Perform an asof join.

This is similar to a left-join except that we match on nearest key rather than equal keys.

Both DataFrames must be sorted by the asof_join key.

For each row in the left DataFrame:

  • A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
  • A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.

The default is "backward".

Examples:

gdp = Polars::DataFrame.new(
  {
    "date" => [
      DateTime.new(2016, 1, 1),
      DateTime.new(2017, 1, 1),
      DateTime.new(2018, 1, 1),
      DateTime.new(2019, 1, 1),
    ],  # note record date: Jan 1st (sorted!)
    "gdp" => [4164, 4411, 4566, 4696]
  }
).set_sorted("date")
population = Polars::DataFrame.new(
  {
    "date" => [
      DateTime.new(2016, 5, 12),
      DateTime.new(2017, 5, 12),
      DateTime.new(2018, 5, 12),
      DateTime.new(2019, 5, 12),
    ],  # note record date: May 12th (sorted!)
    "population" => [82.19, 82.66, 83.12, 83.52]
  }
).set_sorted("date")
population.join_asof(
  gdp, left_on: "date", right_on: "date", strategy: "backward"
)
# =>
# shape: (4, 3)
# ┌─────────────────────┬────────────┬──────┐
# │ date                ┆ population ┆ gdp  │
# │ ---                 ┆ ---        ┆ ---  │
# │ datetime[ns]        ┆ f64        ┆ i64  │
# ╞═════════════════════╪════════════╪══════╡
# │ 2016-05-12 00:00:00 ┆ 82.19      ┆ 4164 │
# │ 2017-05-12 00:00:00 ┆ 82.66      ┆ 4411 │
# │ 2018-05-12 00:00:00 ┆ 83.12      ┆ 4566 │
# │ 2019-05-12 00:00:00 ┆ 83.52      ┆ 4696 │
# └─────────────────────┴────────────┴──────┘


2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
# File 'lib/polars/data_frame.rb', line 2390

def join_asof(
  other,
  left_on: nil,
  right_on: nil,
  on: nil,
  by_left: nil,
  by_right: nil,
  by: nil,
  strategy: "backward",
  suffix: "_right",
  tolerance: nil,
  allow_parallel: true,
  force_parallel: false
)
  lazy
    .join_asof(
      other.lazy,
      left_on: left_on,
      right_on: right_on,
      on: on,
      by_left: by_left,
      by_right: by_right,
      by: by,
      strategy: strategy,
      suffix: suffix,
      tolerance: tolerance,
      allow_parallel: allow_parallel,
      force_parallel: force_parallel
    )
    .collect(no_optimization: true)
end

#lazyLazyFrame

Start a lazy query from this point.



3588
3589
3590
# File 'lib/polars/data_frame.rb', line 3588

def lazy
  wrap_ldf(_df.lazy)
end

#limit(n = 5) ⇒ DataFrame

Get the first n rows.

Alias for #head.

Examples:

df = Polars::DataFrame.new(
  {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]}
)
df.limit(4)
# =>
# shape: (4, 2)
# ┌─────┬─────┐
# │ foo ┆ bar │
# │ --- ┆ --- │
# │ i64 ┆ str │
# ╞═════╪═════╡
# │ 1   ┆ a   │
# │ 2   ┆ b   │
# │ 3   ┆ c   │
# │ 4   ┆ d   │
# └─────┴─────┘


1630
1631
1632
# File 'lib/polars/data_frame.rb', line 1630

def limit(n = 5)
  head(n)
end

#max(axis: 0) ⇒ DataFrame

Aggregate the columns of this DataFrame to their maximum value.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.max
# =>
# shape: (1, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 3   ┆ 8   ┆ c   │
# └─────┴─────┴─────┘


3779
3780
3781
3782
3783
3784
3785
3786
3787
# File 'lib/polars/data_frame.rb', line 3779

def max(axis: 0)
  if axis == 0
    lazy.max.collect(_eager: true)
  elsif axis == 1
    Utils.wrap_s(_df.max_horizontal)
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end

#mean(axis: 0, null_strategy: "ignore") ⇒ DataFrame

Aggregate the columns of this DataFrame to their mean value.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.mean
# =>
# shape: (1, 3)
# ┌─────┬─────┬──────┐
# │ foo ┆ bar ┆ ham  │
# │ --- ┆ --- ┆ ---  │
# │ f64 ┆ f64 ┆ str  │
# ╞═════╪═════╪══════╡
# │ 2.0 ┆ 7.0 ┆ null │
# └─────┴─────┴──────┘


3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
# File 'lib/polars/data_frame.rb', line 3897

def mean(axis: 0, null_strategy: "ignore")
  case axis
  when 0
    lazy.mean.collect(_eager: true)
  when 1
    Utils.wrap_s(_df.mean_horizontal(null_strategy))
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end

#medianDataFrame

Aggregate the columns of this DataFrame to their median value.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.median
# =>
# shape: (1, 3)
# ┌─────┬─────┬──────┐
# │ foo ┆ bar ┆ ham  │
# │ --- ┆ --- ┆ ---  │
# │ f64 ┆ f64 ┆ str  │
# ╞═════╪═════╪══════╡
# │ 2.0 ┆ 7.0 ┆ null │
# └─────┴─────┴──────┘


4012
4013
4014
# File 'lib/polars/data_frame.rb', line 4012

def median
  lazy.median.collect(_eager: true)
end

#melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil) ⇒ DataFrame

Unpivot a DataFrame from wide to long format.

Optionally leaves identifiers set.

This function is useful to massage a DataFrame into a format where one or more columns are identifier variables (id_vars), while all other columns, considered measured variables (value_vars), are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => ["x", "y", "z"],
    "b" => [1, 3, 5],
    "c" => [2, 4, 6]
  }
)
df.melt(id_vars: "a", value_vars: ["b", "c"])
# =>
# shape: (6, 3)
# ┌─────┬──────────┬───────┐
# │ a   ┆ variable ┆ value │
# │ --- ┆ ---      ┆ ---   │
# │ str ┆ str      ┆ i64   │
# ╞═════╪══════════╪═══════╡
# │ x   ┆ b        ┆ 1     │
# │ y   ┆ b        ┆ 3     │
# │ z   ┆ b        ┆ 5     │
# │ x   ┆ c        ┆ 2     │
# │ y   ┆ c        ┆ 4     │
# │ z   ┆ c        ┆ 6     │
# └─────┴──────────┴───────┘


3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
# File 'lib/polars/data_frame.rb', line 3212

def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
  if value_vars.is_a?(::String)
    value_vars = [value_vars]
  end
  if id_vars.is_a?(::String)
    id_vars = [id_vars]
  end
  if value_vars.nil?
    value_vars = []
  end
  if id_vars.nil?
    id_vars = []
  end
  _from_rbdf(
    _df.melt(id_vars, value_vars, value_name, variable_name)
  )
end

#min(axis: 0) ⇒ DataFrame

Aggregate the columns of this DataFrame to their minimum value.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.min
# =>
# shape: (1, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 6   ┆ a   │
# └─────┴─────┴─────┘


3811
3812
3813
3814
3815
3816
3817
3818
3819
# File 'lib/polars/data_frame.rb', line 3811

def min(axis: 0)
  if axis == 0
    lazy.min.collect(_eager: true)
  elsif axis == 1
    Utils.wrap_s(_df.min_horizontal)
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end

#n_chunks(strategy: "first") ⇒ Object

Get number of chunks used by the ChunkedArrays of this DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3, 4],
    "b" => [0.5, 4, 10, 13],
    "c" => [true, true, false, true]
  }
)
df.n_chunks
# => 1
df.n_chunks(strategy: "all")
# => [1, 1, 1]


3747
3748
3749
3750
3751
3752
3753
3754
3755
# File 'lib/polars/data_frame.rb', line 3747

def n_chunks(strategy: "first")
  if strategy == "first"
    _df.n_chunks
  elsif strategy == "all"
    get_columns.map(&:n_chunks)
  else
    raise ArgumentError, "Strategy: '{strategy}' not understood. Choose one of {{'first',  'all'}}"
  end
end

#n_unique(subset: nil) ⇒ DataFrame

Return the number of unique rows, or the number of unique row-subsets.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 1, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
    "c" => [true, true, true, false, true, true]
  }
)
df.n_unique
# => 5

Simple columns subset

df.n_unique(subset: ["b", "c"])
# => 4

Expression subset

df.n_unique(
  subset: [
    (Polars.col("a").floordiv(2)),
    (Polars.col("c") | (Polars.col("b") >= 2))
  ]
)
# => 3


4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
# File 'lib/polars/data_frame.rb', line 4185

def n_unique(subset: nil)
  if subset.is_a?(StringIO)
    subset = [Polars.col(subset)]
  elsif subset.is_a?(Expr)
    subset = [subset]
  end

  if subset.is_a?(::Array) && subset.length == 1
    expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
  else
    struct_fields = subset.nil? ? Polars.all : subset
    expr = Polars.struct(struct_fields)
  end

  df = lazy.select(expr.n_unique).collect
  df.is_empty ? 0 : df.row(0)[0]
end

#null_countDataFrame

Create a new DataFrame that shows the null counts per column.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, nil, 3],
    "bar" => [6, 7, nil],
    "ham" => ["a", "b", "c"]
  }
)
df.null_count
# =>
# shape: (1, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ u32 ┆ u32 ┆ u32 │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 1   ┆ 0   │
# └─────┴─────┴─────┘


4235
4236
4237
# File 'lib/polars/data_frame.rb', line 4235

def null_count
  _from_rbdf(_df.null_count)
end

#partition_by(groups, maintain_order: true, include_key: true, as_dict: false) ⇒ Object

Split into multiple DataFrames partitioned by groups.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => ["A", "A", "B", "B", "C"],
    "N" => [1, 2, 2, 4, 2],
    "bar" => ["k", "l", "m", "m", "l"]
  }
)
df.partition_by("foo", maintain_order: true)
# =>
# [shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ N   ┆ bar │
# │ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ A   ┆ 1   ┆ k   │
# │ A   ┆ 2   ┆ l   │
# └─────┴─────┴─────┘, shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ N   ┆ bar │
# │ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ B   ┆ 2   ┆ m   │
# │ B   ┆ 4   ┆ m   │
# └─────┴─────┴─────┘, shape: (1, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ N   ┆ bar │
# │ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ C   ┆ 2   ┆ l   │
# └─────┴─────┴─────┘]
df.partition_by("foo", maintain_order: true, as_dict: true)
# =>
# {"A"=>shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ N   ┆ bar │
# │ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ A   ┆ 1   ┆ k   │
# │ A   ┆ 2   ┆ l   │
# └─────┴─────┴─────┘, "B"=>shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ N   ┆ bar │
# │ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ B   ┆ 2   ┆ m   │
# │ B   ┆ 4   ┆ m   │
# └─────┴─────┴─────┘, "C"=>shape: (1, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ N   ┆ bar │
# │ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ C   ┆ 2   ┆ l   │
# └─────┴─────┴─────┘}


3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
# File 'lib/polars/data_frame.rb', line 3429

def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
  if groups.is_a?(::String)
    groups = [groups]
  elsif !groups.is_a?(::Array)
    groups = Array(groups)
  end

  if as_dict
    out = {}
    if groups.length == 1
      _df.partition_by(groups, maintain_order, include_key).each do |df|
        df = _from_rbdf(df)
        out[df[groups][0, 0]] = df
      end
    else
      _df.partition_by(groups, maintain_order, include_key).each do |df|
        df = _from_rbdf(df)
        out[df[groups].row(0)] = df
      end
    end
    out
  else
    _df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
  end
end

#pipe(func, *args, **kwargs, &block) ⇒ Object

Note:

It is recommended to use LazyFrame when piping operations, in order to fully take advantage of query optimization and parallelization. See #lazy.

Offers a structured way to apply a sequence of user-defined functions (UDFs).

Examples:

cast_str_to_int = lambda do |data, col_name:|
  data.with_column(Polars.col(col_name).cast(:i64))
end

df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
df.pipe(cast_str_to_int, col_name: "b")
# =>
# shape: (4, 2)
# ┌─────┬─────┐
# │ a   ┆ b   │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 10  │
# │ 2   ┆ 20  │
# │ 3   ┆ 30  │
# │ 4   ┆ 40  │
# └─────┴─────┘


1765
1766
1767
# File 'lib/polars/data_frame.rb', line 1765

def pipe(func, *args, **kwargs, &block)
  func.call(self, *args, **kwargs, &block)
end

#pivot(values:, index:, columns:, aggregate_fn: "first", maintain_order: true, sort_columns: false, separator: "_") ⇒ DataFrame

Create a spreadsheet-style pivot table as a DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => ["one", "one", "one", "two", "two", "two"],
    "bar" => ["A", "B", "C", "A", "B", "C"],
    "baz" => [1, 2, 3, 4, 5, 6]
  }
)
df.pivot(values: "baz", index: "foo", columns: "bar")
# =>
# shape: (2, 4)
# ┌─────┬─────┬─────┬─────┐
# │ foo ┆ A   ┆ B   ┆ C   │
# │ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╪═════╡
# │ one ┆ 1   ┆ 2   ┆ 3   │
# │ two ┆ 4   ┆ 5   ┆ 6   │
# └─────┴─────┴─────┴─────┘


3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
# File 'lib/polars/data_frame.rb', line 3109

def pivot(
  values:,
  index:,
  columns:,
  aggregate_fn: "first",
  maintain_order: true,
  sort_columns: false,
  separator: "_"
)
  if values.is_a?(::String)
    values = [values]
  end
  if index.is_a?(::String)
    index = [index]
  end
  if columns.is_a?(::String)
    columns = [columns]
  end

  if aggregate_fn.is_a?(::String)
    case aggregate_fn
    when "first"
      aggregate_expr = Polars.element.first._rbexpr
    when "sum"
      aggregate_expr = Polars.element.sum._rbexpr
    when "max"
      aggregate_expr = Polars.element.max._rbexpr
    when "min"
      aggregate_expr = Polars.element.min._rbexpr
    when "mean"
      aggregate_expr = Polars.element.mean._rbexpr
    when "median"
      aggregate_expr = Polars.element.median._rbexpr
    when "last"
      aggregate_expr = Polars.element.last._rbexpr
    when "count"
      aggregate_expr = Polars.count._rbexpr
    else
      raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
    end
  elsif aggregate_fn.nil?
    aggregate_expr = nil
  else
    aggregate_expr = aggregate_function._rbexpr
  end

  _from_rbdf(
    _df.pivot_expr(
      values,
      index,
      columns,
      maintain_order,
      sort_columns,
      aggregate_expr,
      separator
    )
  )
end

#plot(x = nil, y = nil, type: nil, group: nil, stacked: nil) ⇒ Vega::LiteChart Originally defined in module Plot

Plot data.

Raises:

  • (ArgumentError)

#productDataFrame

Aggregate the columns of this DataFrame to their product values.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3],
    "b" => [0.5, 4, 10],
    "c" => [true, true, false]
  }
)
df.product
# =>
# shape: (1, 3)
# ┌─────┬──────┬─────┐
# │ a   ┆ b    ┆ c   │
# │ --- ┆ ---  ┆ --- │
# │ i64 ┆ f64  ┆ i64 │
# ╞═════╪══════╪═════╡
# │ 6   ┆ 20.0 ┆ 0   │
# └─────┴──────┴─────┘


4038
4039
4040
# File 'lib/polars/data_frame.rb', line 4038

def product
  select(Polars.all.product)
end

#quantile(quantile, interpolation: "nearest") ⇒ DataFrame

Aggregate the columns of this DataFrame to their quantile value.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.quantile(0.5, interpolation: "nearest")
# =>
# shape: (1, 3)
# ┌─────┬─────┬──────┐
# │ foo ┆ bar ┆ ham  │
# │ --- ┆ --- ┆ ---  │
# │ f64 ┆ f64 ┆ str  │
# ╞═════╪═════╪══════╡
# │ 2.0 ┆ 7.0 ┆ null │
# └─────┴─────┴──────┘


4069
4070
4071
# File 'lib/polars/data_frame.rb', line 4069

def quantile(quantile, interpolation: "nearest")
  lazy.quantile(quantile, interpolation: interpolation).collect(_eager: true)
end

#rechunkDataFrame

This will make sure all subsequent operations have optimal and predictable performance.



4209
4210
4211
# File 'lib/polars/data_frame.rb', line 4209

def rechunk
  _from_rbdf(_df.rechunk)
end

#rename(mapping) ⇒ DataFrame

Rename column names.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.rename({"foo" => "apple"})
# =>
# shape: (3, 3)
# ┌───────┬─────┬─────┐
# │ apple ┆ bar ┆ ham │
# │ ---   ┆ --- ┆ --- │
# │ i64   ┆ i64 ┆ str │
# ╞═══════╪═════╪═════╡
# │ 1     ┆ 6   ┆ a   │
# │ 2     ┆ 7   ┆ b   │
# │ 3     ┆ 8   ┆ c   │
# └───────┴─────┴─────┘


1209
1210
1211
# File 'lib/polars/data_frame.rb', line 1209

def rename(mapping)
  lazy.rename(mapping).collect(no_optimization: true)
end

#replace(column, new_col) ⇒ DataFrame

Replace a column by a new Series.

Examples:

df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
s = Polars::Series.new([10, 20, 30])
df.replace("foo", s)
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ foo ┆ bar │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 10  ┆ 4   │
# │ 20  ┆ 5   │
# │ 30  ┆ 6   │
# └─────┴─────┘


1563
1564
1565
1566
# File 'lib/polars/data_frame.rb', line 1563

def replace(column, new_col)
  _df.replace(column.to_s, new_col._s)
  self
end

#replace_column(index, series) ⇒ DataFrame Also known as: replace_at_idx

Replace a column at an index location.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
s = Polars::Series.new("apple", [10, 20, 30])
df.replace_column(0, s)
# =>
# shape: (3, 3)
# ┌───────┬─────┬─────┐
# │ apple ┆ bar ┆ ham │
# │ ---   ┆ --- ┆ --- │
# │ i64   ┆ i64 ┆ str │
# ╞═══════╪═════╪═════╡
# │ 10    ┆ 6   ┆ a   │
# │ 20    ┆ 7   ┆ b   │
# │ 30    ┆ 8   ┆ c   │
# └───────┴─────┴─────┘


1429
1430
1431
1432
1433
1434
1435
# File 'lib/polars/data_frame.rb', line 1429

def replace_column(index, series)
  if index < 0
    index = columns.length + index
  end
  _df.replace_column(index, series._s)
  self
end

#reverseDataFrame

Reverse the DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "key" => ["a", "b", "c"],
    "val" => [1, 2, 3]
  }
)
df.reverse
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ key ┆ val │
# │ --- ┆ --- │
# │ str ┆ i64 │
# ╞═════╪═════╡
# │ c   ┆ 3   │
# │ b   ┆ 2   │
# │ a   ┆ 1   │
# └─────┴─────┘


1178
1179
1180
# File 'lib/polars/data_frame.rb', line 1178

def reverse
  select(Polars.col("*").reverse)
end

#row(index = nil, by_predicate: nil, named: false) ⇒ Object

Note:

The index and by_predicate params are mutually exclusive. Additionally, to ensure clarity, the by_predicate parameter must be supplied by keyword.

When using by_predicate it is an error condition if anything other than one row is returned; more than one row raises TooManyRowsReturned, and zero rows will raise NoRowsReturned (both inherit from RowsException).

Get a row as tuple, either by index or by predicate.

Examples:

Return the row at the given index

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.row(2)
# => [3, 8, "c"]

Get a hash instead with a mapping of column names to row values

df.row(2, named: true)
# => {"foo"=>3, "bar"=>8, "ham"=>"c"}

Return the row that matches the given predicate

df.row(by_predicate: Polars.col("ham") == "b")
# => [2, 7, "b"]


4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
# File 'lib/polars/data_frame.rb', line 4430

def row(index = nil, by_predicate: nil, named: false)
  if !index.nil? && !by_predicate.nil?
    raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive"
  elsif index.is_a?(Expr)
    raise TypeError, "Expressions should be passed to the 'by_predicate' param"
  end

  if !index.nil?
    row = _df.row_tuple(index)
    if named
      columns.zip(row).to_h
    else
      row
    end
  elsif !by_predicate.nil?
    if !by_predicate.is_a?(Expr)
      raise TypeError, "Expected by_predicate to be an expression; found #{by_predicate.class.name}"
    end
    rows = filter(by_predicate).rows
    n_rows = rows.length
    if n_rows > 1
      raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows"
    elsif n_rows == 0
      raise NoRowsReturned, "Predicate #{by_predicate} returned no rows"
    end
    row = rows[0]
    if named
      columns.zip(row).to_h
    else
      row
    end
  else
    raise ArgumentError, "One of 'index' or 'by_predicate' must be set"
  end
end

#rows(named: false) ⇒ Array

Convert columnar data to rows as Ruby arrays.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 3, 5],
    "b" => [2, 4, 6]
  }
)
df.rows
# => [[1, 2], [3, 4], [5, 6]]
df.rows(named: true)
# => [{"a"=>1, "b"=>2}, {"a"=>3, "b"=>4}, {"a"=>5, "b"=>6}]


4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
# File 'lib/polars/data_frame.rb', line 4487

def rows(named: false)
  if named
    columns = columns()
    _df.row_tuples.map do |v|
      columns.zip(v).to_h
    end
  else
    _df.row_tuples
  end
end

#sample(n: nil, frac: nil, with_replacement: false, shuffle: false, seed: nil) ⇒ DataFrame

Sample from this DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.sample(n: 2, seed: 0)
# =>
# shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 3   ┆ 8   ┆ c   │
# │ 2   ┆ 7   ┆ b   │
# └─────┴─────┴─────┘


4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
# File 'lib/polars/data_frame.rb', line 4275

def sample(
  n: nil,
  frac: nil,
  with_replacement: false,
  shuffle: false,
  seed: nil
)
  if !n.nil? && !frac.nil?
    raise ArgumentError, "cannot specify both `n` and `frac`"
  end

  if n.nil? && !frac.nil?
    frac = Series.new("frac", [frac]) unless frac.is_a?(Series)

    _from_rbdf(
      _df.sample_frac(frac._s, with_replacement, shuffle, seed)
    )
  end

  if n.nil?
    n = 1
  end

  n = Series.new("", [n]) unless n.is_a?(Series)

  _from_rbdf(_df.sample_n(n._s, with_replacement, shuffle, seed))
end

#schemaHash

Get the schema.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6.0, 7.0, 8.0],
    "ham" => ["a", "b", "c"]
  }
)
df.schema
# => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}


433
434
435
# File 'lib/polars/data_frame.rb', line 433

def schema
  columns.zip(dtypes).to_h
end

#select(exprs) ⇒ DataFrame

Select columns from this DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.select("foo")
# =>
# shape: (3, 1)
# ┌─────┐
# │ foo │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 1   │
# │ 2   │
# │ 3   │
# └─────┘
df.select(["foo", "bar"])
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ foo ┆ bar │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 1   ┆ 6   │
# │ 2   ┆ 7   │
# │ 3   ┆ 8   │
# └─────┴─────┘
df.select(Polars.col("foo") + 1)
# =>
# shape: (3, 1)
# ┌─────┐
# │ foo │
# │ --- │
# │ i64 │
# ╞═════╡
# │ 2   │
# │ 3   │
# │ 4   │
# └─────┘
df.select([Polars.col("foo") + 1, Polars.col("bar") + 1])
# =>
# shape: (3, 2)
# ┌─────┬─────┐
# │ foo ┆ bar │
# │ --- ┆ --- │
# │ i64 ┆ i64 │
# ╞═════╪═════╡
# │ 2   ┆ 7   │
# │ 3   ┆ 8   │
# │ 4   ┆ 9   │
# └─────┴─────┘
df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0))
# =>
# shape: (3, 1)
# ┌─────────┐
# │ literal │
# │ ---     │
# │ i64     │
# ╞═════════╡
# │ 0       │
# │ 0       │
# │ 10      │
# └─────────┘


3675
3676
3677
3678
3679
3680
3681
3682
# File 'lib/polars/data_frame.rb', line 3675

def select(exprs)
  _from_rbdf(
    lazy
      .select(exprs)
      .collect(no_optimization: true, string_cache: false)
      ._df
  )
end

#set_sorted(column, *more_columns, descending: false) ⇒ DataFrame

Indicate that one or multiple columns are sorted.



4785
4786
4787
4788
4789
4790
4791
4792
4793
# File 'lib/polars/data_frame.rb', line 4785

def set_sorted(
  column,
  *more_columns,
  descending: false
)
  lazy
    .set_sorted(column, *more_columns, descending: descending)
    .collect(no_optimization: true)
end

#shapeArray

Get the shape of the DataFrame.

Examples:

df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
df.shape
# => [5, 1]


319
320
321
# File 'lib/polars/data_frame.rb', line 319

def shape
  _df.shape
end

#shift(n, fill_value: nil) ⇒ DataFrame

Shift values by the given period.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.shift(1)
# =>
# shape: (3, 3)
# ┌──────┬──────┬──────┐
# │ foo  ┆ bar  ┆ ham  │
# │ ---  ┆ ---  ┆ ---  │
# │ i64  ┆ i64  ┆ str  │
# ╞══════╪══════╪══════╡
# │ null ┆ null ┆ null │
# │ 1    ┆ 6    ┆ a    │
# │ 2    ┆ 7    ┆ b    │
# └──────┴──────┴──────┘
df.shift(-1)
# =>
# shape: (3, 3)
# ┌──────┬──────┬──────┐
# │ foo  ┆ bar  ┆ ham  │
# │ ---  ┆ ---  ┆ ---  │
# │ i64  ┆ i64  ┆ str  │
# ╞══════╪══════╪══════╡
# │ 2    ┆ 7    ┆ b    │
# │ 3    ┆ 8    ┆ c    │
# │ null ┆ null ┆ null │
# └──────┴──────┴──────┘


3498
3499
3500
# File 'lib/polars/data_frame.rb', line 3498

def shift(n, fill_value: nil)
  lazy.shift(n, fill_value: fill_value).collect(_eager: true)
end

#shift_and_fill(periods, fill_value) ⇒ DataFrame

Shift the values by a given period and fill the resulting null values.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.shift_and_fill(1, 0)
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 0   ┆ 0   ┆ 0   │
# │ 1   ┆ 6   ┆ a   │
# │ 2   ┆ 7   ┆ b   │
# └─────┴─────┴─────┘


3531
3532
3533
# File 'lib/polars/data_frame.rb', line 3531

def shift_and_fill(periods, fill_value)
  shift(periods, fill_value: fill_value)
end

#shrink_to_fit(in_place: false) ⇒ DataFrame

Shrink DataFrame memory usage.

Shrinks to fit the exact capacity needed to hold the data.



4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
# File 'lib/polars/data_frame.rb', line 4582

def shrink_to_fit(in_place: false)
  if in_place
    _df.shrink_to_fit
    self
  else
    df = clone
    df._df.shrink_to_fit
    df
  end
end

#slice(offset, length = nil) ⇒ DataFrame

Get a slice of this DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6.0, 7.0, 8.0],
    "ham" => ["a", "b", "c"]
  }
)
df.slice(1, 2)
# =>
# shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ f64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 2   ┆ 7.0 ┆ b   │
# │ 3   ┆ 8.0 ┆ c   │
# └─────┴─────┴─────┘


1597
1598
1599
1600
1601
1602
# File 'lib/polars/data_frame.rb', line 1597

def slice(offset, length = nil)
  if !length.nil? && length < 0
    length = height - offset + length
  end
  _from_rbdf(_df.slice(offset, length))
end

#sort(by, reverse: false, nulls_last: false) ⇒ DataFrame

Sort the DataFrame by column.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6.0, 7.0, 8.0],
    "ham" => ["a", "b", "c"]
  }
)
df.sort("foo", reverse: true)
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ f64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 3   ┆ 8.0 ┆ c   │
# │ 2   ┆ 7.0 ┆ b   │
# │ 1   ┆ 6.0 ┆ a   │
# └─────┴─────┴─────┘

Sort by multiple columns.

df.sort(
  [Polars.col("foo"), Polars.col("bar")**2],
  reverse: [true, false]
)
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ f64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 3   ┆ 8.0 ┆ c   │
# │ 2   ┆ 7.0 ┆ b   │
# │ 1   ┆ 6.0 ┆ a   │
# └─────┴─────┴─────┘


1486
1487
1488
1489
1490
# File 'lib/polars/data_frame.rb', line 1486

def sort(by, reverse: false, nulls_last: false)
  lazy
    .sort(by, reverse: reverse, nulls_last: nulls_last)
    .collect(no_optimization: true)
end

#sort!(by, reverse: false, nulls_last: false) ⇒ DataFrame

Sort the DataFrame by column in-place.



1502
1503
1504
# File 'lib/polars/data_frame.rb', line 1502

def sort!(by, reverse: false, nulls_last: false)
  self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
end

#std(ddof: 1) ⇒ DataFrame

Aggregate the columns of this DataFrame to their standard deviation value.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.std
# =>
# shape: (1, 3)
# ┌─────┬─────┬──────┐
# │ foo ┆ bar ┆ ham  │
# │ --- ┆ --- ┆ ---  │
# │ f64 ┆ f64 ┆ str  │
# ╞═════╪═════╪══════╡
# │ 1.0 ┆ 1.0 ┆ null │
# └─────┴─────┴──────┘
df.std(ddof: 0)
# =>
# shape: (1, 3)
# ┌──────────┬──────────┬──────┐
# │ foo      ┆ bar      ┆ ham  │
# │ ---      ┆ ---      ┆ ---  │
# │ f64      ┆ f64      ┆ str  │
# ╞══════════╪══════════╪══════╡
# │ 0.816497 ┆ 0.816497 ┆ null │
# └──────────┴──────────┴──────┘


3945
3946
3947
# File 'lib/polars/data_frame.rb', line 3945

def std(ddof: 1)
  lazy.std(ddof: ddof).collect(_eager: true)
end

#sum(axis: 0, null_strategy: "ignore") ⇒ DataFrame

Aggregate the columns of this DataFrame to their sum value.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"],
  }
)
df.sum
# =>
# shape: (1, 3)
# ┌─────┬─────┬──────┐
# │ foo ┆ bar ┆ ham  │
# │ --- ┆ --- ┆ ---  │
# │ i64 ┆ i64 ┆ str  │
# ╞═════╪═════╪══════╡
# │ 6   ┆ 21  ┆ null │
# └─────┴─────┴──────┘
df.sum(axis: 1)
# =>
# shape: (3,)
# Series: 'foo' [str]
# [
#         "16a"
#         "27b"
#         "38c"
# ]


3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
# File 'lib/polars/data_frame.rb', line 3859

def sum(axis: 0, null_strategy: "ignore")
  case axis
  when 0
    lazy.sum.collect(_eager: true)
  when 1
    Utils.wrap_s(_df.sum_horizontal(null_strategy))
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end

#tail(n = 5) ⇒ DataFrame

Get the last n rows.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3, 4, 5],
    "bar" => [6, 7, 8, 9, 10],
    "ham" => ["a", "b", "c", "d", "e"]
  }
)
df.tail(3)
# =>
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 3   ┆ 8   ┆ c   │
# │ 4   ┆ 9   ┆ d   │
# │ 5   ┆ 10  ┆ e   │
# └─────┴─────┴─────┘


1692
1693
1694
# File 'lib/polars/data_frame.rb', line 1692

def tail(n = 5)
  _from_rbdf(_df.tail(n))
end

#to_aArray

Returns an array representing the DataFrame



550
551
552
# File 'lib/polars/data_frame.rb', line 550

def to_a
  rows(named: true)
end

#to_csv(**options) ⇒ String

Write to comma-separated values (CSV) string.



962
963
964
# File 'lib/polars/data_frame.rb', line 962

def to_csv(**options)
  write_csv(**options)
end

#to_dummies(columns: nil, separator: "_", drop_first: false) ⇒ DataFrame

Get one hot encoded dummy variables.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2],
    "bar" => [3, 4],
    "ham" => ["a", "b"]
  }
)
df.to_dummies
# =>
# shape: (2, 6)
# ┌───────┬───────┬───────┬───────┬───────┬───────┐
# │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
# │ ---   ┆ ---   ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
# │ u8    ┆ u8    ┆ u8    ┆ u8    ┆ u8    ┆ u8    │
# ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
# │ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     │
# │ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     │
# └───────┴───────┴───────┴───────┴───────┴───────┘


4100
4101
4102
4103
4104
4105
# File 'lib/polars/data_frame.rb', line 4100

def to_dummies(columns: nil, separator: "_", drop_first: false)
  if columns.is_a?(::String)
    columns = [columns]
  end
  _from_rbdf(_df.to_dummies(columns, separator, drop_first))
end

#to_h(as_series: true) ⇒ Hash

Convert DataFrame to a hash mapping column name to values.



733
734
735
736
737
738
739
# File 'lib/polars/data_frame.rb', line 733

def to_h(as_series: true)
  if as_series
    get_columns.to_h { |s| [s.name, s] }
  else
    get_columns.to_h { |s| [s.name, s.to_a] }
  end
end

#to_hashesArray

Convert every row to a dictionary.

Note that this is slow.

Examples:

df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
df.to_hashes
# =>
# [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]


752
753
754
755
756
757
758
759
# File 'lib/polars/data_frame.rb', line 752

def to_hashes
  rbdf = _df
  names = columns

  height.times.map do |i|
    names.zip(rbdf.row_tuple(i)).to_h
  end
end

#to_numoNumo::NArray

Convert DataFrame to a 2D Numo array.

This operation clones data.

Examples:

df = Polars::DataFrame.new(
  {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
)
df.to_numo.class
# => Numo::RObject


773
774
775
776
777
778
779
780
# File 'lib/polars/data_frame.rb', line 773

def to_numo
  out = _df.to_numo
  if out.nil?
    Numo::NArray.vstack(width.times.map { |i| to_series(i).to_numo }).transpose
  else
    out
  end
end

#to_sString Also known as: inspect

Returns a string representing the DataFrame.



542
543
544
# File 'lib/polars/data_frame.rb', line 542

def to_s
  _df.to_s
end

#to_series(index = 0) ⇒ Series

Select column as Series at index location.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.to_series(1)
# =>
# shape: (3,)
# Series: 'bar' [i64]
# [
#         6
#         7
#         8
# ]


808
809
810
811
812
813
# File 'lib/polars/data_frame.rb', line 808

def to_series(index = 0)
  if index < 0
    index = columns.length + index
  end
  Utils.wrap_s(_df.select_at_idx(index))
end

#to_struct(name) ⇒ Series

Convert a DataFrame to a Series of type Struct.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3, 4, 5],
    "b" => ["one", "two", "three", "four", "five"]
  }
)
df.to_struct("nums")
# =>
# shape: (5,)
# Series: 'nums' [struct[2]]
# [
#         {1,"one"}
#         {2,"two"}
#         {3,"three"}
#         {4,"four"}
#         {5,"five"}
# ]


4724
4725
4726
# File 'lib/polars/data_frame.rb', line 4724

def to_struct(name)
  Utils.wrap_s(_df.to_struct(name))
end

#transpose(include_header: false, header_name: "column", column_names: nil) ⇒ DataFrame

Note:

This is a very expensive operation. Perhaps you can do it differently.

Transpose a DataFrame over the diagonal.

Examples:

df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
df.transpose(include_header: true)
# =>
# shape: (2, 4)
# ┌────────┬──────────┬──────────┬──────────┐
# │ column ┆ column_0 ┆ column_1 ┆ column_2 │
# │ ---    ┆ ---      ┆ ---      ┆ ---      │
# │ str    ┆ i64      ┆ i64      ┆ i64      │
# ╞════════╪══════════╪══════════╪══════════╡
# │ a      ┆ 1        ┆ 2        ┆ 3        │
# │ b      ┆ 1        ┆ 2        ┆ 3        │
# └────────┴──────────┴──────────┴──────────┘

Replace the auto-generated column names with a list

df.transpose(include_header: false, column_names: ["a", "b", "c"])
# =>
# shape: (2, 3)
# ┌─────┬─────┬─────┐
# │ a   ┆ b   ┆ c   │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 2   ┆ 3   │
# │ 1   ┆ 2   ┆ 3   │
# └─────┴─────┴─────┘

Include the header as a separate column

df.transpose(
  include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
)
# =>
# shape: (2, 4)
# ┌─────┬─────┬─────┬─────┐
# │ foo ┆ a   ┆ b   ┆ c   │
# │ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╪═════╡
# │ a   ┆ 1   ┆ 2   ┆ 3   │
# │ b   ┆ 1   ┆ 2   ┆ 3   │
# └─────┴─────┴─────┴─────┘


1150
1151
1152
1153
# File 'lib/polars/data_frame.rb', line 1150

def transpose(include_header: false, header_name: "column", column_names: nil)
  keep_names_as = include_header ? header_name : nil
  _from_rbdf(_df.transpose(keep_names_as, column_names))
end

#unique(maintain_order: true, subset: nil, keep: "first") ⇒ DataFrame

Note:

Note that this fails if there is a column of type List in the DataFrame or subset.

Drop duplicate rows from this DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 1, 2, 3, 4, 5],
    "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
    "c" => [true, true, true, false, true, true]
  }
)
df.unique
# =>
# shape: (5, 3)
# ┌─────┬─────┬───────┐
# │ a   ┆ b   ┆ c     │
# │ --- ┆ --- ┆ ---   │
# │ i64 ┆ f64 ┆ bool  │
# ╞═════╪═════╪═══════╡
# │ 1   ┆ 0.5 ┆ true  │
# │ 2   ┆ 1.0 ┆ true  │
# │ 3   ┆ 2.0 ┆ false │
# │ 4   ┆ 3.0 ┆ true  │
# │ 5   ┆ 3.0 ┆ true  │
# └─────┴─────┴───────┘


4145
4146
4147
4148
4149
4150
4151
4152
# File 'lib/polars/data_frame.rb', line 4145

def unique(maintain_order: true, subset: nil, keep: "first")
  self._from_rbdf(
    lazy
      .unique(maintain_order: maintain_order, subset: subset, keep: keep)
      .collect(no_optimization: true)
      ._df
  )
end

#unnest(names) ⇒ DataFrame

Decompose a struct into its fields.

The fields will be inserted into the DataFrame on the location of the struct type.

Examples:

df = Polars::DataFrame.new(
  {
    "before" => ["foo", "bar"],
    "t_a" => [1, 2],
    "t_b" => ["a", "b"],
    "t_c" => [true, nil],
    "t_d" => [[1, 2], [3]],
    "after" => ["baz", "womp"]
  }
).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"])
df.unnest("t_struct")
# =>
# shape: (2, 6)
# ┌────────┬─────┬─────┬──────┬───────────┬───────┐
# │ before ┆ t_a ┆ t_b ┆ t_c  ┆ t_d       ┆ after │
# │ ---    ┆ --- ┆ --- ┆ ---  ┆ ---       ┆ ---   │
# │ str    ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str   │
# ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
# │ foo    ┆ 1   ┆ a   ┆ true ┆ [1, 2]    ┆ baz   │
# │ bar    ┆ 2   ┆ b   ┆ null ┆ [3]       ┆ womp  │
# └────────┴─────┴─────┴──────┴───────────┴───────┘


4760
4761
4762
4763
4764
4765
# File 'lib/polars/data_frame.rb', line 4760

def unnest(names)
  if names.is_a?(::String)
    names = [names]
  end
  _from_rbdf(_df.unnest(names))
end

#unstack(step:, how: "vertical", columns: nil, fill_values: nil) ⇒ DataFrame

Note:

This functionality is experimental and may be subject to changes without it being considered a breaking change.

Unstack a long table to a wide form without doing an aggregation.

This can be much faster than a pivot, because it can skip the grouping phase.

Examples:

df = Polars::DataFrame.new(
  {
    "col1" => "A".."I",
    "col2" => Polars.arange(0, 9, eager: true)
  }
)
# =>
# shape: (9, 2)
# ┌──────┬──────┐
# │ col1 ┆ col2 │
# │ ---  ┆ ---  │
# │ str  ┆ i64  │
# ╞══════╪══════╡
# │ A    ┆ 0    │
# │ B    ┆ 1    │
# │ C    ┆ 2    │
# │ D    ┆ 3    │
# │ E    ┆ 4    │
# │ F    ┆ 5    │
# │ G    ┆ 6    │
# │ H    ┆ 7    │
# │ I    ┆ 8    │
# └──────┴──────┘
df.unstack(step: 3, how: "vertical")
# =>
# shape: (3, 6)
# ┌────────┬────────┬────────┬────────┬────────┬────────┐
# │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
# │ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    │
# │ str    ┆ str    ┆ str    ┆ i64    ┆ i64    ┆ i64    │
# ╞════════╪════════╪════════╪════════╪════════╪════════╡
# │ A      ┆ D      ┆ G      ┆ 0      ┆ 3      ┆ 6      │
# │ B      ┆ E      ┆ H      ┆ 1      ┆ 4      ┆ 7      │
# │ C      ┆ F      ┆ I      ┆ 2      ┆ 5      ┆ 8      │
# └────────┴────────┴────────┴────────┴────────┴────────┘
df.unstack(step: 3, how: "horizontal")
# =>
# shape: (3, 6)
# ┌────────┬────────┬────────┬────────┬────────┬────────┐
# │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
# │ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    │
# │ str    ┆ str    ┆ str    ┆ i64    ┆ i64    ┆ i64    │
# ╞════════╪════════╪════════╪════════╪════════╪════════╡
# │ A      ┆ B      ┆ C      ┆ 0      ┆ 1      ┆ 2      │
# │ D      ┆ E      ┆ F      ┆ 3      ┆ 4      ┆ 5      │
# │ G      ┆ H      ┆ I      ┆ 6      ┆ 7      ┆ 8      │
# └────────┴────────┴────────┴────────┴────────┴────────┘


3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
# File 'lib/polars/data_frame.rb', line 3301

def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
  if !columns.nil?
    df = select(columns)
  else
    df = self
  end

  height = df.height
  if how == "vertical"
    n_rows = step
    n_cols = (height / n_rows.to_f).ceil
  else
    n_cols = step
    n_rows = (height / n_cols.to_f).ceil
  end

  n_fill = n_cols * n_rows - height

  if n_fill > 0
    if !fill_values.is_a?(::Array)
      fill_values = [fill_values] * df.width
    end

    df = df.select(
      df.get_columns.zip(fill_values).map do |s, next_fill|
        s.extend_constant(next_fill, n_fill)
      end
    )
  end

  if how == "horizontal"
    df = (
      df.with_column(
        (Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
          "__sort_order"
        )
      )
      .sort("__sort_order")
      .drop("__sort_order")
    )
  end

  zfill_val = Math.log10(n_cols).floor + 1
  slices =
    df.get_columns.flat_map do |s|
      n_cols.times.map do |slice_nbr|
        s.slice(slice_nbr * n_rows, n_rows).alias("%s_%0#{zfill_val}d" % [s.name, slice_nbr])
      end
    end

  _from_rbdf(DataFrame.new(slices)._df)
end

#upsample(time_column:, every:, offset: nil, by: nil, maintain_order: false) ⇒ DataFrame

Upsample a DataFrame at a regular frequency.

The every and offset arguments are created with the following string language:

  • 1ns (1 nanosecond)
  • 1us (1 microsecond)
  • 1ms (1 millisecond)
  • 1s (1 second)
  • 1m (1 minute)
  • 1h (1 hour)
  • 1d (1 day)
  • 1w (1 week)
  • 1mo (1 calendar month)
  • 1y (1 calendar year)
  • 1i (1 index count)

Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds

Examples:

Upsample a DataFrame by a certain interval.

df = Polars::DataFrame.new(
  {
    "time" => [
      DateTime.new(2021, 2, 1),
      DateTime.new(2021, 4, 1),
      DateTime.new(2021, 5, 1),
      DateTime.new(2021, 6, 1)
    ],
    "groups" => ["A", "B", "A", "B"],
    "values" => [0, 1, 2, 3]
  }
).set_sorted("time")
df.upsample(
  time_column: "time", every: "1mo", by: "groups", maintain_order: true
).select(Polars.all.forward_fill)
# =>
# shape: (7, 3)
# ┌─────────────────────┬────────┬────────┐
# │ time                ┆ groups ┆ values │
# │ ---                 ┆ ---    ┆ ---    │
# │ datetime[ns]        ┆ str    ┆ i64    │
# ╞═════════════════════╪════════╪════════╡
# │ 2021-02-01 00:00:00 ┆ A      ┆ 0      │
# │ 2021-03-01 00:00:00 ┆ A      ┆ 0      │
# │ 2021-04-01 00:00:00 ┆ A      ┆ 0      │
# │ 2021-05-01 00:00:00 ┆ A      ┆ 2      │
# │ 2021-04-01 00:00:00 ┆ B      ┆ 1      │
# │ 2021-05-01 00:00:00 ┆ B      ┆ 1      │
# │ 2021-06-01 00:00:00 ┆ B      ┆ 3      │
# └─────────────────────┴────────┴────────┘


2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
# File 'lib/polars/data_frame.rb', line 2264

def upsample(
  time_column:,
  every:,
  offset: nil,
  by: nil,
  maintain_order: false
)
  if by.nil?
    by = []
  end
  if by.is_a?(::String)
    by = [by]
  end
  if offset.nil?
    offset = "0ns"
  end

  every = Utils._timedelta_to_pl_duration(every)
  offset = Utils._timedelta_to_pl_duration(offset)

  _from_rbdf(
    _df.upsample(by, time_column, every, offset, maintain_order)
  )
end

#var(ddof: 1) ⇒ DataFrame

Aggregate the columns of this DataFrame to their variance value.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3],
    "bar" => [6, 7, 8],
    "ham" => ["a", "b", "c"]
  }
)
df.var
# =>
# shape: (1, 3)
# ┌─────┬─────┬──────┐
# │ foo ┆ bar ┆ ham  │
# │ --- ┆ --- ┆ ---  │
# │ f64 ┆ f64 ┆ str  │
# ╞═════╪═════╪══════╡
# │ 1.0 ┆ 1.0 ┆ null │
# └─────┴─────┴──────┘
df.var(ddof: 0)
# =>
# shape: (1, 3)
# ┌──────────┬──────────┬──────┐
# │ foo      ┆ bar      ┆ ham  │
# │ ---      ┆ ---      ┆ ---  │
# │ f64      ┆ f64      ┆ str  │
# ╞══════════╪══════════╪══════╡
# │ 0.666667 ┆ 0.666667 ┆ null │
# └──────────┴──────────┴──────┘


3986
3987
3988
# File 'lib/polars/data_frame.rb', line 3986

def var(ddof: 1)
  lazy.var(ddof: ddof).collect(_eager: true)
end

#vstack(df, in_place: false) ⇒ DataFrame

Grow this DataFrame vertically by stacking a DataFrame to it.

Examples:

df1 = Polars::DataFrame.new(
  {
    "foo" => [1, 2],
    "bar" => [6, 7],
    "ham" => ["a", "b"]
  }
)
df2 = Polars::DataFrame.new(
  {
    "foo" => [3, 4],
    "bar" => [8, 9],
    "ham" => ["c", "d"]
  }
)
df1.vstack(df2)
# =>
# shape: (4, 3)
# ┌─────┬─────┬─────┐
# │ foo ┆ bar ┆ ham │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ str │
# ╞═════╪═════╪═════╡
# │ 1   ┆ 6   ┆ a   │
# │ 2   ┆ 7   ┆ b   │
# │ 3   ┆ 8   ┆ c   │
# │ 4   ┆ 9   ┆ d   │
# └─────┴─────┴─────┘


2725
2726
2727
2728
2729
2730
2731
2732
# File 'lib/polars/data_frame.rb', line 2725

def vstack(df, in_place: false)
  if in_place
    _df.vstack_mut(df._df)
    self
  else
    _from_rbdf(_df.vstack(df._df))
  end
end

#widthInteger

Get the width of the DataFrame.

Examples:

df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
df.width
# => 1


346
347
348
# File 'lib/polars/data_frame.rb', line 346

def width
  _df.width
end

#with_column(column) ⇒ DataFrame

Return a new DataFrame with the column added or replaced.

Examples:

Added

df = Polars::DataFrame.new(
  {
    "a" => [1, 3, 5],
    "b" => [2, 4, 6]
  }
)
df.with_column((Polars.col("b") ** 2).alias("b_squared"))
# =>
# shape: (3, 3)
# ┌─────┬─────┬───────────┐
# │ a   ┆ b   ┆ b_squared │
# │ --- ┆ --- ┆ ---       │
# │ i64 ┆ i64 ┆ f64       │
# ╞═════╪═════╪═══════════╡
# │ 1   ┆ 2   ┆ 4.0       │
# │ 3   ┆ 4   ┆ 16.0      │
# │ 5   ┆ 6   ┆ 36.0      │
# └─────┴─────┴───────────┘

Replaced

df.with_column(Polars.col("a") ** 2)
# =>
# shape: (3, 2)
# ┌──────┬─────┐
# │ a    ┆ b   │
# │ ---  ┆ --- │
# │ f64  ┆ i64 │
# ╞══════╪═════╡
# │ 1.0  ┆ 2   │
# │ 9.0  ┆ 4   │
# │ 25.0 ┆ 6   │
# └──────┴─────┘


2640
2641
2642
2643
2644
# File 'lib/polars/data_frame.rb', line 2640

def with_column(column)
  lazy
    .with_column(column)
    .collect(no_optimization: true, string_cache: false)
end

#with_columns(exprs) ⇒ DataFrame

Add or overwrite multiple columns in a DataFrame.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 2, 3, 4],
    "b" => [0.5, 4, 10, 13],
    "c" => [true, true, false, true]
  }
)
df.with_columns(
  [
    (Polars.col("a") ** 2).alias("a^2"),
    (Polars.col("b") / 2).alias("b/2"),
    (Polars.col("c").is_not).alias("not c")
  ]
)
# =>
# shape: (4, 6)
# ┌─────┬──────┬───────┬──────┬──────┬───────┐
# │ a   ┆ b    ┆ c     ┆ a^2  ┆ b/2  ┆ not c │
# │ --- ┆ ---  ┆ ---   ┆ ---  ┆ ---  ┆ ---   │
# │ i64 ┆ f64  ┆ bool  ┆ f64  ┆ f64  ┆ bool  │
# ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
# │ 1   ┆ 0.5  ┆ true  ┆ 1.0  ┆ 0.25 ┆ false │
# │ 2   ┆ 4.0  ┆ true  ┆ 4.0  ┆ 2.0  ┆ false │
# │ 3   ┆ 10.0 ┆ false ┆ 9.0  ┆ 5.0  ┆ true  │
# │ 4   ┆ 13.0 ┆ true  ┆ 16.0 ┆ 6.5  ┆ false │
# └─────┴──────┴───────┴──────┴──────┴───────┘


3718
3719
3720
3721
3722
3723
3724
3725
# File 'lib/polars/data_frame.rb', line 3718

def with_columns(exprs)
  if !exprs.nil? && !exprs.is_a?(::Array)
    exprs = [exprs]
  end
  lazy
    .with_columns(exprs)
    .collect(no_optimization: true, string_cache: false)
end

#with_row_count(name: "row_nr", offset: 0) ⇒ DataFrame

Add a column at index 0 that counts the rows.

Examples:

df = Polars::DataFrame.new(
  {
    "a" => [1, 3, 5],
    "b" => [2, 4, 6]
  }
)
df.with_row_count
# =>
# shape: (3, 3)
# ┌────────┬─────┬─────┐
# │ row_nr ┆ a   ┆ b   │
# │ ---    ┆ --- ┆ --- │
# │ u32    ┆ i64 ┆ i64 │
# ╞════════╪═════╪═════╡
# │ 0      ┆ 1   ┆ 2   │
# │ 1      ┆ 3   ┆ 4   │
# │ 2      ┆ 5   ┆ 6   │
# └────────┴─────┴─────┘


1797
1798
1799
# File 'lib/polars/data_frame.rb', line 1797

def with_row_count(name: "row_nr", offset: 0)
  _from_rbdf(_df.with_row_count(name, offset))
end

#write_avro(file, compression = "uncompressed") ⇒ nil

Write to Apache Avro file.



974
975
976
977
978
979
980
981
982
983
# File 'lib/polars/data_frame.rb', line 974

def write_avro(file, compression = "uncompressed")
  if compression.nil?
    compression = "uncompressed"
  end
  if Utils.pathlike?(file)
    file = Utils.normalise_filepath(file)
  end

  _df.write_avro(file, compression)
end

#write_csv(file = nil, has_header: true, include_header: nil, sep: ",", quote: '"', batch_size: 1024, datetime_format: nil, date_format: nil, time_format: nil, float_precision: nil, null_value: nil) ⇒ String?

Write to comma-separated values (CSV) file.

Examples:

df = Polars::DataFrame.new(
  {
    "foo" => [1, 2, 3, 4, 5],
    "bar" => [6, 7, 8, 9, 10],
    "ham" => ["a", "b", "c", "d", "e"]
  }
)
df.write_csv("file.csv")


899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
# File 'lib/polars/data_frame.rb', line 899

def write_csv(
  file = nil,
  has_header: true,
  include_header: nil,
  sep: ",",
  quote: '"',
  batch_size: 1024,
  datetime_format: nil,
  date_format: nil,
  time_format: nil,
  float_precision: nil,
  null_value: nil
)
  include_header = has_header if include_header.nil?

  if sep.length > 1
    raise ArgumentError, "only single byte separator is allowed"
  elsif quote.length > 1
    raise ArgumentError, "only single byte quote char is allowed"
  elsif null_value == ""
    null_value = nil
  end

  if file.nil?
    buffer = StringIO.new
    buffer.set_encoding(Encoding::BINARY)
    _df.write_csv(
      buffer,
      include_header,
      sep.ord,
      quote.ord,
      batch_size,
      datetime_format,
      date_format,
      time_format,
      float_precision,
      null_value
    )
    return buffer.string.force_encoding(Encoding::UTF_8)
  end

  if Utils.pathlike?(file)
    file = Utils.normalise_filepath(file)
  end

  _df.write_csv(
    file,
    include_header,
    sep.ord,
    quote.ord,
    batch_size,
    datetime_format,
    date_format,
    time_format,
    float_precision,
    null_value,
  )
  nil
end

#write_ipc(file, compression: "uncompressed") ⇒ nil

Write to Arrow IPC binary stream or Feather file.



993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
# File 'lib/polars/data_frame.rb', line 993

def write_ipc(file, compression: "uncompressed")
  return_bytes = file.nil?
  if return_bytes
    file = StringIO.new
    file.set_encoding(Encoding::BINARY)
  end
  if Utils.pathlike?(file)
    file = Utils.normalise_filepath(file)
  end

  if compression.nil?
    compression = "uncompressed"
  end

  _df.write_ipc(file, compression)
  return_bytes ? file.string : nil
end

#write_json(file, pretty: false, row_oriented: false) ⇒ nil

Serialize to JSON representation.

See Also:



827
828
829
830
831
832
833
834
835
836
837
838
# File 'lib/polars/data_frame.rb', line 827

def write_json(
  file,
  pretty: false,
  row_oriented: false
)
  if Utils.pathlike?(file)
    file = Utils.normalise_filepath(file)
  end

  _df.write_json(file, pretty, row_oriented)
  nil
end

#write_ndjson(file) ⇒ nil

Serialize to newline delimited JSON representation.



846
847
848
849
850
851
852
853
# File 'lib/polars/data_frame.rb', line 846

def write_ndjson(file)
  if Utils.pathlike?(file)
    file = Utils.normalise_filepath(file)
  end

  _df.write_ndjson(file)
  nil
end

#write_parquet(file, compression: "zstd", compression_level: nil, statistics: false, row_group_size: nil) ⇒ nil

Write to Apache Parquet file.



1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
# File 'lib/polars/data_frame.rb', line 1036

def write_parquet(
  file,
  compression: "zstd",
  compression_level: nil,
  statistics: false,
  row_group_size: nil
)
  if compression.nil?
    compression = "uncompressed"
  end
  if Utils.pathlike?(file)
    file = Utils.normalise_filepath(file)
  end

  _df.write_parquet(
    file, compression, compression_level, statistics, row_group_size
  )
end