Class: NanoGPT::DataLoader

Inherits:
Object
  • Object
show all
Defined in:
lib/nano_gpt/data_loader.rb

Overview

Loads batches from binary token files. Memory-efficient: reads from the file on each batch (like Python’s memmap recreation).

Constant Summary collapse

BYTES_PER_TOKEN =

uint16

2

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data_dir:, block_size:, batch_size:, device: "cpu") ⇒ DataLoader

Returns a new instance of DataLoader.



11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/nano_gpt/data_loader.rb', line 11

# Builds a loader over pre-tokenized train/val binaries in +data_dir+.
#
# Only file paths and token counts are recorded here — the token data
# itself is never held in memory; batches are read lazily from disk.
#
# @param data_dir [String] directory containing train.bin and val.bin
# @param block_size [Integer] tokens per sequence
# @param batch_size [Integer] sequences per batch
# @param device [String] target device for tensors (default "cpu")
def initialize(data_dir:, block_size:, batch_size:, device: "cpu")
  @data_dir   = data_dir
  @block_size = block_size
  @batch_size = batch_size
  @device     = device

  # Record each split's path and its length in tokens (NOT the data itself).
  %i[train val].each do |split|
    path = File.join(data_dir, "#{split}.bin")
    instance_variable_set(:"@#{split}_path", path)
    instance_variable_set(:"@#{split}_size", File.size(path) / BYTES_PER_TOKEN)
  end
end

Instance Attribute Details

#batch_size ⇒ Object (readonly)

Returns the value of attribute batch_size.



7
8
9
# File 'lib/nano_gpt/data_loader.rb', line 7

# Number of sequences drawn per batch.
# @return [Object] the configured batch size
def batch_size = @batch_size

#block_size ⇒ Object (readonly)

Returns the value of attribute block_size.



7
8
9
# File 'lib/nano_gpt/data_loader.rb', line 7

# Number of tokens per sequence (context length).
# @return [Object] the configured block size
def block_size = @block_size

Instance Method Details

#get_batch(split) ⇒ Object

Get a batch of data. Memory-efficient: recreates the data view per batch to avoid a memory leak (matches Python’s memmap recreation pattern).



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/nano_gpt/data_loader.rb', line 36

# Get one batch of (input, target) tensors for the given split.
#
# Memory-efficient: only the bytes needed for this batch are read from
# disk — the token file is never loaded wholesale (mirrors Python's
# per-batch memmap recreation).
#
# @param split [Symbol, String] :train or :val; string forms are also
#   accepted (previously the string "train" silently fell through to val)
# @return [Array(Torch::Tensor, Torch::Tensor)] x and y, each shaped
#   (batch_size, block_size), dtype long, moved to @device
# @raise [ArgumentError] if the split's file holds too few tokens
def get_batch(split)
  train = split.to_s == "train"
  path = train ? @train_path : @val_path
  data_size = train ? @train_size : @val_size

  # Highest valid start: each sample needs block_size + 1 tokens
  # (block_size for x, plus one extra for the shifted targets y).
  max_start = data_size - @block_size - 1
  if max_start.negative?
    raise ArgumentError,
          "#{path} holds only #{data_size} tokens; need more than #{@block_size + 1}"
  end

  indices = Array.new(@batch_size) { rand(0..max_start) }

  x_arrays = []
  y_arrays = []

  # One open per batch, one seek+read per sample — never the whole file.
  File.open(path, "rb") do |f|
    indices.each do |i|
      f.seek(i * BYTES_PER_TOKEN)
      # Read block_size + 1 tokens; x = tokens[0...block_size],
      # y = tokens[1..block_size] (x shifted right by one).
      bytes = f.read((@block_size + 1) * BYTES_PER_TOKEN)
      tokens = bytes.unpack("S<*") # uint16 little-endian

      x_arrays << tokens[0...@block_size]
      y_arrays << tokens[1..@block_size]
    end
  end

  # Build long tensors directly from the nested arrays (no Numo intermediate).
  x = Torch.tensor(x_arrays, dtype: :long)
  y = Torch.tensor(y_arrays, dtype: :long)

  # Move to accelerator (CUDA or MPS) only when requested.
  if @device != "cpu"
    x = x.to(@device)
    y = y.to(@device)
  end

  [x, y]
end

#train_size ⇒ Object



25
26
27
# File 'lib/nano_gpt/data_loader.rb', line 25

# Token count of train.bin (file bytes / BYTES_PER_TOKEN, computed at init).
# @return [Object] number of tokens in the training split
def train_size = @train_size

#val_size ⇒ Object



29
30
31
# File 'lib/nano_gpt/data_loader.rb', line 29

# Token count of val.bin (file bytes / BYTES_PER_TOKEN, computed at init).
# @return [Object] number of tokens in the validation split
def val_size = @val_size