Class: NanoGPT::Layers::CausalSelfAttention

Inherits:
Torch::NN::Module
  • Object
Defined in:
lib/nano_gpt/layers/causal_self_attention.rb

Overview

Multi-head causal self-attention: each position attends only to itself and earlier positions in the sequence.
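
For reference, each head applies scaled dot-product attention with a lower-triangular (causal) mask, matching the manual path in #forward (scale = 1 / sqrt(head_size), masked positions filled with negative infinity before the softmax):

  \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_{\text{head}}}} + M\right) V,
  \qquad
  M_{ij} = \begin{cases} 0 & j \le i \\ -\infty & j > i \end{cases}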

Instance Method Summary

  • #initialize(config) ⇒ CausalSelfAttention  (constructor)
  • #forward(x) ⇒ Object

Constructor Details

#initialize(config) ⇒ CausalSelfAttention

Returns a new instance of CausalSelfAttention.

Raises:

  • (ArgumentError)


# File 'lib/nano_gpt/layers/causal_self_attention.rb', line 7

def initialize(config)
  super()
  raise ArgumentError, "n_embd must be divisible by n_head" unless (config.n_embd % config.n_head).zero?

  @n_head = config.n_head
  @n_embd = config.n_embd
  @head_size = config.n_embd / config.n_head
  @dropout_p = config.dropout

  # Key, query, value projections for all heads, combined
  @c_attn = Torch::NN::Linear.new(config.n_embd, 3 * config.n_embd, bias: config.bias)
  # Output projection
  @c_proj = Torch::NN::Linear.new(config.n_embd, config.n_embd, bias: config.bias)
  # Regularization
  @attn_dropout = Torch::NN::Dropout.new(p: config.dropout)
  @resid_dropout = Torch::NN::Dropout.new(p: config.dropout)

  # Use native scaled_dot_product_attention with is_causal=true when dropout is 0
  # Native SDPA is ~5x faster but doesn't support dropout with is_causal mode
  @flash = config.dropout == 0.0

  # Causal mask for manual attention (only used when @flash is false)
  unless @flash
    mask = Torch.tril(Torch.ones(config.block_size, config.block_size))
    register_buffer("mask", mask.view(1, 1, config.block_size, config.block_size))
  end
end
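
A minimal construction sketch, assuming the library is loaded and using a hypothetical AttnConfig struct in place of the project's real config class (only the readers used by #initialize are provided; the require path and field values are illustrative assumptions):

  require "nano_gpt"   # assumed entry point for the library

  # Hypothetical stand-in for the project's config object
  AttnConfig = Struct.new(:n_embd, :n_head, :dropout, :bias, :block_size, keyword_init: true)

  config = AttnConfig.new(
    n_embd: 128,       # must be divisible by n_head, or ArgumentError is raised
    n_head: 4,
    dropout: 0.0,      # 0.0 selects the native SDPA ("flash") path, so no mask buffer is built
    bias: true,
    block_size: 64     # only used to build the causal mask when dropout > 0
  )

  attn = NanoGPT::Layers::CausalSelfAttention.new(config)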

Instance Method Details

#forward(x) ⇒ Object



# File 'lib/nano_gpt/layers/causal_self_attention.rb', line 35

def forward(x)
  b, t, c = x.shape

  # Calculate Q, K, V
  qkv = @c_attn.call(x)
  q, k, v = qkv.split(@n_embd, 2)

  # Reshape: (B, T, C) -> (B, nh, T, hs)
  q = q.view(b, t, @n_head, @head_size).transpose(1, 2)
  k = k.view(b, t, @n_head, @head_size).transpose(1, 2)
  v = v.view(b, t, @n_head, @head_size).transpose(1, 2)

  y = if @flash
        # Native scaled_dot_product_attention with is_causal=true
        # Uses Flash Attention on CUDA, optimized kernel on MPS
        Torch::NN.scaled_dot_product_attention(q, k, v, nil, 0.0, true)
      else
        # Manual attention implementation with causal mask
        scale = 1.0 / Math.sqrt(@head_size)
        att = q.matmul(k.transpose(-2, -1))
        att.mul!(scale)

        # Apply causal mask - slice mask to current sequence length
        mask_slice = @mask.narrow(2, 0, t).narrow(3, 0, t)
        att.masked_fill!(mask_slice.eq(0), -Float::INFINITY)
        att = Torch::NN::Functional.softmax(att, dim: -1)
        att = @attn_dropout.call(att)
        att.matmul(v)
      end

  # Reassemble heads: (B, nh, T, hs) -> (B, T, C)
  y = y.transpose(1, 2).contiguous.view(b, t, c)

  # Output projection
  @resid_dropout.call(@c_proj.call(y))
end
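
Continuing the sketch above, a forward pass preserves the input shape (B, T, C); calling the module via Torch::NN::Module#call dispatches to #forward:

  x = Torch.randn(2, 16, 128)   # (batch, sequence length, n_embd)
  y = attn.call(x)              # Module#call invokes #forward
  y.shape                       # => [2, 16, 128], same shape as the input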