Class: GPT2LM

Inherits:

Object

Object
GPT2LM

show all

Defined in: lib/toy/models/gpt2.rb

Instance Attribute Summary collapse

#context_length ⇒ Object

Returns the value of attribute context_length.
#d_ff ⇒ Object

Returns the value of attribute d_ff.
#d_head ⇒ Object

Returns the value of attribute d_head.
#d_model ⇒ Object

Returns the value of attribute d_model.
#gpt2_blocks ⇒ Object

Returns the value of attribute gpt2_blocks.
#ln_eps ⇒ Object

Returns the value of attribute ln_eps.
#ln_f_beta ⇒ Object

Returns the value of attribute ln_f_beta.
#ln_f_gamma ⇒ Object

Returns the value of attribute ln_f_gamma.
#n_heads ⇒ Object

Returns the value of attribute n_heads.
#n_layers ⇒ Object

Returns the value of attribute n_layers.
#pos_embed ⇒ Object

Returns the value of attribute pos_embed.
#token_embed ⇒ Object

Returns the value of attribute token_embed.
#vocab_size ⇒ Object

Returns the value of attribute vocab_size.

Instance Method Summary collapse

#add_bias!(x, bias) ⇒ Object

Broadcast-add a length-d row bias to every row of x, in-place.
#apply_causal_mask!(scores) ⇒ Object

Causal mask: for each row i, set scores[i, j] = -1e30 for j > i.
#embed(token_ids, start_pos) ⇒ Object

x[i, :] = token_embed[token_ids[i]] + pos_embed[start_pos + i].
#feed_forward(h, block) ⇒ Object

FFN: gelu_new(h · W_ff1 + b_ff1) · W_ff2 + b_ff2.
#forward(token_ids, start_pos) ⇒ Object

embed -> N blocks -> final LN -> tied unembed -> logits[T, vocab].
#hstack_heads(per_head) ⇒ Object

n_heads × (T × d_head) → (T × d_model), packing each head’s columns back into the contiguous block of width d_head at offset h*d_head.
#initialize(vocab_size, d_model, d_ff, n_heads, n_layers, context_length) ⇒ GPT2LM constructor

A new instance of GPT2LM.
#layer_norm(x, gamma, beta) ⇒ Object

LayerNorm: y_j = (x_j - mean) / sqrt(var + eps) * gamma_j + beta_j per row.
#self_attention(h_in, block) ⇒ Object
#self_attention_head(h_in, block, head_idx, inv_sqrt) ⇒ Object
#softmax_rows!(m) ⇒ Object

Row-wise softmax with numerical-stability max-shift, in place on m.
#transformer_block(x, block) ⇒ Object

One transformer block: pre-LN → MHA → residual → pre-LN → FFN → residual.

Constructor Details

#initialize(vocab_size, d_model, d_ff, n_heads, n_layers, context_length) ⇒ `GPT2LM`

Returns a new instance of GPT2LM.

# File 'lib/toy/models/gpt2.rb', line 69

# Allocates the embedding tables, the final LayerNorm parameters and the
# list of transformer blocks for a model with the given dimensions.
#
# @param vocab_size [Integer] number of rows in the token embedding table
# @param d_model [Integer] number of columns in both embedding tables
# @param d_ff [Integer] feed-forward width, passed through to each GPT2Block
# @param n_heads [Integer] attention heads per block; must divide d_model
# @param n_layers [Integer] number of transformer blocks
# @param context_length [Integer] number of rows in the positional table
# @raise [ArgumentError] if n_heads is not positive or does not divide d_model
def initialize(vocab_size, d_model, d_ff, n_heads, n_layers, context_length)
  # d_head is computed with integer division; without this check a
  # non-divisible d_model would silently truncate d_head, and packing the
  # heads back together would not cover all d_model columns.
  if n_heads <= 0 || d_model % n_heads != 0
    raise ArgumentError,
          "n_heads (#{n_heads}) must be positive and divide d_model (#{d_model})"
  end

  @vocab_size     = vocab_size
  @d_model        = d_model
  @d_ff           = d_ff
  @n_heads        = n_heads
  @d_head         = d_model / n_heads
  @n_layers       = n_layers
  @context_length = context_length
  @ln_eps         = RMS_EPS_DEFAULT  # LayerNorm epsilon; same value

  @token_embed = Mat.new(vocab_size, d_model)
  @pos_embed   = Mat.new(context_length, d_model)
  # Final LayerNorm starts as the identity affine map: scale 1, shift 0.
  @ln_f_gamma  = Array.new(d_model, 1.0)
  @ln_f_beta   = Array.new(d_model, 0.0)

  # The list is seeded with one block and grown with push, so one block is
  # allocated even when n_layers < 1.
  @gpt2_blocks = [GPT2Block.new(d_model, @d_head, d_ff, n_heads)]
  li = 1
  while li < n_layers
    @gpt2_blocks.push(GPT2Block.new(d_model, @d_head, d_ff, n_heads))
    li += 1
  end
end

Instance Attribute Details

#context_length ⇒ `Object`

Returns the value of attribute context_length.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @context_length instance variable.
def context_length
  return @context_length
end

#d_ff ⇒ `Object`

Returns the value of attribute d_ff.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @d_ff instance variable.
def d_ff
  return @d_ff
end

#d_head ⇒ `Object`

Returns the value of attribute d_head.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @d_head instance variable.
def d_head
  return @d_head
end

#d_model ⇒ `Object`

Returns the value of attribute d_model.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @d_model instance variable.
def d_model
  return @d_model
end

#gpt2_blocks ⇒ `Object`

Returns the value of attribute gpt2_blocks.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @gpt2_blocks instance variable.
def gpt2_blocks
  return @gpt2_blocks
end

#ln_eps ⇒ `Object`

Returns the value of attribute ln_eps.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @ln_eps instance variable.
def ln_eps
  return @ln_eps
end

#ln_f_beta ⇒ `Object`

Returns the value of attribute ln_f_beta.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @ln_f_beta instance variable.
def ln_f_beta
  return @ln_f_beta
end

#ln_f_gamma ⇒ `Object`

Returns the value of attribute ln_f_gamma.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @ln_f_gamma instance variable.
def ln_f_gamma
  return @ln_f_gamma
end

#n_heads ⇒ `Object`

Returns the value of attribute n_heads.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @n_heads instance variable.
def n_heads
  return @n_heads
end

#n_layers ⇒ `Object`

Returns the value of attribute n_layers.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @n_layers instance variable.
def n_layers
  return @n_layers
end

#pos_embed ⇒ `Object`

Returns the value of attribute pos_embed.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @pos_embed instance variable.
def pos_embed
  return @pos_embed
end

#token_embed ⇒ `Object`

Returns the value of attribute token_embed.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @token_embed instance variable.
def token_embed
  return @token_embed
end

#vocab_size ⇒ `Object`

Returns the value of attribute vocab_size.



64
65
66

# File 'lib/toy/models/gpt2.rb', line 64

# Reader for the @vocab_size instance variable.
def vocab_size
  return @vocab_size
end

Instance Method Details

#add_bias!(x, bias) ⇒ `Object`

Broadcast-add a length-d row bias to every row of x, in-place.

# File 'lib/toy/models/gpt2.rb', line 146

# Adds bias[col] to every element of column col of x, mutating x.flat
# (row-major storage, x.ncols elements per row).
#
# @param x [#nrows, #ncols, #flat] matrix to update in place
# @param bias [#[]] per-column offsets, indexed 0...x.ncols
# @return [nil]
def add_bias!(x, bias)
  width = x.ncols
  cells = x.flat
  x.nrows.times do |row|
    offset = row * width
    width.times do |col|
      cells[offset + col] += bias[col]
    end
  end
  nil
end

#apply_causal_mask!(scores) ⇒ `Object`

Causal mask: for each row i, set scores[i, j] = -1e30 for j > i.

# File 'lib/toy/models/gpt2.rb', line 194

# Overwrites every entry strictly right of the diagonal (column > row) with
# -1.0e30, mutating scores.flat (row-major storage).
#
# @param scores [#nrows, #ncols, #flat] score matrix to mask in place
# @return [nil]
def apply_causal_mask!(scores)
  rows = scores.nrows
  cols = scores.ncols
  cells = scores.flat
  rows.times do |row|
    ((row + 1)...cols).each do |col|
      cells[row * cols + col] = -1.0e30
    end
  end
  nil
end

#embed(token_ids, start_pos) ⇒ `Object`

x[i, :] = token_embed[token_ids[i]] + pos_embed[start_pos + i]

# File 'lib/toy/models/gpt2.rb', line 93

# Builds the input matrix: row i is the token-embedding row for
# token_ids[i] plus the positional-embedding row for start_pos + i.
#
# @param token_ids [#length, #[]] integer token ids, each in 0...vocab_size
# @param start_pos [Integer] position index of the first token
# @return [Mat] new matrix with token_ids.length rows and d_model columns
# @raise [ArgumentError] if any position falls outside 0...context_length
#   or any token id falls outside 0...vocab_size
def embed(token_ids, start_pos)
  t = token_ids.length
  # Reject out-of-range positions up front: reading past the table fails
  # with an obscure nil error, and a negative index silently wraps around.
  if start_pos < 0 || start_pos + t > @context_length
    raise ArgumentError,
          "positions #{start_pos}...#{start_pos + t} fall outside 0...#{@context_length}"
  end

  out = Mat.new(t, @d_model)
  i = 0
  while i < t
    tid = token_ids[i]
    if tid < 0 || tid >= @vocab_size
      raise ArgumentError,
            "token id #{tid} at index #{i} falls outside 0...#{@vocab_size}"
    end
    # Row offsets into the three row-major buffers.
    out_base = i * @d_model
    tok_base = tid * @d_model
    pos_base = (start_pos + i) * @d_model
    j = 0
    while j < @d_model
      out.flat[out_base + j] =
        @token_embed.flat[tok_base + j] +
        @pos_embed.flat[pos_base + j]
      j += 1
    end
    i += 1
  end
  out
end

#feed_forward(h, block) ⇒ `Object`

FFN: gelu_new(h · W_ff1 + b_ff1) · W_ff2 + b_ff2. GeLU is the tanh approximation; constants GELU_C / GELU_K live in lib/transformer.rb.

# File 'lib/toy/models/gpt2.rb', line 266

# Position-wise feed-forward network:
#   gelu(h * W_ff1 + b_ff1) * W_ff2 + b_ff2
# The activation is the tanh form 0.5 * x * (1 + tanh(GELU_C * (x + GELU_K * x^3))),
# with GELU_C and GELU_K defined outside this method.
#
# @param h [Mat] input activations
# @param block [#w_ff1, #b_ff1, #w_ff2, #b_ff2] block holding the FFN parameters
# @return [Mat] the biased result of the second projection
def feed_forward(h, block)
  expanded = h.matmul(block.w_ff1)
  add_bias!(expanded, block.b_ff1)

  activated = Mat.new(expanded.nrows, expanded.ncols)
  src = expanded.flat
  dst = activated.flat
  (expanded.nrows * expanded.ncols).times do |k|
    val = src[k]
    inner = GELU_C * (val + GELU_K * val * val * val)
    dst[k] = 0.5 * val * (1.0 + Math.tanh(inner))
  end

  projected = activated.matmul(block.w_ff2)
  add_bias!(projected, block.b_ff2)
  projected
end

#forward(token_ids, start_pos) ⇒ `Object`

embed -> N blocks -> final LN -> tied unembed -> logits[T, vocab]

# File 'lib/toy/models/gpt2.rb', line 298

# Full forward pass: embed the tokens, run the first n_layers transformer
# blocks in order, apply the final LayerNorm, then multiply by the
# transposed token embedding table (tied output embedding).
#
# @param token_ids [#length, #[]] token ids for the sequence
# @param start_pos [Integer] position index of the first token
# @return [Object] result of matmul_t against the token embedding table
def forward(token_ids, start_pos)
  hidden = embed(token_ids, start_pos)

  @n_layers.times do |layer|
    hidden = transformer_block(hidden, @gpt2_blocks[layer])
  end

  normed = layer_norm(hidden, @ln_f_gamma, @ln_f_beta)
  # logits = normed * token_embed^T
  normed.matmul_t(@token_embed)
end

#hstack_heads(per_head) ⇒ `Object`

n_heads × (T × d_head) → (T × d_model), packing each head’s columns back into the contiguous block of width d_head at offset h*d_head.

# File 'lib/toy/models/gpt2.rb', line 210

# Concatenates the per-head outputs column-wise: head h occupies columns
# h * d_head ... (h + 1) * d_head of the result.
#
# @param per_head [Array] one matrix per head; the row count is taken from
#   the first entry
# @return [Mat] new matrix with that many rows and d_model columns
def hstack_heads(per_head)
  rows = per_head[0].nrows
  merged = Mat.new(rows, @d_model)
  @n_heads.times do |head_idx|
    head = per_head[head_idx]
    col_offset = head_idx * @d_head
    rows.times do |r|
      @d_head.times do |c|
        merged.flat[r * @d_model + (col_offset + c)] = head.flat[r * @d_head + c]
      end
    end
  end
  merged
end

#layer_norm(x, gamma, beta) ⇒ `Object`

LayerNorm: y_j = (x_j - mean) / sqrt(var + eps) * gamma_j + beta_j per row. New Mat returned (caller may need x unchanged for residual).

# File 'lib/toy/models/gpt2.rb', line 113

# Row-wise LayerNorm. For each row: subtract the mean, divide by
# sqrt(variance + @ln_eps) using the population variance, then scale by
# gamma and shift by beta. x is not modified; a new Mat is returned.
#
# @param x [#nrows, #flat] input, read as rows of gamma.length elements
# @param gamma [#length, #[]] per-column scale
# @param beta [#[]] per-column shift
# @return [Mat] normalized matrix with x.nrows rows and gamma.length columns
def layer_norm(x, gamma, beta)
  width = gamma.length
  row_count = x.nrows
  normed = Mat.new(row_count, width)
  src = x.flat
  dst = normed.flat

  row_count.times do |row|
    span = (row * width)...((row + 1) * width)

    # Plain left-to-right accumulation keeps the floating-point result stable.
    total = 0.0
    span.each { |idx| total += src[idx] }
    mean = total / width

    sq_dev = 0.0
    span.each do |idx|
      delta = src[idx] - mean
      sq_dev += delta * delta
    end
    scale = 1.0 / Math.sqrt(sq_dev / width + @ln_eps)

    span.each_with_index do |idx, col|
      dst[idx] = (src[idx] - mean) * scale * gamma[col] + beta[col]
    end
  end

  normed
end

#self_attention(h_in, block) ⇒ `Object`

# File 'lib/toy/models/gpt2.rb', line 246

# Multi-head self-attention: runs every head on h_in, concatenates the head
# outputs column-wise, then applies the output projection w_o and bias b_o.
#
# @param h_in [Object] input activations, handed to each head
# @param block [#w_o, #b_o] block holding the attention parameters
# @return [Object] the projected, biased attention output
def self_attention(h_in, block)
  scale = 1.0 / Math.sqrt(@d_head)

  # Head 0 is always computed; the remaining heads are appended in order.
  heads = [self_attention_head(h_in, block, 0, scale)]
  (1...@n_heads).each do |head_idx|
    heads.push(self_attention_head(h_in, block, head_idx, scale))
  end

  merged = hstack_heads(heads)
  projected = merged.matmul(block.w_o)
  add_bias!(projected, block.b_o)
  projected
end

#self_attention_head(h_in, block, head_idx, inv_sqrt) ⇒ `Object`

# File 'lib/toy/models/gpt2.rb', line 231

# Single attention head: projects h_in to queries, keys and values with the
# head's weights and biases, computes scaled, causally masked, row-softmaxed
# scores q * k^T, and returns scores * v.
#
# @param h_in [#matmul] input activations
# @param block [#w_q, #b_q, #w_k, #b_k, #w_v, #b_v] per-head parameters,
#   each indexed by head_idx
# @param head_idx [Integer] which head's parameters to use
# @param inv_sqrt [Float] factor applied to the raw scores
# @return [Object] result of multiplying the attention weights by the values
def self_attention_head(h_in, block, head_idx, inv_sqrt)
  # Shared projection step: h_in * W[head_idx] + b[head_idx].
  project = lambda do |weight_set, bias_set|
    projected = h_in.matmul(weight_set[head_idx])
    add_bias!(projected, bias_set[head_idx])
    projected
  end

  queries = project.call(block.w_q, block.b_q)
  keys    = project.call(block.w_k, block.b_k)
  values  = project.call(block.w_v, block.b_v)

  attn = queries.matmul_t(keys)
  attn.scale!(inv_sqrt)
  apply_causal_mask!(attn)
  softmax_rows!(attn)
  attn.matmul(values)
end

#softmax_rows!(m) ⇒ `Object`

Row-wise softmax with numerical-stability max-shift, in place on m.

# File 'lib/toy/models/gpt2.rb', line 161

# Replaces each row of m with its softmax, in place. The row maximum is
# subtracted before exponentiating so the exponentials cannot overflow.
#
# @param m [#nrows, #ncols, #flat] matrix to normalize (row-major storage)
# @return [nil]
def softmax_rows!(m)
  width = m.ncols
  cells = m.flat
  m.nrows.times do |row|
    start = row * width
    stop = start + width

    # Largest entry of the row.
    peak = cells[start]
    ((start + 1)...stop).each do |idx|
      peak = cells[idx] if cells[idx] > peak
    end

    # Exponentiate the shifted entries and accumulate their total in order.
    total = 0.0
    (start...stop).each do |idx|
      weight = Math.exp(cells[idx] - peak)
      cells[idx] = weight
      total += weight
    end

    (start...stop).each { |idx| cells[idx] /= total }
  end
  nil
end

#transformer_block(x, block) ⇒ `Object`

One transformer block: pre-LN → MHA → residual → pre-LN → FFN → residual. x is mutated in place via add!; returned for chaining.

# File 'lib/toy/models/gpt2.rb', line 285

# One pre-LN transformer block. Each sub-layer normalizes x, transforms the
# normalized copy, and adds the result back into x (residual connection).
# x is mutated in place through add! and returned for chaining.
#
# @param x [#add!] residual-stream activations, updated in place
# @param block [#ln1_gamma, #ln1_beta, #ln2_gamma, #ln2_beta] block parameters
# @return [Object] the same x that was passed in
def transformer_block(x, block)
  # Attention sub-layer.
  x.add!(self_attention(layer_norm(x, block.ln1_gamma, block.ln1_beta), block))
  # Feed-forward sub-layer; it sees x after the attention residual.
  x.add!(feed_forward(layer_norm(x, block.ln2_gamma, block.ln2_beta), block))
  x
end

Class: GPT2LM

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(vocab_size, d_model, d_ff, n_heads, n_layers, context_length) ⇒ GPT2LM

Instance Attribute Details

#context_length ⇒ Object

#d_ff ⇒ Object

#d_head ⇒ Object

#d_model ⇒ Object

#gpt2_blocks ⇒ Object

#ln_eps ⇒ Object

#ln_f_beta ⇒ Object

#ln_f_gamma ⇒ Object

#n_heads ⇒ Object

#n_layers ⇒ Object

#pos_embed ⇒ Object

#token_embed ⇒ Object

#vocab_size ⇒ Object