Chapter 3: Anatomy of an LLM
Practical Exercises – Chapter 3
These exercises help you internalize multi-head attention, rotary embeddings (RoPE), normalization strategies, depth vs width, ALiBi, SwiGLU, GQA, and attention sparsity.
Exercise 1 — Scaled Dot-Product Attention (single head)
Task: Implement single-head scaled dot-product attention with an optional causal mask.
Solution:
import torch
import torch.nn.functional as F
def scaled_dot_product_attention(q, k, v, causal=False):
    # q, k, v: [B, T, D]
    d = q.size(-1)
    scores = q @ k.transpose(-2, -1) / (d ** 0.5)  # [B, T, T]
    if causal:
        T = scores.size(-1)
        mask = torch.triu(torch.ones(T, T, device=scores.device), diagonal=1).bool()
        scores = scores.masked_fill(mask, float('-inf'))
    weights = F.softmax(scores, dim=-1)  # [B, T, T]
    return weights @ v, weights  # [B, T, D], [B, T, T]
# Quick check
B,T,D = 1,5,16
x = torch.randn(B,T,D)
y,_ = scaled_dot_product_attention(x,x,x,causal=True)
print(y.shape) # torch.Size([1, 5, 16])
Exercise 2 — Minimal Multi-Head Self-Attention
Task: Build a tiny multi-head attention layer. Confirm output shapes.
Solution:
import torch, torch.nn as nn, torch.nn.functional as F
class MiniMHA(nn.Module):
    def __init__(self, embed_dim=32, num_heads=4):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.h = num_heads
        self.d = embed_dim // num_heads
        self.q = nn.Linear(embed_dim, embed_dim)
        self.k = nn.Linear(embed_dim, embed_dim)
        self.v = nn.Linear(embed_dim, embed_dim)
        self.o = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        B, T, C = x.shape
        q = self.q(x).view(B, T, self.h, self.d).transpose(1, 2)  # [B, h, T, d]
        k = self.k(x).view(B, T, self.h, self.d).transpose(1, 2)
        v = self.v(x).view(B, T, self.h, self.d).transpose(1, 2)
        scores = (q @ k.transpose(-2, -1)) / (self.d ** 0.5)  # [B, h, T, T]
        w = F.softmax(scores, dim=-1)
        out = (w @ v).transpose(1, 2).contiguous().view(B, T, C)  # [B, T, C]
        return self.o(out)
x = torch.randn(2, 6, 32)
mha = MiniMHA(32, 4)
print(mha(x).shape) # torch.Size([2, 6, 32])
Exercise 3 — Add RoPE to Q/K and compare attention
Task: Apply a simple rotary position embedding to Q and K, then compare attention weights with/without RoPE.
Solution:
import torch, math, torch.nn.functional as F
def apply_rope(t, base=10000.0):
    # t: [B, h, T, d]; assume d is even
    B, h, T, d = t.shape
    half = d // 2
    idx = torch.arange(half, device=t.device)
    theta = (1.0 / (base ** (idx / half))).view(1, 1, 1, half)  # [1, 1, 1, half]
    pos = torch.arange(T, device=t.device).view(1, 1, T, 1)
    angles = pos * theta  # [1, 1, T, half]
    sin, cos = torch.sin(angles), torch.cos(angles)
    t1, t2 = t[..., :half], t[..., half:]
    return torch.cat([t1 * cos - t2 * sin, t1 * sin + t2 * cos], dim=-1)
# Compare weights
B,h,T,d = 1, 2, 6, 8
q = torch.randn(B,h,T,d)
k = torch.randn(B,h,T,d)
# vanilla
w0 = F.softmax((q @ k.transpose(-2,-1))/ (d**0.5), dim=-1)
# with RoPE
q_r = apply_rope(q); k_r = apply_rope(k)
w1 = F.softmax((q_r @ k_r.transpose(-2,-1))/ (d**0.5), dim=-1)
print("Δ mean:", (w1 - w0).abs().mean().item())Exercise 4 — Implement ALiBi bias and plug into attention
Task: Create an ALiBi bias tensor and add it to attention logits before softmax.
Solution:
import torch, torch.nn.functional as F
def alibi_bias(seq_len, num_heads):
    # simple geometric head slopes (one of several choices in the wild);
    # the ALiBi paper uses a geometric sequence starting at 2**(-8/num_heads) and applies the bias causally
    slopes = torch.tensor([1.0 / (2 ** (i / num_heads)) for i in range(num_heads)])
    i = torch.arange(seq_len).unsqueeze(1)
    j = torch.arange(seq_len).unsqueeze(0)
    dist = (i - j).abs().float()  # [T, T] symmetric distance for this bidirectional demo
    bias = -dist.unsqueeze(0) * slopes.view(num_heads, 1, 1)  # [h, T, T]
    return bias
B,h,T,d = 1, 4, 8, 16
q = torch.randn(B,h,T,d); k = torch.randn(B,h,T,d); v = torch.randn(B,h,T,d)
scores = (q @ k.transpose(-2,-1)) / (d**0.5) # [B,h,T,T]
bias = alibi_bias(T, h).to(scores.device).unsqueeze(0) # [1,h,T,T]
w = F.softmax(scores + bias, dim=-1)
out = w @ v
print(out.shape) # torch.Size([1, 4, 8, 16])
Exercise 5 — LayerNorm vs RMSNorm stability check
Task: Compare LayerNorm and RMSNorm on random activations and inspect their statistics.
Solution:
import torch, torch.nn as nn
class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        # normalize by root-mean-square over features
        rms = x.pow(2).mean(dim=-1, keepdim=True).sqrt()
        return self.scale * (x / (rms + self.eps))
x = torch.randn(64, 32) * 3.0 # "messy" activations
ln = nn.LayerNorm(32)
rms = RMSNorm(32)
y_ln = ln(x); y_rms = rms(x)
print(y_ln.mean().item(), y_ln.std().item())
print(y_rms.mean().item(), y_rms.std().item())
Exercise 6 — Depth vs Width: parameter counts & quick forward
Task: Build a deep-narrow and a shallow-wide encoder using PyTorch’s TransformerEncoderLayer. Compare parameter counts and verify a forward pass.
Solution:
import torch, torch.nn as nn
def count_params(m): return sum(p.numel() for p in m.parameters())
deep_narrow = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=256, nhead=4, dim_feedforward=1024, batch_first=True),
    num_layers=24
)
shallow_wide = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=1024, nhead=16, dim_feedforward=4096, batch_first=True),
    num_layers=6
)
print("Deep-Narrow params:", count_params(deep_narrow))
print("Shallow-Wide params:", count_params(shallow_wide))
x_dn = torch.randn(2, 32, 256)
x_sw = torch.randn(2, 32, 1024)
print(deep_narrow(x_dn).shape, shallow_wide(x_sw).shape)
Exercise 7 — SwiGLU feed-forward vs GELU
Task: Implement a SwiGLU FFN and compare output statistics with a standard GELU FFN.
Solution:
import torch, torch.nn as nn, torch.nn.functional as F
class SwiGLU(nn.Module):
    def __init__(self, d_in, d_hidden, d_out):
        super().__init__()
        self.w1 = nn.Linear(d_in, d_hidden)
        self.w2 = nn.Linear(d_in, d_hidden)
        self.proj = nn.Linear(d_hidden, d_out)

    def forward(self, x):
        return self.proj(F.silu(self.w1(x)) * self.w2(x))
gelu_ffn = nn.Sequential(nn.Linear(256, 1024), nn.GELU(), nn.Linear(1024, 256))
swiglu_ffn = SwiGLU(256, 1024, 256)
x = torch.randn(8, 16, 256)
y_gelu = gelu_ffn(x); y_swiglu = swiglu_ffn(x)
print(y_gelu.std().item(), y_swiglu.std().item())
Exercise 8 — GQA: share K/V across query heads
Task: Create grouped-query attention projections where many Q heads share fewer K/V heads. Verify head mapping.
Solution:
import torch, torch.nn as nn
class SimpleGQA(nn.Module):
    def __init__(self, d_model=64, q_heads=8, kv_heads=2):
        super().__init__()
        assert d_model % q_heads == 0
        assert q_heads % kv_heads == 0  # each K/V head serves an equal-sized group of Q heads
        self.qh, self.kvh = q_heads, kv_heads
        self.d = d_model // q_heads
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, self.d * kv_heads)
        self.v = nn.Linear(d_model, self.d * kv_heads)

    def forward(self, x):
        B, T, C = x.shape
        q = self.q(x).view(B, T, self.qh, self.d).transpose(1, 2)   # [B, qh, T, d]
        k = self.k(x).view(B, T, self.kvh, self.d).transpose(1, 2)  # [B, kvh, T, d]
        v = self.v(x).view(B, T, self.kvh, self.d).transpose(1, 2)
        # expand K/V to match the Q heads by repeating each K/V head for its group
        group = self.qh // self.kvh
        k = k.repeat_interleave(group, dim=1)  # [B, qh, T, d]
        v = v.repeat_interleave(group, dim=1)
        return q, k, v
x = torch.randn(1, 10, 64)
q,k,v = SimpleGQA()(x)
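# Optional follow-up (a sketch, not part of the original exercise): after repeat_interleave,
# Q, K and V all carry q_heads heads, so ordinary scaled dot-product attention applies unchanged.
att = torch.softmax((q @ k.transpose(-2, -1)) / (q.size(-1) ** 0.5), dim=-1) @ v  # [B, qh, T, d]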
print(q.shape, k.shape, v.shape) # torch.Size([1, 8, 10, 8]) each
Exercise 9 — Local (sparse) attention window vs full attention
Task: Implement a sliding-window local attention and compare its output shape with full attention. (We’ll skip timing to keep it lightweight.)
Solution:
import torch, torch.nn.functional as F
def local_attention(x, window=2):
    # x: [B, T, D]
    B, T, D = x.shape
    outs = []
    for i in range(T):
        s = max(0, i - window); e = min(T, i + window + 1)
        ctx = x[:, s:e, :]  # [B, W, D]
        scores = torch.bmm(x[:, i:i+1, :], ctx.transpose(1, 2)) / (D ** 0.5)  # scale as in full attention
        w = F.softmax(scores, dim=-1)  # [B, 1, W]
        outs.append(torch.bmm(w, ctx))  # [B, 1, D]
    return torch.cat(outs, dim=1)  # [B, T, D]
B,T,D = 1, 12, 32
x = torch.randn(B,T,D)
y_local = local_attention(x, window=3)
y_full = F.softmax((x @ x.transpose(1, 2)) / (D ** 0.5), dim=-1) @ x  # full attention baseline for comparison
print(y_local.shape, y_full.shape)  # both torch.Size([1, 12, 32])
Exercise 10 — (Concept) Choosing RoPE vs ALiBi
Task (no code):
You’re building a long-context summarizer (up to 64k tokens). You want robust generalization beyond training context and efficient inference. Which would you choose, RoPE or ALiBi, and why?
Sample answer:
I would start with RoPE because it encodes relative positions directly in Q/K, is the de facto choice in current long-context models, and can be stretched well beyond the training context with established tricks such as position interpolation or NTK-aware scaling of the rotary base. If those extensions prove unstable or too costly to tune, I would try ALiBi: its linear distance bias on the logits extrapolates beyond the training length out of the box and adds essentially no compute or memory overhead.
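To make the "relative positions in Q/K" point concrete, here is a minimal sketch. It reuses the rotate-half RoPE from Exercise 3, flattened to a [T, d] layout for simplicity; the helper name apply_rope_2d is ours. It checks that the post-RoPE dot product depends only on the query/key offset, not on absolute position:
import torch

def apply_rope_2d(t, base=10000.0):
    # rotate-half RoPE as in Exercise 3, but for a single [T, d] tensor (no batch/head dims)
    T, d = t.shape
    half = d // 2
    theta = 1.0 / (base ** (torch.arange(half) / half))  # [half]
    angles = torch.arange(T).unsqueeze(1) * theta        # [T, half]
    sin, cos = angles.sin(), angles.cos()
    t1, t2 = t[:, :half], t[:, half:]
    return torch.cat([t1 * cos - t2 * sin, t1 * sin + t2 * cos], dim=-1)

torch.manual_seed(0)
T, d = 16, 8
q_vec, k_vec = torch.randn(d), torch.randn(d)
q_rot = apply_rope_2d(q_vec.expand(T, d))  # the same query content placed at every position
k_rot = apply_rope_2d(k_vec.expand(T, d))  # the same key content placed at every position
# same relative offset (query two positions after key), different absolute positions
print((q_rot[3] @ k_rot[1]).item(), (q_rot[11] @ k_rot[9]).item())  # ~equal
The two printed logits match to floating-point precision: rotating q by theta*m and k by theta*n leaves only the relative angle theta*(n - m) in their dot product, which is exactly why RoPE behaves as a relative position encoding.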
What you’ve practiced
- Building attention from scratch and measuring how RoPE and ALiBi change the attention weights.
- Comparing LayerNorm vs RMSNorm and understanding why pre-norm stacks help deep transformers (see the pre-norm vs post-norm sketch after this list).
- Seeing how depth vs width changes parameter counts and behavior.
- Implementing SwiGLU, GQA, and local (sparse) attention patterns that power modern LLMs.
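On the pre-norm point above: none of the exercises builds a pre-norm block explicitly, so here is a minimal, attention-free sketch contrasting the two residual orderings. The PreNormBlock and PostNormBlock names and the FFN-only simplification are ours, not a reference implementation:
import torch
import torch.nn as nn

class PreNormBlock(nn.Module):
    # x + f(norm(x)): the skip path stays a pure identity, which keeps signal and gradients
    # well-behaved as blocks are stacked deeply
    def __init__(self, dim, hidden):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))

    def forward(self, x):
        return x + self.ffn(self.norm(x))

class PostNormBlock(nn.Module):
    # norm(x + f(x)): the original Transformer ordering; normalization sits on the residual path,
    # so very deep stacks typically need warmup and careful initialization to train stably
    def __init__(self, dim, hidden):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))

    def forward(self, x):
        return self.norm(x + self.ffn(x))

x = torch.randn(2, 8, 32)
deep_pre = nn.Sequential(*[PreNormBlock(32, 64) for _ in range(32)])
deep_post = nn.Sequential(*[PostNormBlock(32, 64) for _ in range(32)])
# pre-norm output still carries the raw residual stream; post-norm output is re-normalized each block
print(deep_pre(x).std().item(), deep_post(x).std().item())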