Chapter 4: Training LLMs from Scratch
Practical Exercises – Chapter 4
These exercises help you put into practice the fundamentals of LLM training pipelines: data preparation, scheduling, distributed infrastructure, and cost-awareness.
Exercise 1 – Data Cleaning
Task: Write a function that removes HTML tags and normalizes whitespace in a text string.
Solution:
import re

def clean_text(text):
    # Remove HTML tags
    text = re.sub(r"<[^>]+>", " ", text)
    # Collapse runs of whitespace into single spaces
    text = re.sub(r"\s+", " ", text)
    return text.strip()

sample = "<p>Hello, world!</p> This is <b>messy</b> text."
print(clean_text(sample))
# Output: "Hello, world! This is messy text."
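Note that the regex approach leaves HTML entities such as &amp; in place. If your corpus contains them, a small optional extension using Python's standard html module decodes entities before stripping tags (this step goes beyond the exercise as stated):

import html
import re

def clean_text_with_entities(text):
    text = html.unescape(text)            # decode &amp;, &lt;, &nbsp;, ...
    text = re.sub(r"<[^>]+>", " ", text)  # strip tags
    return re.sub(r"\s+", " ", text).strip()

print(clean_text_with_entities("<p>Fish &amp; chips</p>"))
# Output: "Fish & chips"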
Exercise 2 – Deduplication with MinHash
Task: Use MinHash to detect near-duplicate documents in a small dataset.
Solution:
from datasketch import MinHash, MinHashLSH

def get_minhash(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for word in set(text.split()):
        m.update(word.encode("utf8"))
    return m

docs = [
    "The cat sat on the mat.",
    "The cat is sitting on the mat.",
    "A completely unrelated document."
]

# Threshold 0.5: the first two docs share a Jaccard similarity of about 0.6
lsh = MinHashLSH(threshold=0.5, num_perm=128)
for i, d in enumerate(docs):
    lsh.insert(f"doc{i}", get_minhash(d))

query = get_minhash("The cat sat on the mat.")
print("Near duplicates:", lsh.query(query))
# Likely matches doc0 and doc1
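To see why the two cat sentences collide while the third document does not, you can compare the MinHash estimate against the exact Jaccard similarity of the token sets (an optional check that reuses get_minhash and docs from above):

m0, m1 = get_minhash(docs[0]), get_minhash(docs[1])
set0, set1 = set(docs[0].split()), set(docs[1].split())
exact = len(set0 & set1) / len(set0 | set1)
print(f"Estimated Jaccard: {m0.jaccard(m1):.2f}, exact: {exact:.2f}")
# Both values land around 0.6, above the 0.5 LSH threshold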
Exercise 3 – Curriculum Learning
Task: Create a simple curriculum schedule where the model sees mostly clean data in early epochs and more noisy data in later epochs.
Solution:
datasets = {
    "clean": ["A reliable sentence.", "Another factual line."],
    "noisy": ["Buy now!!! $$$", "Click here for free prizes!"]
}

def curriculum(epoch):
    if epoch == 1:
        return datasets["clean"] * 3 + datasets["noisy"] * 1
    elif epoch == 2:
        return datasets["clean"] * 2 + datasets["noisy"] * 2
    else:
        return datasets["clean"] * 1 + datasets["noisy"] * 3

print("Epoch 1:", curriculum(1))
print("Epoch 2:", curriculum(2))
print("Epoch 3:", curriculum(3))
Exercise 4 – Mixture Dataset Sampling
Task: Combine books, Wikipedia, and code with weights 0.5, 0.3, 0.2, and sample training batches.
Solution:
import random

datasets = {
    "books": ["Book line 1", "Book line 2"],
    "wiki": ["Wiki entry 1", "Wiki entry 2"],
    "code": ["def add(a,b): return a+b", "print('Hello')"]
}
weights = {"books": 0.5, "wiki": 0.3, "code": 0.2}

def sample_batch(n=5):
    # Pick a source per example according to the mixture weights,
    # then draw a random line from that source
    sources = random.choices(list(weights.keys()), weights=list(weights.values()), k=n)
    return [random.choice(datasets[s]) for s in sources]

print("Sample batch:", sample_batch(5))
Exercise 5 – Synthetic Data Generation
Task: Imagine you don’t have enough domain-specific data. Use an LLM API (pseudo-code here) to generate synthetic QA pairs.
Solution:
# Pseudo-code (requires a valid key in the OPENAI_API_KEY environment variable)
from openai import OpenAI

client = OpenAI()
prompt = "Generate 2 Q&A pairs about renewable energy."
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": prompt}]
)
print(response.choices[0].message.content)
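Synthetic data should be filtered before it enters the training mix. A minimal post-processing sketch is shown below; the 20-character minimum and exact-duplicate check are illustrative choices, not fixed rules:

def filter_synthetic(lines, min_chars=20):
    seen, kept = set(), []
    for line in lines:
        line = line.strip()
        # Drop very short lines and exact duplicates
        if len(line) >= min_chars and line not in seen:
            seen.add(line)
            kept.append(line)
    return kept

generated = [
    "Q: What is solar power?",
    "Q: What is solar power?",
    "A: Energy captured from sunlight with photovoltaic cells.",
]
print(filter_synthetic(generated))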
Exercise 6 – Distributed Training (DDP)
Task: Write a PyTorch Distributed Data Parallel (DDP) example with two GPUs training a simple linear model.
Solution:
# Save as train.py and run directly: python train.py
# (mp.spawn starts one worker process per GPU)
import os
import torch, torch.nn as nn, torch.distributed as dist, torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

def train(rank, world_size):
    # Rendezvous settings for the process group
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    # NCCL is the standard backend for multi-GPU training; use "gloo" for CPU-only testing
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
    model = nn.Linear(10, 10).to(rank)
    ddp = DDP(model, device_ids=[rank])
    loss_fn = nn.MSELoss()
    opt = torch.optim.SGD(ddp.parameters(), lr=0.01)
    for _ in range(5):
        x = torch.randn(8, 10).to(rank)
        y = torch.randn(8, 10).to(rank)
        opt.zero_grad()
        out = ddp(x)
        loss = loss_fn(out, y)
        loss.backward()
        opt.step()
    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = 2
    mp.spawn(train, args=(world_size,), nprocs=world_size)
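With real data, each rank must see a different shard of the dataset, otherwise both GPUs train on identical batches. The sketch below shows how a DataLoader would be wired up with DistributedSampler; the toy TensorDataset and batch size are placeholders:

import torch
from torch.utils.data import DataLoader, TensorDataset, DistributedSampler

def build_loader(rank, world_size, batch_size=8):
    # Toy dataset; in practice this would be your tokenized corpus
    dataset = TensorDataset(torch.randn(1000, 10), torch.randn(1000, 10))
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler), sampler

# Inside train(): loader, sampler = build_loader(rank, world_size)
# Call sampler.set_epoch(epoch) at the start of each epoch so shards reshuffle.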
Exercise 7 – Mixed Precision Training
Task: Optimize memory and speed using PyTorch AMP.
Solution:
import torch, torch.nn as nn

model = nn.Linear(1024, 1024).cuda()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()

for _ in range(3):
    x = torch.randn(16, 1024).cuda()
    y = torch.randn(16, 1024).cuda()
    opt.zero_grad()
    with torch.cuda.amp.autocast():   # forward pass runs in mixed precision
        out = model(x)
        loss = ((out - y) ** 2).mean()
    scaler.scale(loss).backward()     # scale the loss to avoid fp16 underflow
    scaler.step(opt)
    scaler.update()
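Recent PyTorch releases also expose a device-agnostic torch.amp API, and torch.cuda.amp may emit deprecation warnings there. On a recent version, the corresponding lines above can be swapped roughly as follows (a drop-in fragment reusing model, x, and y from the loop):

scaler = torch.amp.GradScaler("cuda")
with torch.amp.autocast("cuda"):
    out = model(x)
    loss = ((out - y) ** 2).mean()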
Exercise 8 – Estimate Carbon Footprint of a Training Run
Task: Given 8 GPUs (400W each) running for 12 hours, estimate energy use and CO₂ impact (assume 0.5 kg CO₂/kWh).
Solution:
gpu_power_watts = 400
num_gpus = 8
hours = 12
carbon_factor = 0.5 # kg CO₂ per kWh
energy_kwh = (gpu_power_watts * num_gpus * hours) / 1000
co2_emission = energy_kwh * carbon_factor
print(f"Energy used: {energy_kwh} kWh")
print(f"CO₂ emitted: {co2_emission:.2f} kg")
What You Practiced
- Cleaning, deduplicating, and filtering data.
- Designing curriculum learning schedules.
- Sampling mixture datasets and generating synthetic data.
- Running distributed training with PyTorch.
- Optimizing training speed and memory with mixed precision (AMP).
- Estimating carbon footprint for responsible AI.
These exercises give you a practical foundation for managing the entire training pipeline — from raw data to sustainable infrastructure.