Backpropagation (“backprop”) is the heart of modern neural network training. It enables us to efficiently compute gradients for all parameters, making deep learning—and deep RL—practically feasible. In this post you’ll:
- compute gradients by hand for a tiny two-layer network using the chain rule;
- use .backward() in PyTorch to automate and verify gradient calculations;
- visualize gradient flow across layers and see how vanishing gradients arise (and how ReLU helps).
Grasping backprop is the difference between “using” neural nets and truly understanding them!
Consider a two-layer neural net (no bias for simplicity) for one sample $x \in \mathbb{R}^{1 \times 2}$:
$$z_1 = x W_1, \qquad h = \mathrm{ReLU}(z_1), \qquad z_2 = h W_2^{\top}, \qquad \hat{y} = \sigma(z_2)$$
For a single output $\hat{y}$ and target $y \in \{0, 1\}$, use binary cross-entropy:
$$L = -\bigl(y \log \hat{y} + (1 - y)\log(1 - \hat{y})\bigr)$$
To optimize, we must compute the gradients $\partial L / \partial W_1$ and $\partial L / \partial W_2$.
Via chain rule:
$$\frac{\partial L}{\partial W_2} = \frac{\partial L}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial z_2} \cdot \frac{\partial z_2}{\partial W_2}, \qquad \frac{\partial L}{\partial W_1} = \frac{\partial L}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial z_2} \cdot \frac{\partial z_2}{\partial h} \cdot \frac{\partial h}{\partial z_1} \cdot \frac{\partial z_1}{\partial W_1}$$
Each node (“layer output”) passes gradients backward to previous layers—hence the name backpropagation.
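To make that concrete before tackling the full network, here is a minimal scalar sketch (the names w and x_val are just for this illustration): multiply the local derivatives by hand and check the product against autograd.
import torch
# Tiny scalar chain: y_hat = sigmoid(w * x_val), L = -log(y_hat) (BCE with target 1)
w = torch.tensor(0.5, requires_grad=True)
x_val = torch.tensor(2.0)
z = w * x_val
y_hat = torch.sigmoid(z)
loss = -torch.log(y_hat)
loss.backward()
# Hand-multiplied chain: dL/dw = dL/dy_hat * dy_hat/dz * dz/dw
dL_dyhat = -1.0 / y_hat.item()                # derivative of -log(y_hat)
dyhat_dz = y_hat.item() * (1 - y_hat.item())  # sigmoid'(z)
dz_dw = x_val.item()                          # derivative of w * x_val w.r.t. w
print("manual   dL/dw:", dL_dyhat * dyhat_dz * dz_dw)
print("autograd dL/dw:", w.grad.item())       # should agree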
In code, backprop means calling .backward() on the loss and letting autograd fill in each parameter's .grad; comparing autograd's results against your own hand-derived gradients gives confidence in your math and understanding.
Let's use simple numbers for hand calculation:
Let $x = [1.0,\ 2.0]$ with shape $(1, 2)$, $W_1$ with shape $(2, 2)$, $W_2$ with shape $(1, 2)$, and target $y = 1$.
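Working the forward pass through by hand with these numbers (so you can check the printout from the code below):
$$z_1 = x W_1 = [\,1(0.1) + 2(0.3),\; 1(-0.2) + 2(0.4)\,] = [0.7,\; 0.6], \qquad h = \mathrm{ReLU}(z_1) = [0.7,\; 0.6]$$
$$z_2 = h W_2^{\top} = 0.7(0.7) + 0.6(-0.5) = 0.19, \qquad \hat{y} = \sigma(0.19) \approx 0.547, \qquad L = -\log(0.547) \approx 0.603$$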
import torch
import torch.nn.functional as F
# Input and target
x: torch.Tensor = torch.tensor([[1.0, 2.0]]) # (1, 2)
y: torch.Tensor = torch.tensor([1.0]) # (1,)
# Parameters (fixed small values for hand calc)
W1: torch.Tensor = torch.tensor([[0.1, -0.2],
                                 [0.3, 0.4]], requires_grad=True)  # (2,2)
W2: torch.Tensor = torch.tensor([[0.7, -0.5]], requires_grad=True) # (1,2)
# Forward pass (ReLU activation)
z1: torch.Tensor = x @ W1 # (1,2)
h: torch.Tensor = F.relu(z1) # (1,2)
z2: torch.Tensor = h @ W2.T # (1,1)
y_pred: torch.Tensor = torch.sigmoid(z2).squeeze()# scalar
# Binary cross-entropy loss
eps: float = 1e-7
loss: torch.Tensor = - (y * torch.log(y_pred + eps) + (1 - y) * torch.log(1 - y_pred + eps))
print("Forward values:")
print("z1 =", z1.tolist())
print("h =", h.tolist())
print("z2 =", z2.item())
print("y_pred =", y_pred.item())
print("loss =", loss.item())
# Manually compute:
# 1. dL/dy_pred = -1/y_pred
dL_dypred: float = float(-1.0 / y_pred.item())
# 2. dy_pred/dz2 = sigmoid'(z2)
dypred_dz2: float = float(y_pred.item() * (1 - y_pred.item()))
print("Manual dL/dy_pred:", dL_dypred)
print("Manual dy_pred/dz2:", dypred_dz2)
Now you can hand-multiply through the chain!
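Here is a minimal sketch of that hand-multiplication, continuing with the tensors defined above (the dL_d* names are just for illustration):
# Continue the chain by hand; shapes follow the forward pass above
dL_dz2 = dL_dypred * dypred_dz2           # scalar: dL/dz2
dL_dW2_manual = dL_dz2 * h                # (1,2), since dz2/dW2 = h
dL_dh = dL_dz2 * W2                       # (1,2), since dz2/dh = W2
dL_dz1 = dL_dh * (z1 > 0).float()         # (1,2), ReLU'(z1) is a 0/1 mask
dL_dW1_manual = x.T @ dL_dz1              # (2,2), since dz1[0,j]/dW1[i,j] = x[0,i]
print("Manual dL/dW2:", dL_dW2_manual.detach())
print("Manual dL/dW1:", dL_dW1_manual.detach())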
Using .backward() to Compare with Manual Gradients
# Backpropagation (PyTorch autograd)
# Zero gradients first
if W1.grad is not None: W1.grad.zero_()
if W2.grad is not None: W2.grad.zero_()
loss.backward()
print("PyTorch dL/dW2:\n", W2.grad)
print("PyTorch dL/dW1:\n", W1.grad)
You can now compare these with your manual chain calculation above!
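If you ran the hand-multiplication sketch above, a quick programmatic check confirms the agreement (up to the small eps added inside the logs):
# Manual vs. autograd gradients should match to within numerical tolerance
print(torch.allclose(dL_dW2_manual.detach(), W2.grad, atol=1e-5))  # expect True
print(torch.allclose(dL_dW1_manual.detach(), W1.grad, atol=1e-5))  # expect True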
Let’s use a bigger network and plot the mean absolute gradient of each layer’s weights at every epoch.
import matplotlib.pyplot as plt
import torch.nn as nn
class TinyMLP(nn.Module):
    def __init__(self, hidden: int = 6) -> None:
        super().__init__()
        self.fc1: nn.Linear = nn.Linear(2, hidden)
        self.fc2: nn.Linear = nn.Linear(hidden, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(F.relu(self.fc1(x)))
torch.manual_seed(11)
N: int = 200
X: torch.Tensor = torch.randn(N, 2)
y: torch.Tensor = (X[:,0] + X[:,1] > 0).long()
mlp: TinyMLP = TinyMLP(10)
opt: torch.optim.Optimizer = torch.optim.Adam(mlp.parameters(), lr=0.1)
grad1: list[float] = []
grad2: list[float] = []
for epoch in range(60):
    logits = mlp(X)
    loss = F.cross_entropy(logits, y)
    opt.zero_grad()
    loss.backward()
    grad1.append(mlp.fc1.weight.grad.abs().mean().item())
    grad2.append(mlp.fc2.weight.grad.abs().mean().item())
    opt.step()
plt.plot(grad1, label="fc1")
plt.plot(grad2, label="fc2")
plt.xlabel("Epoch"); plt.ylabel("Mean Abs Grad")
plt.title("Gradient Flow in MLP")
plt.legend(); plt.grid(True); plt.show()
We’ll deliberately cause vanishing gradients with a sigmoid activation.
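Why this works as a demonstration: the sigmoid's derivative never exceeds 1/4,
$$\sigma'(z) = \sigma(z)\bigl(1 - \sigma(z)\bigr) \le \tfrac{1}{4},$$
and backprop multiplies one such factor (times a weight) per layer, so stacking many sigmoid layers scales the gradients reaching early layers by something on the order of $(1/4)^{\text{depth}}$.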
class DeepMLP(nn.Module):
    def __init__(self, hidden: int = 32, depth: int = 6) -> None:
        super().__init__()
        self.layers: nn.ModuleList = nn.ModuleList([nn.Linear(2 if i == 0 else hidden, hidden) for i in range(depth)])
        self.out: nn.Linear = nn.Linear(hidden, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for l in self.layers:
            x = torch.sigmoid(l(x))  # Deliberate: will squash gradients
        return self.out(x)
torch.manual_seed(21)
deep_mlp: DeepMLP = DeepMLP()
opt: torch.optim.Optimizer = torch.optim.Adam(deep_mlp.parameters(), lr=0.07)
grad_hist: list[float] = []
for epoch in range(30):
    logits = deep_mlp(X)
    loss = F.cross_entropy(logits, y)
    opt.zero_grad()
    loss.backward()
    # Monitor average gradient in every layer
    mean_grad = torch.stack([l.weight.grad.abs().mean() for l in deep_mlp.layers]).mean().item()
    grad_hist.append(mean_grad)
    opt.step()
plt.plot(grad_hist)
plt.title("Vanishing Gradient in Deep Sigmoid Network")
plt.xlabel("Epoch"); plt.ylabel("Mean Gradient (all hidden layers)")
plt.grid(True); plt.show()
# Try switching to ReLU
class DeepMLPrelu(nn.Module):
    def __init__(self, hidden: int = 32, depth: int = 6) -> None:
        super().__init__()
        self.layers: nn.ModuleList = nn.ModuleList([nn.Linear(2 if i == 0 else hidden, hidden) for i in range(depth)])
        self.out: nn.Linear = nn.Linear(hidden, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for l in self.layers:
            x = F.relu(l(x))
        return self.out(x)
deep_mlp_relu: DeepMLPrelu = DeepMLPrelu()
opt2: torch.optim.Optimizer = torch.optim.Adam(deep_mlp_relu.parameters(), lr=0.07)
grad_hist_relu: list[float] = []
for epoch in range(30):
    logits = deep_mlp_relu(X)
    loss = F.cross_entropy(logits, y)
    opt2.zero_grad()
    loss.backward()
    mean_grad = torch.stack([l.weight.grad.abs().mean() for l in deep_mlp_relu.layers]).mean().item()
    grad_hist_relu.append(mean_grad)
    opt2.step()
plt.plot(grad_hist, label='Sigmoid')
plt.plot(grad_hist_relu, label='ReLU')
plt.xlabel("Epoch"); plt.ylabel("Mean Gradient")
plt.title("Vanishing Gradients: Sigmoid vs ReLU")
plt.legend(); plt.grid(True); plt.show()
Exercises: Use .backward() to Compare with Manual Gradients. Call .backward() on the loss and print the gradients for W1 and W2. (Exercises 3 and 4 below repeat the gradient-flow and vanishing-gradient experiments.)
import torch
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
# EXERCISE 1/2
x: torch.Tensor = torch.tensor([[2.0, 1.0]]) # shape (1, 2)
y: torch.Tensor = torch.tensor([0.0]) # batch size 1
W1: torch.Tensor = torch.tensor([[0.2, -0.3], [0.5, 0.4]], requires_grad=True) # (2,2)
W2: torch.Tensor = torch.tensor([[0.6, -0.7]], requires_grad=True) # (1,2)
z1: torch.Tensor = x @ W1 # (1,2)
h: torch.Tensor = F.relu(z1) # (1,2)
z2: torch.Tensor = h @ W2.T # (1,1)
y_pred: torch.Tensor = torch.sigmoid(z2).squeeze() # scalar
loss: torch.Tensor = - (y * torch.log(y_pred + 1e-7) + (1-y) * torch.log(1 - y_pred + 1e-7))
print("Loss:", loss.item())
if W1.grad is not None: W1.grad.zero_()
if W2.grad is not None: W2.grad.zero_()
loss.backward()
print("PyTorch dL/dW2:", W2.grad)
print("PyTorch dL/dW1:", W1.grad)
# EXERCISE 3
class Net2(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc1: nn.Linear = nn.Linear(2, 10)
        self.fc2: nn.Linear = nn.Linear(10, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(F.relu(self.fc1(x)))
N: int = 150
X: torch.Tensor = torch.randn(N, 2)
y: torch.Tensor = (X[:,0] - X[:,1] > 0).long()
net: Net2 = Net2()
opt: torch.optim.Optimizer = torch.optim.Adam(net.parameters(), lr=0.10)
g1: list[float] = []
g2: list[float] = []
for epoch in range(50):
    logits = net(X)
    loss = F.cross_entropy(logits, y)
    opt.zero_grad()
    loss.backward()
    g1.append(net.fc1.weight.grad.abs().mean().item())
    g2.append(net.fc2.weight.grad.abs().mean().item())
    opt.step()
plt.plot(g1, label="fc1 (input)")
plt.plot(g2, label="fc2 (out)")
plt.xlabel("Epoch"); plt.ylabel("Mean |grad|")
plt.legend(); plt.grid(True); plt.title("Gradient flow in NN"); plt.show()
# EXERCISE 4
class DeepSigNet(nn.Module):
    def __init__(self, hidden: int = 24, depth: int = 6) -> None:
        super().__init__()
        self.layers: nn.ModuleList = nn.ModuleList([nn.Linear(2 if i == 0 else hidden, hidden) for i in range(depth)])
        self.out: nn.Linear = nn.Linear(hidden, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for l in self.layers:
            x = torch.sigmoid(l(x))
        return self.out(x)
deepnet = DeepSigNet()
opt = torch.optim.Adam(deepnet.parameters(), lr=0.09)
g_hist = []
for epoch in range(25):
    logits = deepnet(X)
    loss = F.cross_entropy(logits, y)
    opt.zero_grad()
    loss.backward()
    grads = [l.weight.grad.abs().mean().item() for l in deepnet.layers]
    g_hist.append(sum(grads) / len(grads))
    opt.step()
plt.plot(g_hist, label='Sigmoid')
plt.title("Vanishing Gradients with Sigmoid"); plt.xlabel("Epoch"); plt.ylabel("Mean Grad"); plt.legend(); plt.show()
# Fix: ReLU
class DeepReluNet(nn.Module):
    def __init__(self, hidden: int = 24, depth: int = 6) -> None:
        super().__init__()
        self.layers: nn.ModuleList = nn.ModuleList([nn.Linear(2 if i == 0 else hidden, hidden) for i in range(depth)])
        self.out: nn.Linear = nn.Linear(hidden, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for l in self.layers:
            x = F.relu(l(x))
        return self.out(x)
deepnet_r = DeepReluNet()
opt_r = torch.optim.Adam(deepnet_r.parameters(), lr=0.09)
g_hist_r = []
for epoch in range(25):
    logits = deepnet_r(X)
    loss = F.cross_entropy(logits, y)
    opt_r.zero_grad()
    loss.backward()
    grads = [l.weight.grad.abs().mean().item() for l in deepnet_r.layers]
    g_hist_r.append(sum(grads) / len(grads))
    opt_r.step()
plt.plot(g_hist, label='Sigmoid'); plt.plot(g_hist_r, label='ReLU')
plt.title("Vanishing Gradients: Sigmoid vs ReLU")
plt.xlabel("Epoch"); plt.ylabel("Mean Grad"); plt.legend(); plt.grid(); plt.show()
Now you:
- understand how the chain rule lets backprop compute gradients layer by layer,
- can verify hand-derived gradients against PyTorch's .backward(), and
- can spot vanishing gradients and know that switching from sigmoid to ReLU mitigates them.
Next: We’ll discuss overfitting, underfitting, and regularization—essentials for making your models robust on real-world (not just training) data.
Stick with these basics: understanding gradients and backprop is the foundation of all deep learning and RL! See you in Part 3.6!