Welcome to Module 2: Optimization and Learning! You’ve learned how to represent and manipulate data in vector and matrix form. But how do we actually learn from data? The answer is optimization—specifically, using gradients to iteratively improve parameters by minimizing some loss function. This is the backbone of nearly all deep learning and reinforcement learning algorithms.
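In this post, you’ll:
- Learn the gradient descent update rule for scalar and vector functions.
- Implement gradient descent by hand (no autograd yet) in plain Python and PyTorch.
- Visualize the optimization path with Matplotlib.
- See how the learning rate determines whether the iterates converge, oscillate, or diverge.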
Let’s start learning—by making a machine learn!
Gradient descent is a method for finding a local minimum of a function by repeatedly stepping against the gradient, which points in the direction of steepest ascent.
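Given a differentiable function $f(x)$, gradient descent updates the parameter $x$ using:

$$x_{t+1} = x_t - \eta \, f'(x_t)$$

where:
- $x_t$ is the parameter value at step $t$,
- $\eta > 0$ is the learning rate (step size),
- $f'(x_t)$ is the derivative of $f$ evaluated at $x_t$.

For $f(x) = x^2$, $f'(x) = 2x$.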
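For a function $f: \mathbb{R}^n \to \mathbb{R}$ with $\mathbf{x} \in \mathbb{R}^n$:

$$\mathbf{x}_{t+1} = \mathbf{x}_t - \eta \, \nabla f(\mathbf{x}_t)$$

where $\nabla f(\mathbf{x}_t)$ is the vector of partial derivatives (gradient).
For $f(\mathbf{x}) = \|\mathbf{x}\|^2$, $\nabla f(\mathbf{x}) = 2\mathbf{x}$.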
Let’s do it by hand (without autograd for now).
def grad_fx(x: float) -> float:
    """Return the derivative of f(x) = x^2 at ``x``, which is 2x."""
    return 2 * x
# Minimize f(x) = x^2 by hand with a fixed number of gradient steps.
x = 5.0  # Start far from zero
eta = 0.1  # Learning rate
trajectory = [x]  # Every iterate, starting point included
for _ in range(20):
    # Step against the derivative: x <- x - eta * f'(x)
    x = x - eta * grad_fx(x)
    trajectory.append(x)
print("Final x:", x)
import numpy as np
import matplotlib.pyplot as plt
# The function and its minimum
def fx(x):
    """Return f(x) = x^2.

    Uses only ``**``, so it accepts a scalar or a NumPy array (squared
    elementwise).
    """
    return x**2
# Reuse the iterates recorded in `trajectory` by the previous demo.
steps = np.array(trajectory)
path_values = fx(steps)  # f evaluated at every iterate

# Descent path first, then a red cross at the minimum (0, 0).
plt.plot(steps, path_values, 'o-', label="Optimization Path")
plt.plot(0, 0, 'rx', markersize=12, label="Minimum")

# Decorate the axes and display the figure.
plt.title('Gradient Descent for $f(x) = x^2$')
plt.xlabel('x value')
plt.ylabel('f(x)')
plt.grid(True)
plt.legend()
plt.show()
import torch
def grad_f_vec(x: torch.Tensor) -> torch.Tensor:
    """Return ``2 * x``, the gradient of f(x) = ||x||^2 at ``x``.

    The result has the same shape as ``x``.
    """
    return 2 * x
# Minimize f(x) = ||x||^2 in 2D with plain gradient steps.
x: torch.Tensor = torch.tensor([5.0, -3.0], dtype=torch.float32)  # Initial point in 2D
eta_vec = 0.2
trajectory_vec = [x.clone()]  # Snapshots of every iterate, start included
for _ in range(15):
    # Step against the gradient: x <- x - eta * grad f(x)
    x = x - eta_vec * grad_f_vec(x)
    trajectory_vec.append(x.clone())
# Collapse the list of 16 snapshots into a single (16, 2) tensor.
trajectory_vec = torch.stack(trajectory_vec)
print("Final x:", x)
print("Norm at end:", torch.norm(x).item())
Let’s try different learning rates and see their effect.
# Run the same scalar descent on f(x) = x^2 with several learning rates.
# Each step maps x to (1 - 2 * lr) * x, so |1 - 2 * lr| < 1 converges and
# lr = 1.01 (factor -1.02) moves away from the minimum.
init_x = 5.0
learning_rates = [0.05, 0.2, 0.8, 1.01]
colors = ['b', 'g', 'r', 'orange']
plt.figure()
for lr, col in zip(learning_rates, colors):
    x = init_x  # Restart from the same point for every rate
    hist = [x]
    for _ in range(12):
        x = x - lr * grad_fx(x)
        hist.append(x)
    # One curve per learning rate, drawn on the shared figure.
    plt.plot(hist, fx(np.array(hist)), 'o-', color=col, label=f'LR={lr}')
plt.plot(0, 0, 'kx', markersize=12)  # Mark the minimum
plt.title('Gradient Descent Paths for Different Learning Rates')
plt.xlabel('x value')
plt.ylabel('f(x)')
plt.legend()
plt.grid(True)
plt.show()
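Put your new optimization skills to the test:
1. Minimize $f(x) = (x - 3)^2$ by hand-coded gradient descent, starting from $x = -7$ with learning rate $0.2$ for 20 steps.
2. Plot the optimization path from Exercise 1 and mark the minimum at $x = 3$.
3. Minimize $f(\mathbf{x}) = \|\mathbf{x} - [2, -1]\|^2$ in PyTorch, starting from $[5, 5]$ with learning rate $0.1$, and plot the 2D path.
4. Repeat Exercise 1 with learning rates $0.01$, $0.1$, $1.0$ and $1.5$ and compare the paths.

Reference solutions: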
import torch
import numpy as np
import matplotlib.pyplot as plt
# EXERCISE 1
def grad_fx_shifted(x: float) -> float:
    """Return the derivative of f(x) = (x - 3)^2 at ``x``, which is 2(x - 3)."""
    return 2 * (x - 3)
# Descend on f(x) = (x - 3)^2 from x = -7 and record every iterate.
x = -7.0
eta = 0.2
traj = [x]
for _ in range(20):
    x = x - eta * grad_fx_shifted(x)
    traj.append(x)
print(f"x = {x:.4f}")
# EXERCISE 2
# Plot the Exercise 1 iterates against f(x) = (x - 3)^2.
x_arr = np.array(traj)
f_arr = (x_arr - 3)**2
plt.plot(x_arr, f_arr, 'o-')
plt.plot(3, 0, 'rx', label='Minimum')  # Minimum sits at x = 3
plt.title('Optimization Path: Scalar')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.grid(True)
plt.legend()
plt.show()
# EXERCISE 3
def grad_fvec_shifted(x: torch.Tensor) -> torch.Tensor:
    """Return ``2 * (x - [2, -1])``, the gradient of f(x) = ||x - [2, -1]||^2.

    Args:
        x: Point at which to evaluate the gradient; must broadcast against a
            length-2 vector.

    Returns:
        The gradient at ``x``; it is zero exactly at ``[2, -1]``.
    """
    # Create the minimizer on x's device so the subtraction does not fail
    # with a device mismatch when x is not on the default device.
    minimizer = torch.tensor([2.0, -1.0], device=x.device)
    return 2 * (x - minimizer)
# Descend from (5, 5) on the shifted quadratic and record every iterate.
xv = torch.tensor([5.0, 5.0])
eta_vec = 0.1
traj_v = [xv.clone()]
for _ in range(20):
    xv = xv - eta_vec * grad_fvec_shifted(xv)
    traj_v.append(xv.clone())
# Stack the snapshots into one array: one row per iterate, one column per coordinate.
traj_v_np = torch.stack(traj_v).numpy()
target = np.array([2.0, -1.0])
plt.plot(traj_v_np[:, 0], traj_v_np[:, 1], 'o-', label='GD Path')
plt.plot(target[0], target[1], 'rx', label='Minimum')
plt.title('Gradient Descent Path: Vector')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True)
plt.show()
# EXERCISE 4
# Each step maps (x - 3) to (1 - 2 * lr) * (x - 3): lr = 1.0 flips the sign
# forever (bouncing between -7 and 13) and lr = 1.5 doubles the distance
# from the minimum on every step.
lrs = [0.01, 0.1, 1.0, 1.5]
plt.figure()
for lr in lrs:
    x = -7.0  # Same starting point for every learning rate
    h = [x]
    for _ in range(15):
        x = x - lr * grad_fx_shifted(x)
        h.append(x)
    # One curve per learning rate, drawn on the shared figure.
    plt.plot(h, [(hx-3)**2 for hx in h], 'o-', label=f'LR={lr}')
plt.plot(3, 0, 'kx', markersize=10, label='Minimum')
plt.legend()
plt.grid(True)
plt.title('Learning Rate Effect: Scalar')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.show()
You’ve stepped into the engine room of all learning systems: optimization via gradient descent.
Next: We’ll dive into automatic differentiation—how PyTorch “automagically” computes gradients for any function you can imagine. This will let you optimize neural networks, RL objectives, and more.
Keep experimenting with functions, rates, and dimensions; mastery of optimization is the key to all modern AI. See you in Part 2.2!