All of reinforcement learning is built on the formal concept of a Markov Decision Process (MDP). MDPs provide the mathematical framework underlying RL algorithms—from basic tabular Q-learning to modern deep RL.
In this post, you will:

- Define a small MDP (states, actions, transition probabilities, rewards, and a discount factor) in plain Python.
- Simulate episodes under a random policy and trace how rewards accumulate.
- Visualize the state-transition graph with networkx.
- Compare simple fixed-action policies ("always left" vs. "always right").
An MDP is defined by the tuple $(\mathcal{S}, \mathcal{A}, P, R, \gamma)$: a set of states $\mathcal{S}$, a set of actions $\mathcal{A}$, transition probabilities $P(s' \mid s, a)$, a reward function $R(s, a, s')$, and a discount factor $\gamma \in [0, 1)$.
Agent-Environment Loop: At each timestep $t$, the agent picks action $a_t$ in state $s_t$, and the environment returns reward $r_{t+1}$ and the next state $s_{t+1}$ according to $P$ and $R$.
Policy: $\pi(a \mid s)$ gives the probability that the agent chooses action $a$ in state $s$.
Expected Return: $G_t = \sum_{k=0}^{\infty} \gamma^k r_{t+k+1}$, the discounted sum of rewards the agent collects from timestep $t$ onward.
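To make the return concrete, here is a quick numerical sketch using an illustrative reward sequence (the values are made up for the example) and the same discount factor $\gamma = 0.9$ we use below.

# Discounted return G_0 for an example reward sequence (illustrative values)
rewards = [1.0, 3.0, 0.0, 2.0]
gamma = 0.9
G0 = sum((gamma ** k) * r for k, r in enumerate(rewards))
print(G0)  # 1.0 + 0.9*3.0 + 0.81*0.0 + 0.729*2.0 = 5.158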
Implementing an MDP directly in Python lets you simulate a task whose dynamics you know exactly, so you can test different policies systematically against a known ground truth.
We’ll define a tiny 3-state MDP with 2 actions.
from typing import Dict, Tuple, List
import numpy as np
states: List[str] = ['A', 'B', 'C']
actions: List[str] = ['left', 'right']
# Transition probabilities: P[s][a] = list of (prob, next_state)
P: Dict[str, Dict[str, List[Tuple[float, str]]]] = {
'A': {'left': [(1.0, 'A')],
'right': [(1.0, 'B')]},
'B': {'left': [(0.8, 'A'), (0.2, 'C')],
'right': [(1.0, 'C')]},
'C': {'left': [(1.0, 'B')],
'right': [(1.0, 'C')]}
}
# Rewards: R[s][a][s'] = reward
R: Dict[str, Dict[str, Dict[str, float]]] = {
'A': {'left': {'A': 0.0},
'right': {'B': 1.0}},
'B': {'left': {'A': 0.0, 'C': 2.0},
'right': {'C': 3.0}},
'C': {'left': {'B': 0.0},
'right': {'C': 0.0}}
}
gamma: float = 0.9
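Before simulating anything, a quick sanity check (a small extra step, assuming the P dictionary above) confirms that every (state, action) pair defines a proper probability distribution:

# Sanity check: each P[s][a] should sum to 1
for s in P:
    for a in P[s]:
        total_prob = sum(prob for prob, _ in P[s][a])
        assert abs(total_prob - 1.0) < 1e-9, f"P[{s}][{a}] sums to {total_prob}"
print("All transition distributions sum to 1.")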
Let’s run episodes under a random policy (choose actions uniformly in each state).
import random
def sample_next_state(s: str, a: str) -> str:
    # Unzip the (probability, next_state) pairs and sample accordingly
    probs, next_states = zip(*P[s][a])
    return random.choices(next_states, weights=probs)[0]

def get_reward(s: str, a: str, s2: str) -> float:
    return R[s][a][s2]

def run_episode(policy: Dict[str, str] | None = None, max_steps: int = 10) -> float:
    """Roll out one episode from state 'A'; use uniform-random actions when no policy is given."""
    s = 'A'
    total_reward = 0.0
    trajectory = []
    for t in range(max_steps):
        a = policy[s] if policy else random.choice(actions)
        s2 = sample_next_state(s, a)
        r = get_reward(s, a, s2)
        total_reward += r
        trajectory.append((s, a, r, s2))
        s = s2
    print("Trajectory:", trajectory)
    print("Total reward:", total_reward)
    return total_reward
ep_rewards = [run_episode() for _ in range(5)]
print("Average reward:", np.mean(ep_rewards))
We’ll use networkx to show how each (state, action) pair leads to other states.
import networkx as nx
import matplotlib.pyplot as plt
G = nx.MultiDiGraph()
for s in P:
    for a in P[s]:
        for prob, s2 in P[s][a]:
            label = f"{a} ({prob})"
            G.add_edge(s, s2, label=label)

pos = nx.spring_layout(G, seed=1)
plt.figure(figsize=(5, 4))
nx.draw(G, pos, with_labels=True, node_size=1200, node_color="skyblue", font_weight="bold")
# Keying labels by (u, v) keeps only one label per node pair, so parallel
# edges (e.g. both actions leading from B to C) show a single label.
edge_labels = {(u, v): d['label'] for u, v, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=9)
plt.title("MDP State Transition Graph")
plt.show()
Let’s try a greedy policy: always take "right".
# Define a deterministic policy
simple_policy = {s: "right" for s in states}
# Run several episodes and compute average reward
rewards_right = [run_episode(simple_policy) for _ in range(20)]
print("Right-only policy: avg reward =", np.mean(rewards_right))
# Try left-only for contrast
left_policy = {s: "left" for s in states}
rewards_left = [run_episode(left_policy) for _ in range(20)]
print("Left-only policy: avg reward =", np.mean(rewards_left))
Exercises (solutions below): (1) define the MDP's states, actions, transitions, and rewards; (2) run episodes and print each step; (3) use networkx (or matplotlib) to draw the states and arrows for transitions, labelling by action and probability; (4) compare the right-only and left-only policies.

import random
from typing import Dict, Tuple, List
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
# Exercise 1
states = ['A', 'B', 'C']
actions = ['left', 'right']
P = {
'A': {'left': [(1.0, 'A')], 'right': [(1.0, 'B')]},
'B': {'left': [(0.8, 'A'), (0.2, 'C')], 'right': [(1.0, 'C')]},
'C': {'left': [(1.0, 'B')], 'right': [(1.0, 'C')]}
}
R = {
'A': {'left': {'A': 0.0}, 'right': {'B': 1.0}},
'B': {'left': {'A': 0.0, 'C': 2.0}, 'right': {'C': 3.0}},
'C': {'left': {'B': 0.0}, 'right': {'C': 0.0}}
}
def sample_next_state(s, a):
    return random.choices([ns for _, ns in P[s][a]], [p for p, _ in P[s][a]])[0]

def get_reward(s, a, s2):
    return R[s][a][s2]
# Exercise 2
def run_episode(policy=None, max_steps=8):
    s = 'A'
    total = 0.0
    for t in range(max_steps):
        a = policy[s] if policy else random.choice(actions)
        s2 = sample_next_state(s, a)
        r = get_reward(s, a, s2)
        print(f"t={t}, s={s}, a={a}, r={r}, s'={s2}")
        total += r
        s = s2
    print('Total reward:', total)
    return total  # returned so Exercise 4 can average over episodes
run_episode()
# Exercise 3
G = nx.MultiDiGraph()
for s in P:
    for a in P[s]:
        for prob, s2 in P[s][a]:
            lbl = f"{a} ({prob})"
            G.add_edge(s, s2, label=lbl)
pos = nx.spring_layout(G, seed=2)
plt.figure(figsize=(5,4))
nx.draw(G, pos, with_labels=True, node_size=1300, node_color='orange')
edge_labels = {(u,v):d['label'] for u,v,d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
plt.title("State Transitions"); plt.show()
# Exercise 4
right_policy = {s: 'right' for s in states}
left_policy = {s: 'left' for s in states}
rewards_right = [run_episode(right_policy) for _ in range(10)]
rewards_left = [run_episode(left_policy) for _ in range(10)]
print("Avg reward right:", np.mean(rewards_right))
print("Avg reward left:", np.mean(rewards_left))
You now know how to represent the key elements of RL—the MDP—in code and visually! You can simulate random and fixed-action policies, trace how rewards accumulate, and see the literal map of your agent’s world. Next, you’ll tackle bandit problems and step further into the heart of RL: balancing exploration and exploitation.
See you in Part 4.3!