{
"cells": [
{
"cell_type": "markdown",
"source": [
"Some imports just to render the results in colab"
],
"metadata": {
"id": "DvGm0V_AWiEw"
}
},
{
"cell_type": "code",
"source": [
"# Use %pip (not !pip) so the packages are installed into the running kernel's environment\n",
"%pip install swig\n",
"%pip install gymnasium[box2d]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "E05FHRDUNm5Q",
"outputId": "b4803c1e-d2c1-4cd8-846b-ba7d2e9cdd61"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: swig in /usr/local/lib/python3.10/dist-packages (4.2.1)\n",
"Requirement already satisfied: gymnasium[box2d] in /usr/local/lib/python3.10/dist-packages (0.29.1)\n",
"Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from gymnasium[box2d]) (1.25.2)\n",
"Requirement already satisfied: cloudpickle>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from gymnasium[box2d]) (2.2.1)\n",
"Requirement already satisfied: typing-extensions>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from gymnasium[box2d]) (4.10.0)\n",
"Requirement already satisfied: farama-notifications>=0.0.1 in /usr/local/lib/python3.10/dist-packages (from gymnasium[box2d]) (0.0.4)\n",
"Requirement already satisfied: box2d-py==2.3.5 in /usr/local/lib/python3.10/dist-packages (from gymnasium[box2d]) (2.3.5)\n",
"Requirement already satisfied: pygame>=2.1.3 in /usr/local/lib/python3.10/dist-packages (from gymnasium[box2d]) (2.5.2)\n",
"Requirement already satisfied: swig==4.* in /usr/local/lib/python3.10/dist-packages (from gymnasium[box2d]) (4.2.1)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"\n",
"from gymnasium.wrappers import RecordVideo\n",
"import glob\n",
"import io\n",
"import base64\n",
"from IPython.display import HTML\n",
"from IPython import display\n",
"\n",
"\"\"\"\n",
"Utility functions to enable video recording of gym environment\n",
"and displaying it.\n",
"To enable video, just wrap the env: env = RecordVideo(env, 'video')\n",
"\"\"\"\n",
"\n",
"def show_video():\n",
"    \"\"\"Find the first .mp4 under ./video and render it inline as a\n",
"    base64-embedded HTML5 <video> element (works in Colab/Jupyter).\"\"\"\n",
"    mp4list = glob.glob('video/*.mp4')\n",
"    if len(mp4list) > 0:\n",
"        mp4 = mp4list[0]\n",
"        video = io.open(mp4, 'r+b').read()\n",
"        encoded = base64.b64encode(video)\n",
"        # Bug fix: the HTML template here was the empty string '''''' so the\n",
"        # call displayed nothing. Restore the standard <video> tag that embeds\n",
"        # the recording as a base64 data URI.\n",
"        display.display(HTML(data='''<video alt=\"video\" autoplay loop controls style=\"height: 400px;\">\n",
"            <source src=\"data:video/mp4;base64,{0}\" type=\"video/mp4\" />\n",
"        </video>'''.format(encoded.decode('ascii'))))\n",
"    else:\n",
"        print(\"Could not find video\")\n",
""
],
"metadata": {
"id": "Q5PGqOlHWjC5"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"We will work on the Lunar-Lander environment:\n",
"\n",
"https://gymnasium.farama.org/environments/box2d/lunar_lander/"
],
"metadata": {
"id": "70l66Noa6ECO"
}
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "NaYswgHeLhvq"
},
"outputs": [],
"source": [
"import gymnasium as gym\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from collections import deque\n",
"\n",
"env = gym.make(\"LunarLander-v2\", render_mode=\"human\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "k9w5IYGJLhvt"
},
"source": [
"From the previous notebook, we can obtain the action and state space"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "U_QvDHr8Lhvv",
"outputId": "f2a706da-c7dd-4d95-f67d-993772eda966"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Actions : Discrete(4)\n",
"Variables: Box([-1.5 -1.5 -5. -5. -3.1415927 -5.\n",
" -0. -0. ], [1.5 1.5 5. 5. 3.1415927 5. 1.\n",
" 1. ], (8,), float32)\n",
"Max. var: [1.5 1.5 5. 5. 3.1415927 5. 1.\n",
" 1. ]\n",
"Min. var: [-1.5 -1.5 -5. -5. -3.1415927 -5.\n",
" -0. -0. ]\n"
]
}
],
"source": [
"print('Actions : ',env.action_space)\n",
"print('Variables: ',env.observation_space)\n",
"print('Max. var: ',env.observation_space.high)\n",
"print('Min. var: ',env.observation_space.low)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DH_8llutLhvw"
},
"source": [
"The 8 state variables are the lander's (x, y) position, its (x, y) linear velocities, its angle and angular velocity, and two booleans indicating whether each leg touches the ground. The 4 discrete actions are: do nothing, fire left engine, fire main engine, fire right engine.\n",
"\n",
"Let's see performance of random behaviour"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 562
},
"id": "FCqI1P6hLhvw",
"outputId": "214df23f-2473-40a8-f8ae-2dab408bfd77"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/gymnasium/wrappers/record_video.py:94: UserWarning: \u001b[33mWARN: Overwriting existing videos at /content/video folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)\u001b[0m\n",
" logger.warn(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Moviepy - Building video /content/video/rl-video-episode-0.mp4.\n",
"Moviepy - Writing video /content/video/rl-video-episode-0.mp4\n",
"\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": []
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Moviepy - Done !\n",
"Moviepy - video ready /content/video/rl-video-episode-0.mp4\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
""
]
},
"metadata": {}
}
],
"source": [
"# Re-create the env with rgb_array rendering wrapped in RecordVideo so a\n",
"# random-policy episode (at most 300 steps) is saved to ./video and then\n",
"# displayed inline with show_video().\n",
"env.close()\n",
"env = RecordVideo(gym.make(\"LunarLander-v2\",render_mode='rgb_array'),'video')\n",
"\n",
"observation, _ = env.reset()\n",
"\n",
"for _ in range(300):\n",
" env.render()\n",
" action = env.action_space.sample() # this takes random actions\n",
" observation, reward, terminated , truncated, info = env.step(action)\n",
" if terminated or truncated:\n",
" break\n",
"env.close()\n",
"show_video()"
]
},
{
"cell_type": "markdown",
"source": [
"# Q-learning with DNN\n",
"\n",
"## Some definitions\n",
"\n",
"1. Create the Q Neural Networks and initialize them\n"
],
"metadata": {
"id": "ROr0NfAVO0MH"
}
},
{
"cell_type": "code",
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"# Q-network: maps a state vector to one Q-value per discrete action\n",
"# through two 64-unit ReLU hidden layers.\n",
"class Qvalue(nn.Module):\n",
" def __init__(self, input, output):\n",
" super(Qvalue, self).__init__()\n",
" self.fc1 = nn.Linear(input, 64)\n",
" self.fc2 = nn.Linear(64, 64)\n",
" self.Q = nn.Linear(64, output)\n",
"\n",
" def forward(self, s):\n",
" x = self.fc1(s)\n",
" x = F.relu(x)\n",
" x = self.fc2(x)\n",
" x = F.relu(x)\n",
" output = self.Q(x)\n",
"\n",
" return output\n",
"\n",
"# Orthogonal initialisation for every Linear layer (applied via model.apply).\n",
"def init_weights(m):\n",
" if type(m) == nn.Linear:\n",
" nn.init.orthogonal_(m.weight)\n",
"\n",
"# Hard update: copy all parameters from source into target.\n",
"def copy_target(target, source):\n",
" for target_param, param in zip(target.parameters(), source.parameters()):\n",
" target_param.data.copy_(param.data)\n",
"\n",
"# Polyak (soft) update: target <- tau*local + (1-tau)*target.\n",
"def soft_update(target_model, local_model, tau):\n",
" for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):\n",
" target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)\n",
""
],
"metadata": {
"id": "JHYosPQHO7Aq"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Online network Q and target network Q_target share the same architecture:\n",
"# input = state dimension (8), output = number of discrete actions (4).\n",
"Q = Qvalue(env.observation_space.shape[0],env.action_space.n).to(device)\n",
"Q_target = Qvalue(env.observation_space.shape[0],env.action_space.n).to(device)\n",
"\n",
"Q.apply(init_weights) # orthogonal init of all Linear layers\n",
"\n",
"copy_target(Q_target, Q) # start the target net identical to the online net\n",
"\n",
"criterion=nn.MSELoss()\n",
"optimizer=torch.optim.Adam(Q.parameters(),lr=5e-4)\n"
],
"metadata": {
"id": "HG0Wz-PGDgVx"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "3d6nnayNLhv1"
},
"source": [
"2. Definition of the Experience Replay buffer"
]
},
{
"cell_type": "code",
"source": [
"import random\n",
"import numpy as np\n",
"\n",
"# Fixed-capacity circular experience-replay buffer holding\n",
"# (state, action, reward, next_state, terminated, truncated) tuples.\n",
"# NOTE: the seed is set on the *global* random module, not a private RNG.\n",
"class ReplayMemory:\n",
" def __init__(self, capacity, seed):\n",
" random.seed(seed)\n",
" self.capacity = capacity\n",
" self.buffer = []\n",
" self.position = 0\n",
"\n",
" def push(self, state, action, reward, next_state, terminated, truncated):\n",
" if len(self.buffer) < self.capacity:\n",
" self.buffer.append(None)\n",
" self.buffer[self.position] = (state, action, reward, next_state, terminated, truncated)\n",
" self.position = (self.position + 1) % self.capacity\n",
"\n",
" def sample(self, batch_size):\n",
" batch = random.sample(self.buffer, min(len(self.buffer), batch_size))\n",
" state, action, reward, next_state, terminated, truncated = map(np.stack, zip(*batch))\n",
" return state, action, reward, next_state, terminated, truncated\n",
"\n",
" def __len__(self):\n",
" return len(self.buffer)\n",
"\n"
],
"metadata": {
"id": "tacQaa6WkScb"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "7RwwqDVMLhvz"
},
"source": [
"\n",
"Function that implements the epsilon-greedy procedure"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "KfDwNCIDLhv1"
},
"outputs": [],
"source": [
"# Epsilon-greedy action selection over the Q-values `Qs`: with probability\n",
"# `epsilon` (a global, decayed per episode in the training loop) sample a\n",
"# random action from the global `env`, otherwise act greedily (argmax of Qs).\n",
"def e_greedy_policy(Qs):\n",
" return env.action_space.sample() if (np.random.random() <= epsilon) else np.argmax(Qs)\n"
]
},
{
"cell_type": "markdown",
"source": [
"Function just to collect performance of *niter* iterations of the greedy policy (no exploration)"
],
"metadata": {
"id": "M7vgMaJP8TF8"
}
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "CYbG98YULhv2"
},
"outputs": [],
"source": [
"# Average undiscounted return of `niter` purely greedy episodes (argmax of Q,\n",
"# no exploration), each capped at 1000 steps; uses globals Q, env, device.\n",
"def rollout(niter):\n",
" G = 0\n",
" for i in range(niter):\n",
" state, _ = env.reset()\n",
" for _ in range(1000):\n",
" Qs = Q(torch.tensor(state).to(device)).detach().cpu().numpy()\n",
" action = np.argmax(Qs)\n",
" next_state, reward, terminated , truncated, info = env.step(action)\n",
" G += reward\n",
" if truncated or terminated: break\n",
" state = next_state\n",
" return G/niter\n"
]
},
{
"cell_type": "markdown",
"source": [
"## Start the learning"
],
"metadata": {
"id": "BQVAZrpS8Iwr"
}
},
{
"cell_type": "code",
"source": [
"env.close()\n",
"env = gym.make(\"LunarLander-v2\")\n"
],
"metadata": {
"id": "C4tmSE9G3u1X"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"ER = ReplayMemory(int(1e6),1)"
],
"metadata": {
"id": "p15LxZML7jES"
},
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Mw6qKMb3Lhv3",
"outputId": "7c5655d4-afa3-47be-814b-77aa030b193a"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" episode: 99 avg reward: -104.66 epsilon: 0.61 steps 9804\n",
" episode: 199 avg reward: -22.83 epsilon: 0.37 steps 47752\n",
" episode: 299 avg reward: 57.36 epsilon: 0.22 steps 125610\n",
" episode: 399 avg reward: 151.07 epsilon: 0.14 steps 200650\n",
" episode: 499 avg reward: 212.47 epsilon: 0.08 steps 254516\n",
" episode: 599 avg reward: 188.89 epsilon: 0.05 steps 305507\n",
" episode: 699 avg reward: 208.04 epsilon: 0.03 steps 351906\n",
" episode: 700 avg reward: 208.08\n",
" elapsed 20.92 minutes\n"
]
}
],
"source": [
"# DQN training loop: per environment step, push the transition into the\n",
"# replay buffer and — once a full batch exists — do one gradient step on Q\n",
"# followed by a Polyak update of Q_target.\n",
"import time\n",
"t = time.time()\n",
"\n",
"# Parameters of learning\n",
"gamma = 0.99\n",
"numEpisodes = 800 # 1000\n",
"epsilon = 1\n",
"\n",
"# Variables to collect scores of 100 episodes\n",
"rewardTracker = [] # with exploration\n",
"scores_window = deque(maxlen=100)\n",
"all_rewards=[]\n",
"\n",
"# NN parameters\n",
"batch_size=64\n",
"steps=0\n",
"for episodeNum in range(1,numEpisodes+1):\n",
" G = 0\n",
" state, _ = env.reset()\n",
"\n",
" while True:\n",
"\n",
" # Collect data\n",
" with torch.no_grad():\n",
" Qs = Q(torch.tensor(state).to(device)).detach().cpu().numpy()\n",
" action = e_greedy_policy(Qs) # select action with epsilon-greedy procedure\n",
" next_state, reward, terminated , truncated, info = env.step(action)\n",
" ER.push(state, action, reward, next_state, terminated, truncated)\n",
" G= G+reward\n",
" state = next_state\n",
" steps=steps+1\n",
"\n",
" # Train\n",
" if batch_size<=len(ER):\n",
" # Get batch from experience replay\n",
" state_batch, action_batch, reward_batch, next_state_batch, terminated_batch, truncated_batch = ER.sample(batch_size)\n",
" state_batch = torch.FloatTensor(state_batch).to(device)\n",
" action_batch = torch.Tensor(action_batch).to(dtype=torch.long).to(device).unsqueeze(1)\n",
" next_state_batch = torch.FloatTensor(next_state_batch).to(device)\n",
" reward_batch = torch.FloatTensor(reward_batch).to(device).unsqueeze(1)\n",
" terminated_batch = torch.FloatTensor(terminated_batch).to(dtype=torch.long).to(device).unsqueeze(1)\n",
" truncated_batch = torch.FloatTensor(truncated_batch).to(dtype=torch.long).to(device).unsqueeze(1)\n",
"\n",
" #q_targets_next = torch.gather(Q_target(next_state_batch).detach(),1,torch.argmax(Q(next_state_batch).detach(),dim=1,keepdim = True)) # Double Q-learning\n",
" q_targets_next = torch.max(Q_target(next_state_batch).detach(),dim=1,keepdim = True)[0] # Standard DQN\n",
"\n",
" # Zero the bootstrap term only on true termination; episodes cut by the\n",
" # time limit (truncated) still bootstrap through the next state.\n",
" #target = reward_batch+ (1-(truncated_batch + terminated_batch)) *gamma*q_targets_next\n",
" target = reward_batch+ (1-(terminated_batch)) *gamma*q_targets_next\n",
"\n",
" q_expected = torch.gather(Q(state_batch),1,action_batch)\n",
"\n",
" loss = criterion(q_expected, target) #MSE\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" soft_update(Q_target, Q, 1e-3) # Polyak update of the target net each step\n",
"\n",
" if truncated or terminated: break\n",
"\n",
" scores_window.append(G)\n",
" all_rewards.append(G)\n",
" # Exponential decay of exploration per episode, floored at 0.01\n",
" epsilon = max(0.01, 0.995*epsilon)\n",
" print('\\r episode:',episodeNum,'avg reward:', np.round(np.mean(scores_window),2),end='')\n",
"\n",
" if episodeNum%100==99:\n",
" rewardTracker.append(np.mean(scores_window))\n",
" print('\\r episode:',episodeNum,'avg reward:', np.round(np.mean(scores_window),2), 'epsilon:', np.round(epsilon,2),'steps',steps)\n",
"\n",
"elapsed = time.time() - t\n",
"\n",
"print('\\n elapsed', np.round(elapsed/60,2),'minutes')\n"
]
},
{
"cell_type": "markdown",
"source": [
"See plots of learning. All rewards and average of 100 experiments"
],
"metadata": {
"id": "CPXWxcla7qHE"
}
},
{
"cell_type": "code",
"source": [
"plt.plot(all_rewards)\n",
"plt.plot(list(range(50,(len(rewardTracker)+1)*100-50,100)),rewardTracker,color='r')\n",
"plt.show()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 430
},
"id": "1m7s5chI7VyW",
"outputId": "8b7ac6d6-2516-4007-8be0-687a9d408aca"
},
"execution_count": 15,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"