From f1b7a58e705ea300c2622389ee7b2b61e8d2b13c Mon Sep 17 00:00:00 2001
From: mGalarnyk <mgalarny@gmail.com>
Date: Fri, 30 Apr 2021 14:40:18 -0700
Subject: [PATCH] Added some text comments that you may wish to consider.

---
 .DS_Store                               |  Bin 0 -> 8196 bytes
 ray_summit_2021/TestingSven.ipynb       | 1154 +++++++++++++++++++++++
 ray_summit_2021/tutorial_notebook.ipynb |  235 ++++-
 3 files changed, 1353 insertions(+), 36 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 ray_summit_2021/TestingSven.ipynb

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..a32dfee46b1e7bdc56281e5a92d4413ca6b8f67e
GIT binary patch
literal 8196
zcmeHMTWl0n82<mZz|4U3l#6r-iwi}dP;g6uQZBN+K?@}%bfGPjF0(raI&nHvW@fh_
zl2q{)V=!J46H#Lj^#Kx}G)jC@RE);NNQ|23gTDA^G?@6L|Cuu_7HAqnq9n}Oob#Xg
zZ|BT+zL~RU7XYxSVAcXu0)RqQNH(3Cn-pOe^@dUsY05|<#RK?IfI<3L5ZvG$D`Es<
z1Y!hY1Y!hY1a1ceXwMc!nPA_S*0_xkh!MCW5fJZ(Bvm2PksOgSe05M09sx*-BS2WF
zPv?LzCK<_eBuAtSLn)^`Jzx}y(Gde>IN6iJon$(aBT~w6Kp74gjf~L=1;f!PE*ZK5
zrlpMA7=ajp8xatXPdRvCz;VcqJHKaP4Wy{n=7#;DobBew^J;F$w)`NK`W8z{%ce}7
zrl{I<Rh_A-2fGH`;jX|43VxH}W!Zu*%j7x3>$c2bAj6$jn-{s$>2(+Z_YMf>jO*Iu
zAp1scpT&o>Y)-T51cv2sFU%)%^mSW~=?*u!1;_M5|0sFb-)#f~lt-yb9~ueoPgRsG
zt2}X{wxPBrS=+GgWKHr!eSNAXxwdZo$&-qr=*zb4=`TET`lU1H-gxt^4?hZ{0HZex
zRLJYG_7e3bfynlY%@U%L7jFiU-o8WY$g){Qy2;L=z;YdbANPEcSyt{780~r2+3j&*
z@6Nk+VbJljtRipN`GQS|HQIKU^&t0q0)8wIz5TxH1<j#jma&4*_i*yFioAuq{T_?b
z2{(r7`5AK)i<hoUt=qV@<;q<%%bBLlp3_~ld@E=3Uc=9G$F!UydxynB?I*yUz&dPk
zPgN`Mc*ZdX`IWopRWg0PzF;&;`d*7wSe4RK$XQ1VR&ZR7p(huq3VWx|r0MK(O_9+I
zExMk(S5XcKeY3}@m1<3|UO@py#-~|F(_DR@sxm+G04=@@)#}zNDl_-mMvmK&YC{95
zc6JvfnvrtL{R+*#rYG+X4$}SjQmr>ND+;^ZFUIaL{GfA)J2ZTFcS(J_qVx(&qiJ%J
z$h2Ipe~5rul}<%;P(X*(v?sMeA8;6gQ}8sLhG*amoP!JSE_?u=!e#gjzJRabd-w@{
zf#2YFxC(#3U#Osp8qPo+7h@GJ!4x*&M%;v3unBi!2kyd7%-})n$1ECn1c&e_dKlnC
z_!vHpPvBGdJid&t;H&r=zK!qU`*;yA;m7zTevMb;{7Obiyepb-B=867OitqHIWOtI
z8Hp3E+qAa-hs58_5X`NZHG9t7g;mQ}t*PJKIJUUgt>-_f6|zavC8W_PO~OST&rM>1
zK3`jq*h_5V$F6Z(a($HfY?|PeA~>VplUS5sf>82uNn&Y&l@RCUWp!eu#!89NYUsRL
zOC{J8BCoX8Y3nprM(mc>4cf*8n@T*E)-Bprje&Unq*+ZY)fDyLbMh>lhl}tDd`?{a
z27ZK};aB*Rc&K7I&LbAi#{@3KWmt_Xum)G*dfY&a+=QF46}Mv>?jTlXa36MKFYd=Y
zay*O!XoZ|C;4mJ;hw&6XO3Zu`pT*~hoi7kO&*JOXak2e6E?$_Ni^VR_v0di~d5$uA
ze3JO;))6R`DJsSJzjfl@|8ISuI75s;jKFP+0G73MwKUQ6CVJJ1vv!i|A*!MXzY!_J
s5Nd>r(2MhgkYj%sQa>5#(vcjIk}OpI^B)4}OLx5g$NPUszB{=28!Pr1_W%F@

literal 0
HcmV?d00001

diff --git a/ray_summit_2021/TestingSven.ipynb b/ray_summit_2021/TestingSven.ipynb
new file mode 100644
index 0000000..e89ac75
--- /dev/null
+++ b/ray_summit_2021/TestingSven.ipynb
@@ -0,0 +1,1154 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e18b1a68",
+   "metadata": {},
+   "source": [
+    "# Hands-on RL with Ray’s RLlib\n",
+    "## A beginner’s tutorial for working with multi-agent environments, models, and algorithms\n",
+    "\n",
+    "### Overview\n",
+    "“Hands-on RL with Ray’s RLlib” is a beginner’s tutorial for working with reinforcement learning (RL) environments, models, and algorithms using Ray’s RLlib library. RLlib offers high scalability, a large list of algorithms to choose from (offline, model-based, model-free, etc.), support for TensorFlow and PyTorch, and a unified API for a variety of applications. This tutorial includes a brief introduction to provide an overview of concepts (e.g. why RL) before proceeding to RLlib (multi- and single-agent) environments, neural network models, hyperparameter tuning, debugging, student exercises, Q&A, and more. All code will be provided as .py files in a GitHub repo.\n",
+    "\n",
+    "### Intended Audience\n",
+    "* Python programmers who want to get started with reinforcement learning and RLlib.\n",
+    "\n",
+    "### Prerequisites\n",
+    "* Some Python programming experience.\n",
+    "* Some familiarity with machine learning.\n",
+    "* *Helpful, but not required:* Experience in reinforcement learning and Ray.\n",
+    "* *Helpful, but not required:* Experience with TensorFlow or PyTorch.\n",
+    "\n",
+    "### Requirements/Dependencies\n",
+    "\n",
+    "Install conda (https://www.anaconda.com/products/individual)\n",
+    "\n",
+    "Then ...\n",
+    "\n",
+    "#### Quick `conda` setup instructions (Linux):\n",
+    "\n",
+    "```\n",
+    "$ conda create -n rllib python=3.8\n",
+    "$ conda activate rllib\n",
+    "$ pip install \"ray[rllib]\"\n",
+    "$ pip install tensorflow # <- either one works!\n",
+    "$ pip install torch  # <- either one works!\n",
+    "$ pip install jupyterlab\n",
+    "```\n",
+    "\n",
+    "#### Quick `conda` setup instructions (Mac):\n",
+    "\n",
+    "Note: the Mac setup additionally installs `cmake` (see https://github.com/actions/setup-python/issues/121).\n",
+    "\n",
+    "```\n",
+    "$ conda create -n rllib python=3.8\n",
+    "$ conda activate rllib\n",
+    "$ pip install cmake \"ray[rllib]\"\n",
+    "$ pip install tensorflow # <- either one works!\n",
+    "$ pip install torch  # <- either one works!\n",
+    "$ pip install jupyterlab\n",
+    "```\n",
+    "\n",
+    "#### Quick `conda` setup instructions (Win10):\n",
+    "```\n",
+    "$ conda create -n rllib python=3.8\n",
+    "$ conda activate rllib\n",
+    "$ pip install \"ray[rllib]\"\n",
+    "$ pip install tensorflow # <- either one works!\n",
+    "$ pip install torch  # <- either one works!\n",
+    "$ pip install jupyterlab\n",
+    "$ conda install pywin32\n",
+    "```\n",
+    "\n",
+    "Also, for Win10 Atari support, we have to install atari_py from a different source (gym does not support Atari envs on Windows).\n",
+    "\n",
+    "```\n",
+    "$ pip install git+https://github.com/Kojoley/atari-py.git\n",
+    "```\n",
+    "\n",
+    "### Opening these tutorial files:\n",
+    "```\n",
+    "$ git clone https://github.com/sven1977/rllib_tutorials\n",
+    "$ cd rllib_tutorials\n",
+    "$ jupyter-lab\n",
+    "```\n",
+    "\n",
+    "### Key Takeaways\n",
+    "* What is reinforcement learning and why RLlib?\n",
+    "* Core concepts of RLlib: Environments, Trainers, Policies, and Models.\n",
+    "* How to configure, hyperparameter-tune, and parallelize RLlib.\n",
+    "* RLlib debugging best practices.\n",
+    "\n",
+    "### Tutorial Outline\n",
+    "1. RL and RLlib in a nutshell.\n",
+    "1. Defining an RL-solvable problem: Our first environment.\n",
+    "1. Exercise No.1 (env loop)\n",
+    "1. Picking an algorithm and training our first RLlib Trainer.\n",
+    "1. Configurations and hyperparameters - Easy tuning with Ray Tune.\n",
+    "1. Fixing our experiment's config - Going multi-agent.\n",
+    "1. The \"infinite laptop\": Quick intro to using RLlib with Anyscale's product.\n",
+    "1. Exercise No.2 (run your own Ray RLlib+Tune experiment)\n",
+    "1. Neural network models - Provide your custom models using tf.keras or torch.nn.\n",
+    "1. Deeper dive into RLlib's parallelization architecture.\n",
+    "1. Specifying different compute resources and parallelization options through our config.\n",
+    "1. \"Hacking in\": Using callbacks to customize the RL loop and generate our own metrics.\n",
+    "1. Exercise No.3 (write your own custom callback)\n",
+    "1. \"Hacking in (part II)\" - Debugging with RLlib and PyCharm.\n",
+    "1. Checking on the \"infinite laptop\" - Did RLlib learn to solve the problem?\n",
+    "\n",
+    "### Other Recommended Readings\n",
+    "* [Attention Nets and More with RLlib's Trajectory View API](https://medium.com/distributed-computing-with-ray/attention-nets-and-more-with-rllibs-trajectory-view-api-d326339a6e65)\n",
+    "* [Intro to RLlib: Example Environments](https://medium.com/distributed-computing-with-ray/intro-to-rllib-example-environments-3a113f532c70)\n",
+    "* [Reinforcement Learning with RLlib in the Unity Game Engine](https://medium.com/distributed-computing-with-ray/reinforcement-learning-with-rllib-in-the-unity-game-engine-1a98080a7c0d)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0b5d3fb6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/michaelgalarnyk/anaconda3/envs/rllib/lib/python3.8/site-packages/ray/autoscaler/_private/cli_logger.py:57: FutureWarning: Not all Ray CLI dependencies were found. In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will only be usable via `pip install 'ray[default]'`. Please update your install command.\n",
+      "  warnings.warn(\n",
+      "2021-04-30 14:12:53,514\tINFO services.py:1267 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'node_ip_address': '192.168.4.37',\n",
+       " 'raylet_ip_address': '192.168.4.37',\n",
+       " 'redis_address': '192.168.4.37:6379',\n",
+       " 'object_store_address': '/tmp/ray/session_2021-04-30_14-12-52_489500_78109/sockets/plasma_store',\n",
+       " 'raylet_socket_name': '/tmp/ray/session_2021-04-30_14-12-52_489500_78109/sockets/raylet',\n",
+       " 'webui_url': '127.0.0.1:8265',\n",
+       " 'session_dir': '/tmp/ray/session_2021-04-30_14-12-52_489500_78109',\n",
+       " 'metrics_export_port': 61257,\n",
+       " 'node_id': 'b9fa92a7843d5fbdabd1f9d39eb907a9a578d32eda9e4586e70d20e5'}"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "import ray\n",
+    "\n",
+    "# Start a new instance of Ray or connect to an already running one.\n",
+    "ray.init()\n",
+    "# In case you encounter this error during our tutorial:\n",
+    "# RuntimeError: Maybe you called ray.init twice by accident?\n",
+    "# Try: ray.shutdown() or ray.init(ignore_reinit_error=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0cebfd39",
+   "metadata": {},
+   "source": [
+    "<img src=\"images/rl-cycle.png\" width=1200>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4e48acf3",
+   "metadata": {},
+   "source": [
+    "### Coding/defining our \"problem\" via an RL environment.\n",
+    "\n",
+    "We will use the following (adversarial) multi-agent environment\n",
+    "throughout this tutorial to demonstrate a large fraction of RLlib's\n",
+    "APIs, features, and customization options."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3ba1299",
+   "metadata": {},
+   "source": [
+    "<img src=\"images/environment.png\" width=800>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b67fd69",
+   "metadata": {},
+   "source": [
+    "### A word or two on Spaces:\n",
+    "\n",
+    "Spaces are used in ML to describe which values are possible/valid for the inputs and outputs of a neural network.\n",
+    "\n",
+    "RL environments also use them to describe what their valid observations and actions are.\n",
+    "\n",
+    "Spaces are usually defined by their shape (e.g. 84x84x3 RGB images) and datatype (e.g. uint8 for RGB values between 0 and 255).\n",
+    "However, spaces could also be composed of other spaces (see Tuple or Dict spaces) or could be simply discrete with n fixed possible values\n",
+    "(represented by integers). For example, in our game, where each agent can only go up/down/left/right, the action space would be \"Discrete(4)\"\n",
+    "(no datatype, no shape needs to be defined here).\n",
+    "\n",
+    "<b>Sven: Say something like \"The code below will create our space\" (or whatever you think best, just so people know what the code is explicitly for). It can literally be one sentence.</b> "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d2d99764",
+   "metadata": {},
+   "source": [
+    "<img src=\"images/spaces.png\" width=800>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7459e5d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gym\n",
+    "from gym.spaces import Discrete, MultiDiscrete\n",
+    "import random\n",
+    "\n",
+    "from ray.rllib.env.multi_agent_env import MultiAgentEnv\n",
+    "\n",
+    "class MultiAgentArena(MultiAgentEnv):\n",
+    "    def __init__(self, config=None):\n",
+    "        # !LIVE CODING!\n",
+    "        config = config or {}\n",
+    "        self.width = config.get(\"width\", 10)\n",
+    "        self.height = config.get(\"height\", 10)\n",
+    "\n",
+    "        # 0=up, 1=right, 2=down, 3=left.\n",
+    "        self.action_space = Discrete(4)\n",
+    "        self.observation_space = MultiDiscrete([self.width * self.height,\n",
+    "                                                self.width * self.height])\n",
+    "        # End an episode after this many timesteps.\n",
+    "        self.timestep_limit = config.get(\"ts\", 100)\n",
+    "        # Reset env.\n",
+    "        self.reset()\n",
+    "\n",
+    "    def reset(self):\n",
+    "        # !LIVE CODING!\n",
+    "        # Row-major coords.\n",
+    "        self.agent1_pos = [0, 0]\n",
+    "        self.agent2_pos = [self.height - 1, self.width - 1]\n",
+    "        # Reset agent1's visited states.\n",
+    "        self.agent1_visited_states = set()\n",
+    "        # How many timesteps have we done in this episode.\n",
+    "        self.timesteps = 0\n",
+    "\n",
+    "        return self.get_obs()\n",
+    "\n",
+    "    def step(self, action: dict):\n",
+    "        # !LIVE CODING!\n",
+    "        self.timesteps += 1\n",
+    "        # Determine, who is allowed to move first.\n",
+    "        agent1_first = random.random() > 0.5\n",
+    "        # Move first agent (could be agent 1 or 2).\n",
+    "        if agent1_first:\n",
+    "            r1, r2 = self.move(self.agent1_pos, action[\"agent1\"], is_agent1=True)\n",
+    "            add = self.move(self.agent2_pos, action[\"agent2\"], is_agent1=False)\n",
+    "        else:\n",
+    "            r1, r2 = self.move(self.agent2_pos, action[\"agent2\"], is_agent1=False)\n",
+    "            add = self.move(self.agent1_pos, action[\"agent1\"], is_agent1=True)\n",
+    "        r1 += add[0]\n",
+    "        r2 += add[1]\n",
+    "\n",
+    "        obs = self.get_obs()\n",
+    "\n",
+    "        reward = {\"agent1\": r1, \"agent2\": r2}\n",
+    "\n",
+    "        done = self.timesteps >= self.timestep_limit\n",
+    "        done = {\"agent1\": done, \"agent2\": done, \"__all__\": done}\n",
+    "\n",
+    "        return obs, reward, done, {}\n",
+    "\n",
+    "    def get_obs(self):\n",
+    "        ag1_discrete_pos = self.agent1_pos[0] * self.width + \\\n",
+    "            (self.agent1_pos[1] % self.width)\n",
+    "        ag2_discrete_pos = self.agent2_pos[0] * self.width + \\\n",
+    "            (self.agent2_pos[1] % self.width)\n",
+    "        return {\n",
+    "            \"agent1\": np.array([ag1_discrete_pos, ag2_discrete_pos]),\n",
+    "            \"agent2\": np.array([ag2_discrete_pos, ag1_discrete_pos]),\n",
+    "        }\n",
+    "\n",
+    "    def move(self, coords, action, is_agent1):\n",
+    "        orig_coords = coords[:]\n",
+    "        # Change the row: 0=up (-1), 2=down (+1)\n",
+    "        coords[0] += -1 if action == 0 else 1 if action == 2 else 0\n",
+    "        # Change the column: 1=right (+1), 3=left (-1)\n",
+    "        coords[1] += 1 if action == 1 else -1 if action == 3 else 0\n",
+    "\n",
+    "        # Solve collisions.\n",
+    "        # Make sure, we don't end up on the other agent's position.\n",
+    "        # If yes, don't move (we are blocked).\n",
+    "        if (is_agent1 and coords == self.agent2_pos) or (not is_agent1 and coords == self.agent1_pos):\n",
+    "            coords[0], coords[1] = orig_coords\n",
+    "            # Agent2 blocked agent1 (agent1 tried to run into agent2)\n",
+    "            # OR Agent2 bumped into agent1 (agent2 tried to run into agent1)\n",
+    "            # -> +1 for agent2; -1 for agent1\n",
+    "            return -1.0, 1.0\n",
+    "\n",
+    "        # No agent blocking -> check walls.\n",
+    "        if coords[0] < 0:\n",
+    "            coords[0] = 0\n",
+    "        elif coords[0] >= self.height:\n",
+    "            coords[0] = self.height - 1\n",
+    "        if coords[1] < 0:\n",
+    "            coords[1] = 0\n",
+    "        elif coords[1] >= self.width:\n",
+    "            coords[1] = self.width - 1\n",
+    "\n",
+    "        # If agent1 -> +1.0 if new tile covered.\n",
+    "        if is_agent1 and not tuple(coords) in self.agent1_visited_states:\n",
+    "            self.agent1_visited_states.add(tuple(coords))\n",
+    "            return 1.0, -0.1\n",
+    "        # No new tile for agent1 -> Negative reward.\n",
+    "        return -0.5, -0.1\n",
+    "\n",
+    "    # Optionally: Add `render` method returning some img.\n",
+    "    def render(self, mode=None):\n",
+    "        return np.random.randint(0, 256, (20, 20, 3), dtype=np.uint8)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca35fd02",
+   "metadata": {},
+   "source": [
+    "## Exercise No 1\n",
+    "\n",
+    "<hr />\n",
+    "\n",
+    "Write an \"environment loop\" using our `MultiAgentArena` class.\n",
+    "\n",
+    "1. Create an env object.\n",
+    "1. `reset` your environment to get the first (initial) observation.\n",
+    "1. `step` through the environment using a provided\n",
+    "   \"DummyTrainer.compute_action([obs])\" method to compute action dicts (see cell below, in which you can create a DummyTrainer object and query it for random actions).\n",
+    "1. When an episode is done, remember to `reset()` your environment before the next call to `step()`.\n",
+    "1. If you feel this is way too easy for you ;), try to extract each agent's reward, sum it up over one episode, and - at the end of an episode (when done=True) - print out each agent's accumulated reward (also called \"return\").\n",
+    "\n",
+    "**Good luck! :)**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "2e0154dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'agent1': 0, 'agent2': 0}\n",
+      "{'agent1': 0, 'agent2': 3}\n",
+      "{'agent1': 0, 'agent2': 0}\n"
+     ]
+    }
+   ],
+   "source": [
+    "class DummyTrainer:\n",
+    "    \"\"\"Dummy Trainer class used in Exercise #1.\n",
+    "\n",
+    "    Use its `compute_action` method to get a new action, given some environment\n",
+    "    observation.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def compute_action(self, obs):\n",
+    "        # Returns a random action.\n",
+    "        return {\n",
+    "            \"agent1\": np.random.randint(4),\n",
+    "            \"agent2\": np.random.randint(4)\n",
+    "        }\n",
+    "\n",
+    "dummy_trainer = DummyTrainer()\n",
+    "# Check, whether it's working.\n",
+    "for _ in range(3):\n",
+    "    print(dummy_trainer.compute_action({\"agent1\": np.array([0, 10]), \"agent2\": np.array([10, 0])}))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8a7518ad",
+   "metadata": {},
+   "source": [
+    "<b>Sven: As I am sure you know, you will need to explain why env.render is not super reliable and which problems people can encounter depending on their operating system. You will, however, definitely need to run the optional code yourself before commenting it out.</b>\n",
+    "\n",
+    "**TODO:** `simple_image_viewer.imshow` might not work on your system due to x or y (fill in the concrete reasons). "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "fb5880f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Episode done. R1=-46.0 R2=-19.99999999999996\n",
+      "Episode done. R1=-42.5 R2=-17.79999999999996\n",
+      "Episode done. R1=-49.0 R2=-19.99999999999996\n",
+      "Episode done. R1=-30.0 R2=-18.899999999999963\n",
+      "Episode done. R1=-34.0 R2=-13.399999999999968\n",
+      "Episode done. R1=-57.0 R2=-18.899999999999963\n",
+      "Episode done. R1=-55.0 R2=-19.99999999999996\n",
+      "Episode done. R1=-45.5 R2=-14.499999999999963\n",
+      "Episode done. R1=-61.5 R2=-15.599999999999968\n",
+      "Episode done. R1=-74.0 R2=-11.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Solution to Exercise #1:\n",
+    "from gym.envs.classic_control.rendering import SimpleImageViewer\n",
+    "simple_image_viewer = SimpleImageViewer()\n",
+    "\n",
+    "# Solution:\n",
+    "env = MultiAgentArena(config={\"width\": 10, \"height\": 10})\n",
+    "obs = env.reset()\n",
+    "# Play through a single episode.\n",
+    "done = {\"__all__\": False}\n",
+    "return_ag1 = return_ag2 = 0.0\n",
+    "num_episodes = 0\n",
+    "while num_episodes < 10:\n",
+    "    action = dummy_trainer.compute_action(obs)\n",
+    "    obs, rewards, done, _ = env.step(action)\n",
+    "    return_ag1 += rewards[\"agent1\"]\n",
+    "    return_ag2 += rewards[\"agent2\"]    \n",
+    "    if done[\"__all__\"]:\n",
+    "        print(f\"Episode done. R1={return_ag1} R2={return_ag2}\")\n",
+    "        num_episodes += 1\n",
+    "        return_ag1 = return_ag2 = 0.0\n",
+    "        obs = env.reset()\n",
+    "    # Optional:\n",
+    "    #img = env.render()\n",
+    "    #simple_image_viewer.imshow(img)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "187a66e4",
+   "metadata": {},
+   "source": [
+    "<b>Sven: Say something like \"In the code below, you will get an error if you only have PyTorch installed. Be sure to uncomment the \"framework\" line.\"</b>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "21c62133",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2021-04-30 14:15:03,745\tWARNING util.py:53 -- Install gputil for GPU system monitoring.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 4) Plugging in RLlib.\n",
+    "\n",
+    "# Import a Trainable (one of RLlib's built-in algorithms):\n",
+    "# We use the PPO algorithm here b/c it's very flexible wrt its supported\n",
+    "# action spaces and model types and b/c it learns almost any problem well.\n",
+    "from ray.rllib.agents.ppo import PPOTrainer\n",
+    "\n",
+    "# Specify a very simple config, defining our environment and some environment\n",
+    "# options (see environment.py).\n",
+    "config = {\n",
+    "    \"env\": MultiAgentArena,\n",
+    "    # `env_config` is handed to `MultiAgentArena.__init__` as its `config`\n",
+    "    # argument, which reads \"width\"/\"height\" from the top level of that dict.\n",
+    "    \"env_config\": {\n",
+    "        \"width\": 10,\n",
+    "        \"height\": 10,\n",
+    "    },\n",
+    "    \"framework\": \"torch\",\n",
+    "    \"create_env_on_driver\": True,\n",
+    "}\n",
+    "# Instantiate the Trainer object using above config.\n",
+    "rllib_trainer = PPOTrainer(config=config)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ec4cd770",
+   "metadata": {},
+   "source": [
+    "That's it, we are ready to train.\n",
+    "Calling `train` once runs a single \"training iteration\". For most algorithms, one iteration consists of a) sampling from the environment(s) and b) using the sampled data (observations, actions taken, rewards) to update the policy model (neural network) so that it picks better actions in the future, leading to higher rewards."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "43385b67",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2021-04-30 14:17:26,034\tWARNING deprecation.py:33 -- DeprecationWarning: `SampleBatch.data[..]` has been deprecated. Use `SampleBatch[..]` instead. This will raise an error in the future!\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'episode_reward_max': -52.500000000000135, 'episode_reward_min': -100.50000000000016, 'episode_reward_mean': -73.68000000000012, 'episode_len_mean': 100.0, 'episode_media': {}, 'episodes_this_iter': 20, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-78.00000000000011, -63.00000000000006, -60.30000000000009, -84.30000000000015, -70.50000000000017, -97.50000000000017, -69.00000000000007, -76.80000000000015, -100.50000000000016, -87.00000000000013, -60.600000000000115, -55.5000000000001, -75.00000000000009, -65.70000000000005, -52.500000000000135, -61.500000000000085, -72.00000000000011, -78.90000000000012, -75.00000000000009, -90.00000000000014], 'episode_lengths': [100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 0.11413557069761296, 'mean_inference_ms': 0.6737990098280625, 'mean_action_processing_ms': 0.039526632615736315, 'mean_env_wait_ms': 0.020714311094789002, 'mean_env_render_ms': 0.0}, 'off_policy_estimator': {}, 'num_healthy_workers': 2, 'timesteps_total': 4000, 'agent_timesteps_total': 4000, 'timers': {'sample_time_ms': 880.489, 'sample_throughput': 4542.928, 'learn_time_ms': 4594.605, 'learn_throughput': 870.586, 'update_time_ms': 1.382}, 'info': {'learner': defaultdict(<class 'dict'>, {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.2, 'cur_lr': 5e-05, 'total_loss': 160.01298093795776, 'policy_loss': -0.057975991745479405, 'vf_loss': 160.0666642189026, 'vf_explained_var': 0.06820494, 'kl': 0.021454342699144036, 'entropy': 1.3652280382812023, 'entropy_coeff': 0.0}}}), 'num_steps_sampled': 4000, 'num_agent_steps_sampled': 4000, 'num_steps_trained': 4000}, 'done': False, 'episodes_total': 20, 'training_iteration': 1, 'experiment_id': 'a366d82f001e49159ce3140f542798f6', 'date': '2021-04-30_14-17-30', 'timestamp': 1619817450, 
'time_this_iter_s': 5.4820520877838135, 'time_total_s': 5.4820520877838135, 'pid': 78109, 'hostname': 'Michaels-MacBook-Pro.local', 'node_ip': '192.168.4.37', 'config': {'num_workers': 2, 'num_envs_per_worker': 1, 'create_env_on_driver': True, 'rollout_fragment_length': 200, 'batch_mode': 'truncate_episodes', 'train_batch_size': 4000, 'model': {'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 'lstm_use_prev_reward': False, '_time_major': False, 'use_attention': False, 'attention_num_transformer_units': 1, 'attention_dim': 64, 'attention_num_heads': 1, 'attention_head_dim': 32, 'attention_memory_inference': 50, 'attention_memory_training': 50, 'attention_position_wise_mlp_dim': 32, 'attention_init_gru_gate_bias': 2.0, 'attention_use_n_prev_actions': 0, 'attention_use_n_prev_rewards': 0, 'num_framestacks': 'auto', 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None, 'custom_preprocessor': None, 'lstm_use_prev_action_reward': -1, 'framestack': True}, 'optimizer': {}, 'gamma': 0.99, 'horizon': None, 'soft_horizon': False, 'no_done_at_end': False, 'env': 'MultiAgentArena', 'env_config': {'config': {'width': 10, 'height': 10}}, 'render_env': False, 'record_env': False, 'normalize_actions': False, 'clip_rewards': None, 'clip_actions': True, 'preprocessor_pref': 'deepmind', 'lr': 5e-05, 'log_level': 'WARN', 'callbacks': <class 'ray.rllib.agents.callbacks.DefaultCallbacks'>, 'ignore_worker_failures': False, 'log_sys_usage': True, 'fake_sampler': False, 'framework': 'torch', 'eager_tracing': False, 'explore': True, 'exploration_config': {'type': 'StochasticSampling'}, 'evaluation_interval': None, 'evaluation_num_episodes': 
10, 'in_evaluation': False, 'evaluation_config': {}, 'evaluation_num_workers': 0, 'custom_eval_function': None, 'sample_async': False, 'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>, 'observation_filter': 'NoFilter', 'synchronize_filters': True, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'compress_observations': False, 'collect_metrics_timeout': 180, 'metrics_smoothing_episodes': 100, 'remote_worker_envs': False, 'remote_env_batch_wait_ms': 0, 'min_iter_time_s': 0, 'timesteps_per_iteration': 0, 'seed': None, 'extra_python_environs_for_driver': {}, 'extra_python_environs_for_worker': {}, 'num_gpus': 0, '_fake_gpus': False, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': {}, 'num_cpus_for_driver': 1, 'placement_strategy': 'PACK', 'input': 'sampler', 'input_evaluation': ['is', 'wis'], 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': ['obs', 'new_obs'], 'output_max_file_size': 67108864, 'multiagent': {'policies': {}, 'policy_mapping_fn': None, 'policies_to_train': None, 'observation_fn': None, 'replay_mode': 'independent', 'count_steps_by': 'env_steps'}, 'logger_config': None, 'simple_optimizer': True, 'monitor': -1, 'use_critic': True, 'use_gae': True, 'lambda': 1.0, 'kl_coeff': 0.2, 'sgd_minibatch_size': 128, 'shuffle_sequences': True, 'num_sgd_iter': 30, 'lr_schedule': None, 'vf_loss_coeff': 1.0, 'entropy_coeff': 0.0, 'entropy_coeff_schedule': None, 'clip_param': 0.3, 'vf_clip_param': 10.0, 'grad_clip': None, 'kl_target': 0.01, 'vf_share_layers': -1}, 'time_since_restore': 5.4820520877838135, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'perf': 
{'cpu_util_percent': 5.444285714285715, 'ram_util_percent': 58.8209523809524}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(rllib_trainer.train())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "9f60284a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "iteration 0: R=-65.5725000000001\n",
+      "iteration 1: R=-64.43500000000009\n",
+      "iteration 2: R=-63.22875000000008\n",
+      "iteration 3: R=-63.20100000000011\n",
+      "iteration 4: R=-60.02100000000008\n",
+      "iteration 5: R=-61.05000000000008\n",
+      "iteration 6: R=-60.21000000000007\n",
+      "iteration 7: R=-59.60100000000008\n",
+      "iteration 8: R=-58.098000000000084\n",
+      "iteration 9: R=-57.78600000000008\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Run `train()` n times. Try to repeatedly call this to see rewards increase.\n",
+    "# Move on once you see episode rewards > -55.0.\n",
+    "for i in range(10):\n",
+    "    results = rllib_trainer.train()\n",
+    "    print(f\"iteration {i}: R={results['episode_reward_mean']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "4bde386d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Policy: <ray.rllib.policy.policy_template.PPOTorchPolicy object at 0x7f8e7a48d8b0>; Observation-space: Box(-1.0, 1.0, (200,), float32); Action-space: Discrete(4)\n",
+      "Model: FullyConnectedNetwork(\n",
+      "  (_logits): SlimFC(\n",
+      "    (_model): Sequential(\n",
+      "      (0): Linear(in_features=256, out_features=4, bias=True)\n",
+      "    )\n",
+      "  )\n",
+      "  (_hidden_layers): Sequential(\n",
+      "    (0): SlimFC(\n",
+      "      (_model): Sequential(\n",
+      "        (0): Linear(in_features=200, out_features=256, bias=True)\n",
+      "        (1): Tanh()\n",
+      "      )\n",
+      "    )\n",
+      "    (1): SlimFC(\n",
+      "      (_model): Sequential(\n",
+      "        (0): Linear(in_features=256, out_features=256, bias=True)\n",
+      "        (1): Tanh()\n",
+      "      )\n",
+      "    )\n",
+      "  )\n",
+      "  (_value_branch_separate): Sequential(\n",
+      "    (0): SlimFC(\n",
+      "      (_model): Sequential(\n",
+      "        (0): Linear(in_features=200, out_features=256, bias=True)\n",
+      "        (1): Tanh()\n",
+      "      )\n",
+      "    )\n",
+      "    (1): SlimFC(\n",
+      "      (_model): Sequential(\n",
+      "        (0): Linear(in_features=256, out_features=256, bias=True)\n",
+      "        (1): Tanh()\n",
+      "      )\n",
+      "    )\n",
+      "  )\n",
+      "  (_value_branch): SlimFC(\n",
+      "    (_model): Sequential(\n",
+      "      (0): Linear(in_features=256, out_features=1, bias=True)\n",
+      "    )\n",
+      "  )\n",
+      ")\n"
+     ]
+    },
+    {
+     "ename": "AttributeError",
+     "evalue": "'numpy.ndarray' object has no attribute 'float'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m-------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                          Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-10-f30e3f21d95b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[0;31m# Generate the Model's output.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate_out\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"obs\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msingle_obs\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m \u001b[0;31m# tf1.x (static graph) -> Need to run this through a tf session.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/rllib/lib/python3.8/site-packages/ray/rllib/models/modelv2.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, input_dict, state, seq_lens)\u001b[0m\n\u001b[1;32m    232\u001b[0m             \u001b[0mrestored\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"obs_flat\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"obs\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    233\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 234\u001b[0;31m             \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrestored\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseq_lens\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    235\u001b[0m         if ((not isinstance(res, list) and not isinstance(res, tuple))\n\u001b[1;32m    236\u001b[0m                 or len(res) != 2):\n",
+      "\u001b[0;32m~/anaconda3/envs/rllib/lib/python3.8/site-packages/ray/rllib/models/torch/fcnet.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_dict, state, seq_lens)\u001b[0m\n\u001b[1;32m    121\u001b[0m                 \u001b[0mstate\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mTensorType\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    122\u001b[0m                 seq_lens: TensorType) -> (TensorType, List[TensorType]):\n\u001b[0;32m--> 123\u001b[0;31m         \u001b[0mobs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"obs_flat\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    124\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_last_flat_in\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    125\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_features\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hidden_layers\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_last_flat_in\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'float'"
+     ]
+    }
+   ],
+   "source": [
+    "# !LIVE CODING!\n",
+    "# Let's actually \"look inside\" our Trainer to see what's in there.\n",
+    "pol = rllib_trainer.get_policy()\n",
+    "print(f\"Policy: {pol}; Observation-space: {pol.observation_space}; Action-space: {pol.action_space}\")\n",
+    "\n",
+    "print(f\"Model: {pol.model}\")\n",
+    "\n",
+    "# Create a fake numpy B=1 (single) observation consisting of both agents' positions (\"one-hot'd\" and \"concat'd\").\n",
+    "from ray.rllib.utils.numpy import one_hot\n",
+    "single_obs = np.concatenate([one_hot(0, depth=100), one_hot(99, depth=100)])\n",
+    "single_obs = np.array([single_obs])\n",
+    "#single_obs.shape\n",
+    "\n",
+    "# Generate the Model's output.\n",
+    "out, state_out = pol.model({\"obs\": single_obs})\n",
+    "\n",
+    "# tf1.x (static graph) -> Need to run this through a tf session.\n",
+    "numpy_out = pol._sess.run(out)\n",
+    "\n",
+    "# RLlib then passes the model's output to the policy's \"action distribution\" to sample an action.\n",
+    "action_dist = pol.dist_class(out)\n",
+    "action = action_dist.sample()\n",
+    "\n",
+    "# Show us the actual action.\n",
+    "pol._sess.run(action)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a2bb4b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save our trainer.\n",
+    "checkpoint_path = rllib_trainer.save()\n",
+    "print(f\"Trainer was saved in '{checkpoint_path}'!\")\n",
+    "\n",
+    "import os\n",
+    "os.listdir(os.path.dirname(checkpoint_path))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8922c08f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pretend, we wanted to pick up training from a previous run:\n",
+    "new_trainer = PPOTrainer(config=config)\n",
+    "# Evaluate the new trainer (this should yield random results).\n",
+    "results = new_trainer._evaluate()\n",
+    "print(f\"Evaluating new trainer: R={results['evaluation']['episode_reward_mean']}\")\n",
+    "\n",
+    "# Restoring the trained state into the `new_trainer` object.\n",
+    "new_trainer.restore(checkpoint_path)\n",
+    "\n",
+    "# Evaluate again (this should yield results we saw after having trained our saved agent).\n",
+    "results = new_trainer._evaluate()\n",
+    "print(f\"Evaluating restored trainer: R={results['evaluation']['episode_reward_mean']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27d6427a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 5) Configuration dicts and Ray Tune.\n",
+    "# Where are the default configuration dicts stored?\n",
+    "import pprint\n",
+    "from ray.rllib.agents.ppo import DEFAULT_CONFIG as PPO_DEFAULT_CONFIG\n",
+    "print(f\"PPO's default config is:\")\n",
+    "pprint.pprint(PPO_DEFAULT_CONFIG)\n",
+    "\n",
+    "#from ray.rllib.agents.dqn import DEFAULT_CONFIG as DQN_DEFAULT_CONFIG\n",
+    "#print(f\"DQN's default config is:\")\n",
+    "#pprint.pprint(DQN_DEFAULT_CONFIG)\n",
+    "\n",
+    "#from ray.rllib.agents.trainer import COMMON_CONFIG\n",
+    "#print(f\"RLlib Trainer's default config is:\")\n",
+    "#pprint.pprint(COMMON_CONFIG)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de800d69",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Plugging in Ray Tune.\n",
+    "# Note that this is the recommended way to run any experiments with RLlib.\n",
+    "# Reasons:\n",
+    "# - Tune allows you to do hyperparameter tuning in a user-friendly way\n",
+    "#   and at large scale!\n",
+    "# - Tune automatically allocates needed resources for the different\n",
+    "#   hyperparam trials and experiment runs.\n",
+    "\n",
+    "from ray import tune\n",
+    "\n",
+    "# Now that we will run things \"automatically\" through tune, we have to\n",
+    "# define one or more stopping criteria.\n",
+    "stop = {\n",
+    "    # Keys here can be any metric present in the results dict returned by `trainer.train()` (see above).\n",
+    "    \"training_iteration\": 5,\n",
+    "    \"episode_reward_mean\": 9999.9,\n",
+    "}\n",
+    "\n",
+    "# \"PPO\" is a registered name that points to RLlib's PPOTrainer.\n",
+    "# See `ray/rllib/agents/registry.py`\n",
+    "# Run our simple experiment until one of the stop criteria is met.\n",
+    "tune.run(\"PPO\", config=config, stop=stop)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b500e55",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Updating an algo's default config dict and adding hyperparameter tuning\n",
+    "# options to it.\n",
+    "# Note: Hyperparameter tuning options (e.g. grid_search) will only work,\n",
+    "# if we run these configs via `tune.run`.\n",
+    "config.update(\n",
+    "    {\n",
+    "        # Try 2 different learning rates.\n",
+    "        \"lr\": tune.grid_search([0.0001, 0.5]),\n",
+    "        # NN model config to tweak the default model\n",
+    "        # that'll be created by RLlib for the policy.\n",
+    "        \"model\": {\n",
+    "            # e.g. change the dense layer stack.\n",
+    "            \"fcnet_hiddens\": [256, 256, 256],\n",
+    "            # Alternatively, you can specify a custom model here\n",
+    "            # (we'll cover that later).\n",
+    "            # \"custom_model\": ...\n",
+    "            # Pass kwargs to your custom model.\n",
+    "            # \"custom_model_config\": {}\n",
+    "        },\n",
+    "    }\n",
+    ")\n",
+    "# Repeat our experiment using tune's grid-search feature.\n",
+    "results = tune.run(\n",
+    "    \"PPO\",\n",
+    "    config=config,\n",
+    "    stop=stop,\n",
+    "    checkpoint_at_end=True,  # create a checkpoint when done.\n",
+    "    checkpoint_freq=1,  # create a checkpoint on every iteration.\n",
+    ")\n",
+    "print(results)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c98740db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 6) Going multi-policy: Our experiment is ill-configured b/c both\n",
+    "# agents, which should behave differently due to their different\n",
+    "# tasks and reward functions, learn the same policy (the \"default_policy\",\n",
+    "# which RLlib always provides if you don't configure anything else; Remember\n",
+    "# that RLlib does not know at Trainer setup time, how many and which agents\n",
+    "# the environment will \"produce\").\n",
+    "# Let's fix this and introduce the \"multiagent\" API.\n",
+    "\n",
+    "# 6.1.) Define an agent->policy mapping function.\n",
+    "# Which agents (defined by the environment) use which policies\n",
+    "# (defined by us)? Mapping is M (agents) -> N (policies), where M >= N.\n",
+    "def policy_mapping_fn(agent: str):\n",
+    "    assert agent in [\"agent1\", \"agent2\"], f\"ERROR: invalid agent {agent}!\"\n",
+    "    return \"pol1\" if agent == \"agent1\" else \"pol2\"\n",
+    "    \n",
+    "# 6.2.) Define details for our two policies.\n",
+    "#TODO: coding Sven: Make it possible to not need obs/action spaces\n",
+    "#  if they are the default anyways.\n",
+    "observation_space = rllib_trainer.workers.local_worker().env.observation_space\n",
+    "action_space = rllib_trainer.workers.local_worker().env.action_space\n",
+    "# Btw, the above is equivalent to saying:\n",
+    "# >>> rllib_trainer.get_policy(\"default_policy\").obs/action_space\n",
+    "policies = {\n",
+    "    \"pol1\": (None, observation_space, action_space, {\"lr\": 0.0003}),\n",
+    "    \"pol2\": (None, observation_space, action_space, {\"lr\": 0.0004}),\n",
+    "}\n",
+    "\n",
+    "#policies_to_train = [\"pol1\", \"pol2\"]\n",
+    "\n",
+    "# 6.3) Adding the above to our config.\n",
+    "config.update({\n",
+    "    \"multiagent\": {\n",
+    "        \"policies\": policies,\n",
+    "        \"policy_mapping_fn\": policy_mapping_fn,\n",
+    "        #\"policies_to_train\": policies_to_train,\n",
+    "    },\n",
+    "})\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4aaa0af6",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Exercise No 2\n",
+    "\n",
+    "<hr />\n",
+    "\n",
+    "Try learning our environment using Ray tune.run and a simple hyperparameter grid_search over:\n",
+    "- 2 different learning rates (pick your own values).\n",
+    "- AND 2 different `train_batch_size` settings (use 2000 and 3000).\n",
+    "\n",
+    "Also, make RLlib use a [128, 128] dense layer stack as the NN model.\n",
+    "\n",
+    "Also, use the config setting of `num_envs_per_worker=10` to increase the sampling throughput.\n",
+    "\n",
+    "In case your local machine has fewer than 12 CPUs, try setting `num_workers=1` to make all tune trials run at the same time.\n",
+    "Background: PPO by default uses 2 workers, which makes 1 trial use 3 CPUs (2 workers + \"driver\" (\"local-worker\")),\n",
+    "which makes the entire experiment use 12 CPUs. Tune will run trials in sequence in case it cannot allocate enough CPUs at once\n",
+    "(which is also fine, but then takes longer).\n",
+    "\n",
+    "Try to reach a total reward (sum of agent1 and agent2) of -25.0.\n",
+    "\n",
+    "**Good luck! :)**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "65e24797",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Solution to Exercise #2:\n",
+    "\n",
+    "# Update our config and set it up for 2x tune grid-searches (leading to 4 parallel trials in total).\n",
+    "config.update({\n",
+    "    \"lr\": tune.grid_search([0.0001, 0.0005]),\n",
+    "    \"train_batch_size\": tune.grid_search([2000, 3000]),\n",
+    "    \"num_envs_per_worker\": 10,\n",
+    "    # Change our model to be simpler.\n",
+    "    \"model\": {\n",
+    "        \"fcnet_hiddens\": [128, 128],\n",
+    "    },\n",
+    "})\n",
+    "\n",
+    "# Run the experiment.\n",
+    "tune.run(\"PPO\", config=config, stop={\"episode_reward_mean\": -25.0, \"training_iteration\": 100})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec93874b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 8) Infinite laptop:\n",
+    "\n",
+    "# NOTE: The following cell will only work if you are already on-boarded to our Anyscale Inc. \"Infinite Laptop\".\n",
+    "# To get more information, see https://www.anyscale.com/product\n",
+    "\n",
+    "# Let's quickly divert from our MultiAgentArena and move to something much heavier in terms of environment/simulator complexity.\n",
+    "# We will now demonstrate, how you can use Anyscale's infinite laptop to launch an RLlib experiment on a cloud 4 GPU + 32 CPU machine\n",
+    "# all from within this Jupyter cell here.\n",
+    "# Start an experiment in the cloud using Anyscale's product, RLlib, and a more complex multi-agent env.\n",
+    "\n",
+    "# NOTE \n",
+    "import anyscale\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f2073cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 9) Custom Neural Network Models.\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "\n",
+    "class MyModel(tf.keras.Model):\n",
+    "    def __init__(self,\n",
+    "                input_space,\n",
+    "                action_space,\n",
+    "                num_outputs,\n",
+    "                name=\"\",\n",
+    "                *,\n",
+    "                layers = (256, 256)):\n",
+    "        super().__init__(name=name)\n",
+    "\n",
+    "        self.dense_layers = []\n",
+    "        for i, layer_size in enumerate(layers):\n",
+    "            self.dense_layers.append(tf.keras.layers.Dense(\n",
+    "                layer_size, activation=tf.nn.relu, name=f\"dense_{i}\"))\n",
+    "\n",
+    "        self.logits = tf.keras.layers.Dense(\n",
+    "            num_outputs,\n",
+    "            activation=tf.keras.activations.linear,\n",
+    "            name=\"logits\")\n",
+    "        self.values = tf.keras.layers.Dense(\n",
+    "            1, activation=None, name=\"values\")\n",
+    "\n",
+    "    def call(self, inputs, training=None, mask=None):\n",
+    "        # Standardized input args:\n",
+    "        # - input_dict (RLlib `SampleBatch` object, which is basically a dict with numpy arrays\n",
+    "        # in it)\n",
+    "        out = inputs[\"obs\"]\n",
+    "        for l in self.dense_layers:\n",
+    "            out = l(out)\n",
+    "        logits = self.logits(out)\n",
+    "        values = self.values(out)\n",
+    "\n",
+    "        # Standardized output:\n",
+    "        # - \"normal\" model output tensor (e.g. action logits).\n",
+    "        # - list of internal state outputs (only needed for RNN-/memory enhanced models).\n",
+    "        # - \"extra outs\", such as model's side branches, e.g. value function outputs.\n",
+    "        return logits, [], {\"vf_preds\": tf.reshape(values, [-1])}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17080ff7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Do a quick test on the custom model class.\n",
+    "from gym.spaces import Box\n",
+    "test_model = MyModel(\n",
+    "    input_space=Box(-1.0, 1.0, (2, )),\n",
+    "    action_space=None,\n",
+    "    num_outputs=2,\n",
+    ")\n",
+    "test_model({\"obs\": np.array([[0.5, 0.5]])})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bfd24b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set up our custom model and re-run the experiment.\n",
+    "\n",
+    "config.update({\n",
+    "    \"model\": {\n",
+    "        \"custom_model\": MyModel,\n",
+    "        \"custom_model_config\": {\n",
+    "            \"layers\": [128, 128],\n",
+    "        },\n",
+    "    },\n",
+    "    # Revert these to single trials (and use those hyperparams that performed well in our Exercise #2).\n",
+    "    \"lr\": 0.0005,\n",
+    "    \"train_batch_size\": 2000,\n",
+    "})\n",
+    "\n",
+    "tune.run(\"PPO\", config=config, stop=stop)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3b3474e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# \"Hacking in\": How do we customize our RL loop?\n",
+    "# RLlib offers a callbacks API that allows you to add custom behavior at\n",
+    "# all major events during the environment sampling and learning process.\n",
+    "\n",
+    "# Our problem: So far, we can only see the total reward (sum for both agents).\n",
+    "# This does not give us enough insights into the question of which agent\n",
+    "# learns what (maybe agent2 doesn't learn anything and the reward we are observing\n",
+    "# is mostly due to agent1's progress in covering the map!).\n",
+    "# The following custom callbacks class allows us to add each agent's individual reward to\n",
+    "# the returned metrics, which will then be displayed in tensorboard.\n",
+    "\n",
+    "# We will override RLlib's DefaultCallbacks class and implement the\n",
+    "# `on_episode_start`, `on_episode_step`, and `on_episode_end` methods therein.\n",
+    "\n",
+    "from ray.rllib.agents.callbacks import DefaultCallbacks\n",
+    "\n",
+    "\n",
+    "class MyCallbacks(DefaultCallbacks):\n",
+    "    def on_episode_start(self, *, worker, base_env,\n",
+    "                         policies, episode,\n",
+    "                         env_index, **kwargs):\n",
+    "        episode.user_data[\"agent1_rewards\"] = []\n",
+    "        episode.user_data[\"agent2_rewards\"] = []\n",
+    "\n",
+    "    def on_episode_step(self, *, worker, base_env,\n",
+    "                        episode, env_index, **kwargs):\n",
+    "        # Make sure this episode is ongoing.\n",
+    "        #assert episode.length > 0, \\\n",
+    "        #    \"ERROR: `on_episode_step()` callback should not be called right \" \\\n",
+    "        #    \"after env reset!\"\n",
+    "        ag1_r = episode.prev_reward_for(\"agent1\")\n",
+    "        ag2_r = episode.prev_reward_for(\"agent2\")\n",
+    "        #print(\"ag1_r={} ag2_r={}\".format(ag1_r, ag2_r))\n",
+    "        episode.user_data[\"agent1_rewards\"].append(ag1_r)\n",
+    "        episode.user_data[\"agent2_rewards\"].append(ag2_r)\n",
+    "\n",
+    "    def on_episode_end(self, *, worker, base_env,\n",
+    "                       policies, episode,\n",
+    "                       env_index, **kwargs):\n",
+    "        episode.custom_metrics[\"ag1_R\"] = sum(episode.user_data[\"agent1_rewards\"])\n",
+    "        episode.custom_metrics[\"ag2_R\"] = sum(episode.user_data[\"agent2_rewards\"])\n",
+    "        episode.hist_data[\"agent1_rewards\"] = episode.user_data[\"agent1_rewards\"]\n",
+    "        episode.hist_data[\"agent2_rewards\"] = episode.user_data[\"agent2_rewards\"]\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2057c507",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Setting up our config to point to our new custom callbacks class:\n",
+    "config.update({\n",
+    "    \"env\": MultiAgentArena,  # force \"reload\"\n",
+    "    \"callbacks\": MyCallbacks,  # by default, this would point to `rllib.agents.callbacks.DefaultCallbacks`, which does nothing.\n",
+    "    #TODO: remove this once native keras models are supported!\n",
+    "    \"model\": {\n",
+    "        \"custom_model\": None,\n",
+    "    },\n",
+    "})\n",
+    "\n",
+    "results = tune.run(\"PPO\", config=config, stop={\"training_iteration\": 10})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "21d88451",
+   "metadata": {},
+   "source": [
+    "### Let's check tensorboard for the new custom metrics!\n",
+    "\n",
+    "1. Head over to ~/ray_results/PPO/PPO_MultiAgentArena_[some key]_00000_0_[date]_[time]/\n",
+    "1. In that directory, you should see an `events.out.tfevents...` file.\n",
+    "1. Run `tensorboard --logdir .` and head to http://localhost:6006\n",
+    "\n",
+    "<img src=\"images/tensorboard.png\" width=800>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1aa1e828",
+   "metadata": {},
+   "source": [
+    "## Exercise No 3\n",
+    "\n",
+    "<hr />\n",
+    "\n",
+    "Assume we would like to know exactly how much (new) ground agent1 \n",
+    "covers on average in an episode.\n",
+    "Write your own custom callback class (sub-class\n",
+    "`ray.rllib.agents.callbacks.DefaultCallbacks`) and override one or more methods\n",
+    "therein to collect the following data:\n",
+    "The number of (unique) fields agent1 has covered in an episode.\n",
+    "\n",
+    "Run a simple experiment using tune.run (and your custom callbacks class)\n",
+    "and confirm the new metric shows up in tensorboard.\n",
+    "\n",
+    "**Good luck! :)**\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8dd5ab66",
+   "metadata": {},
+   "source": [
+    "### A closer look at RLlib's APIs and structure\n",
+    "\n",
+    "We already took a quick look inside an RLlib Trainer object and extracted its Policy(ies) and the Policy's model (neural network). Here is a much more detailed overview of what's inside a Trainer object.\n",
+    "\n",
+    "At the core is the so-called `WorkerSet` sitting under `Trainer.workers`. A WorkerSet is a group of `RolloutWorker` (`rllib.evaluation.rollout_worker.py`) objects that always consists of a \"local worker\" (`Trainer.workers.local_worker()`) and n \"remote workers\" (`Trainer.workers.remote_workers()`).\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b423770",
+   "metadata": {},
+   "source": [
+    "<img src=\"images/rllib_structure.png\" width=1000>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bbcbf80",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ray_summit_2021/tutorial_notebook.ipynb b/ray_summit_2021/tutorial_notebook.ipynb
index a44d799..c35e374 100644
--- a/ray_summit_2021/tutorial_notebook.ipynb
+++ b/ray_summit_2021/tutorial_notebook.ipynb
@@ -26,22 +26,38 @@
     "\n",
     "Then ...\n",
     "\n",
-    "#### Quick `conda` setup instructions (Mac and Linux):\n",
+    "#### Quick `conda` setup instructions (Linux):\n",
+    "\n",
     "```\n",
     "$ conda create -n rllib python=3.8\n",
     "$ conda activate rllib\n",
-    "$ pip install ray[rllib]\n",
-    "$ pip install [tensorflow|torch]  # <- either one works!\n",
-    "$ pip install jupyter-labs\n",
+    "$ pip install \"ray[rllib]\"\n",
+    "$ pip install tensorflow # <- either one works!\n",
+    "$ pip install torch  # <- either one works!\n",
+    "$ pip install jupyterlab\n",
+    "```\n",
+    "\n",
+    "#### Quick `conda` setup instructions (Mac):\n",
+    "\n",
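+    "Note: on Mac, `cmake` may need to be installed first (see https://github.com/actions/setup-python/issues/121), so it is included in the `pip install` line below.\n",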
+    "\n",
+    "```\n",
+    "$ conda create -n rllib python=3.8\n",
+    "$ conda activate rllib\n",
+    "$ pip install cmake \"ray[rllib]\"\n",
+    "$ pip install tensorflow # <- either one works!\n",
+    "$ pip install torch  # <- either one works!\n",
+    "$ pip install jupyterlab\n",
     "```\n",
     "\n",
     "#### Quick `conda` setup instructions (Win10):\n",
     "```\n",
     "$ conda create -n rllib python=3.8\n",
     "$ conda activate rllib\n",
-    "$ pip install ray[rllib]\n",
-    "$ pip install [tensorflow|torch]  # <- either one works!\n",
-    "$ pip install jupyter-labs\n",
+    "$ pip install \"ray[rllib]\"\n",
+    "$ pip install tensorflow # <- either one works!\n",
+    "$ pip install torch  # <- either one works!\n",
+    "$ pip install jupyterlab\n",
     "$ conda install pywin32\n",
     "```\n",
     "\n",
@@ -89,7 +105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 1,
    "id": "fuzzy-career",
    "metadata": {},
    "outputs": [
@@ -97,24 +113,26 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2021-04-30 15:12:26,105\tINFO services.py:1267 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
+      "/Users/michaelgalarnyk/anaconda3/envs/rllib/lib/python3.8/site-packages/ray/autoscaler/_private/cli_logger.py:57: FutureWarning: Not all Ray CLI dependencies were found. In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will only be usable via `pip install 'ray[default]'`. Please update your install command.\n",
+      "  warnings.warn(\n",
+      "2021-04-30 14:36:56,166\tINFO services.py:1267 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "{'node_ip_address': '192.168.0.178',\n",
-       " 'raylet_ip_address': '192.168.0.178',\n",
-       " 'redis_address': '192.168.0.178:6379',\n",
-       " 'object_store_address': '/tmp/ray/session_2021-04-30_15-12-24_090698_27624/sockets/plasma_store',\n",
-       " 'raylet_socket_name': '/tmp/ray/session_2021-04-30_15-12-24_090698_27624/sockets/raylet',\n",
+       "{'node_ip_address': '192.168.4.37',\n",
+       " 'raylet_ip_address': '192.168.4.37',\n",
+       " 'redis_address': '192.168.4.37:6379',\n",
+       " 'object_store_address': '/tmp/ray/session_2021-04-30_14-36-55_037716_80917/sockets/plasma_store',\n",
+       " 'raylet_socket_name': '/tmp/ray/session_2021-04-30_14-36-55_037716_80917/sockets/raylet',\n",
        " 'webui_url': '127.0.0.1:8265',\n",
-       " 'session_dir': '/tmp/ray/session_2021-04-30_15-12-24_090698_27624',\n",
-       " 'metrics_export_port': 61273,\n",
-       " 'node_id': 'd04d14c849929d8e81f0c078e1f57c3aa00fc08908b7e8cff66c6e2e'}"
+       " 'session_dir': '/tmp/ray/session_2021-04-30_14-36-55_037716_80917',\n",
+       " 'metrics_export_port': 61625,\n",
+       " 'node_id': 'd5c2958b1888981c90c2ddd19d94a32059cc10b1d090ccc471e41618'}"
       ]
      },
-     "execution_count": 45,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -132,7 +150,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "proud-yorkshire",
    "metadata": {},
@@ -153,7 +170,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "1eb35116-efda-4799-8bae-e96d7775a0d1",
    "metadata": {},
@@ -175,11 +191,12 @@
     "Spaces are usually defined by their shape (e.g. 84x84x3 RGB images) and datatype (e.g. uint8 for RGB values between 0 and 255).\n",
     "However, spaces could also be composed of other spaces (see Tuple or Dict spaces) or could be simply discrete with n fixed possible values\n",
     "(represented by integers). For example, in our game, where each agent can only go up/down/left/right, the action space would be \"Discrete(4)\"\n",
-    "(no datatype, no shape needs to be defined here)."
+    "(no datatype, no shape needs to be defined here).\n",
+    "\n",
+    "<b>Sven: Say something like \"the code below will create our space\" (or whatever you think best, just so people know what the code is explicitly for). It can literally be one sentence.</b> "
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "023e4135-98ed-4e65-9e26-66f340747529",
    "metadata": {},
@@ -189,7 +206,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "871a3661-2d74-4a50-b4ef-a89c27d978f3",
    "metadata": {},
    "outputs": [],
@@ -324,10 +341,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "spatial-geography",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'agent1': 0, 'agent2': 1}\n",
+      "{'agent1': 2, 'agent2': 1}\n",
+      "{'agent1': 3, 'agent2': 0}\n"
+     ]
+    }
+   ],
    "source": [
     "class DummyTrainer:\n",
     "    \"\"\"Dummy Trainer class used in Exercise #1.\n",
@@ -349,12 +376,39 @@
     "    print(dummy_trainer.compute_action({\"agent1\": np.array([0, 10]), \"agent2\": np.array([10, 0])}))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "31b314a1-fc8b-4dd2-8b49-e8ea88aaedd9",
+   "metadata": {},
+   "source": [
+    "<b>Sven: As I am sure you know, you will need to explain why env.render is not super reliable and what problems people can encounter depending on their operating system. You will, however, definitely need to run the optional code yourself before commenting it out.</b>\n",
+    "\n",
+    "`simple_image_viewer.imshow` might not work on your system due to X or Y (placeholder: replace with the actual causes). "
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "liable-district",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Episode done. R1=-61.0 R2=-16.69999999999997\n",
+      "Episode done. R1=-85.5 R2=-15.599999999999964\n",
+      "Episode done. R1=-51.0 R2=-18.899999999999963\n",
+      "Episode done. R1=-54.0 R2=-15.599999999999962\n",
+      "Episode done. R1=-42.0 R2=-18.899999999999963\n",
+      "Episode done. R1=-76.0 R2=-19.99999999999996\n",
+      "Episode done. R1=-42.5 R2=-17.79999999999996\n",
+      "Episode done. R1=-25.5 R2=-18.899999999999963\n",
+      "Episode done. R1=-53.5 R2=-19.99999999999996\n",
+      "Episode done. R1=-51.0 R2=-18.899999999999963\n"
+     ]
+    }
+   ],
    "source": [
     "# Solution to Exercise #1:\n",
     "#from gym.envs.classic_control.rendering import SimpleImageViewer\n",
@@ -382,12 +436,28 @@
     "    #simple_image_viewer.imshow(img)\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "95f756ab-c3f7-4da9-b5a1-a406c473baec",
+   "metadata": {},
+   "source": [
+    "<b>Sven: Say something like \"RLlib supports TensorFlow and PyTorch. In the code below, you will get an error if you only have PyTorch installed. You can get rid of the error by uncommenting the framework line.\"</b>"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "gentle-reliance",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2021-04-30 14:37:22,802\tWARNING util.py:53 -- Install gputil for GPU system monitoring.\n"
+     ]
+    }
+   ],
    "source": [
     "# 4) Plugging in RLlib.\n",
     "\n",
@@ -406,7 +476,7 @@
     "            \"height\": 10,\n",
     "        },\n",
     "    },\n",
-    "    # \"framework\": \"torch\",\n",
+    "    \"framework\": \"torch\",\n",
     "    \"create_env_on_driver\": True,\n",
     "}\n",
     "# Instantiate the Trainer object using above config.\n",
@@ -415,10 +485,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "spectacular-guard",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2021-04-30 14:37:23,747\tWARNING deprecation.py:33 -- DeprecationWarning: `SampleBatch.data[..]` has been deprecated. Use `SampleBatch[..]` instead. This will raise an error in the future!\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'episode_reward_max': -42.30000000000002, 'episode_reward_min': -88.50000000000016, 'episode_reward_mean': -66.3750000000001, 'episode_len_mean': 100.0, 'episode_media': {}, 'episodes_this_iter': 20, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-67.50000000000009, -51.00000000000005, -67.50000000000016, -54.000000000000156, -78.0000000000001, -88.50000000000016, -54.00000000000012, -58.50000000000012, -73.80000000000015, -58.50000000000003, -55.50000000000003, -66.90000000000013, -42.30000000000002, -72.0000000000001, -76.50000000000013, -66.00000000000013, -70.50000000000009, -75.0000000000001, -66.00000000000007, -85.50000000000013], 'episode_lengths': [100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 0.12121988938643144, 'mean_inference_ms': 0.7240949691711486, 'mean_action_processing_ms': 0.041630718257877376, 'mean_env_wait_ms': 0.022003819773366285, 'mean_env_render_ms': 0.0}, 'off_policy_estimator': {}, 'num_healthy_workers': 2, 'timesteps_total': 4000, 'agent_timesteps_total': 4000, 'timers': {'sample_time_ms': 939.612, 'sample_throughput': 4257.078, 'learn_time_ms': 5480.429, 'learn_throughput': 729.87, 'update_time_ms': 1.143}, 'info': {'learner': defaultdict(<class 'dict'>, {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.2, 'cur_lr': 5e-05, 'total_loss': 119.01326894760132, 'policy_loss': -0.04648278030799702, 'vf_loss': 119.05559778213501, 'vf_explained_var': 0.07897025, 'kl': 0.02077123534400016, 'entropy': 1.365920566022396, 'entropy_coeff': 0.0}}}), 'num_steps_sampled': 4000, 'num_agent_steps_sampled': 4000, 'num_steps_trained': 4000}, 'done': False, 'episodes_total': 20, 'training_iteration': 1, 'experiment_id': '9ef16921408145559d80c71ad3d6599b', 'date': '2021-04-30_14-37-29', 'timestamp': 1619818649, 'time_this_iter_s': 
6.424378156661987, 'time_total_s': 6.424378156661987, 'pid': 80917, 'hostname': 'Michaels-MacBook-Pro.local', 'node_ip': '192.168.4.37', 'config': {'num_workers': 2, 'num_envs_per_worker': 1, 'create_env_on_driver': True, 'rollout_fragment_length': 200, 'batch_mode': 'truncate_episodes', 'train_batch_size': 4000, 'model': {'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 'lstm_use_prev_reward': False, '_time_major': False, 'use_attention': False, 'attention_num_transformer_units': 1, 'attention_dim': 64, 'attention_num_heads': 1, 'attention_head_dim': 32, 'attention_memory_inference': 50, 'attention_memory_training': 50, 'attention_position_wise_mlp_dim': 32, 'attention_init_gru_gate_bias': 2.0, 'attention_use_n_prev_actions': 0, 'attention_use_n_prev_rewards': 0, 'num_framestacks': 'auto', 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None, 'custom_preprocessor': None, 'lstm_use_prev_action_reward': -1, 'framestack': True}, 'optimizer': {}, 'gamma': 0.99, 'horizon': None, 'soft_horizon': False, 'no_done_at_end': False, 'env': 'MultiAgentArena', 'env_config': {'config': {'width': 10, 'height': 10}}, 'render_env': False, 'record_env': False, 'normalize_actions': False, 'clip_rewards': None, 'clip_actions': True, 'preprocessor_pref': 'deepmind', 'lr': 5e-05, 'log_level': 'WARN', 'callbacks': <class 'ray.rllib.agents.callbacks.DefaultCallbacks'>, 'ignore_worker_failures': False, 'log_sys_usage': True, 'fake_sampler': False, 'framework': 'torch', 'eager_tracing': False, 'explore': True, 'exploration_config': {'type': 'StochasticSampling'}, 'evaluation_interval': None, 'evaluation_num_episodes': 10, 'in_evaluation': 
False, 'evaluation_config': {}, 'evaluation_num_workers': 0, 'custom_eval_function': None, 'sample_async': False, 'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>, 'observation_filter': 'NoFilter', 'synchronize_filters': True, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'compress_observations': False, 'collect_metrics_timeout': 180, 'metrics_smoothing_episodes': 100, 'remote_worker_envs': False, 'remote_env_batch_wait_ms': 0, 'min_iter_time_s': 0, 'timesteps_per_iteration': 0, 'seed': None, 'extra_python_environs_for_driver': {}, 'extra_python_environs_for_worker': {}, 'num_gpus': 0, '_fake_gpus': False, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': {}, 'num_cpus_for_driver': 1, 'placement_strategy': 'PACK', 'input': 'sampler', 'input_evaluation': ['is', 'wis'], 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': ['obs', 'new_obs'], 'output_max_file_size': 67108864, 'multiagent': {'policies': {}, 'policy_mapping_fn': None, 'policies_to_train': None, 'observation_fn': None, 'replay_mode': 'independent', 'count_steps_by': 'env_steps'}, 'logger_config': None, 'simple_optimizer': True, 'monitor': -1, 'use_critic': True, 'use_gae': True, 'lambda': 1.0, 'kl_coeff': 0.2, 'sgd_minibatch_size': 128, 'shuffle_sequences': True, 'num_sgd_iter': 30, 'lr_schedule': None, 'vf_loss_coeff': 1.0, 'entropy_coeff': 0.0, 'entropy_coeff_schedule': None, 'clip_param': 0.3, 'vf_clip_param': 10.0, 'grad_clip': None, 'kl_target': 0.01, 'vf_share_layers': -1}, 'time_since_restore': 6.424378156661987, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'perf': {'cpu_util_percent': 
10.96, 'ram_util_percent': 63.20000000000001}}\n"
+     ]
+    }
+   ],
    "source": [
     "# That's it, we are ready to train.\n",
     "# Calling `train` once runs a single \"training iteration\". One iteration\n",
@@ -431,10 +516,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "id": "1ed7b0ca-6981-44ae-9656-18566ed14a1d",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "iteration 0: R=-66.4275000000001\n",
+      "iteration 1: R=-65.79500000000009\n",
+      "iteration 2: R=-64.9912500000001\n",
+      "iteration 3: R=-64.05600000000008\n",
+      "iteration 4: R=-62.36400000000007\n",
+      "iteration 5: R=-61.95000000000008\n",
+      "iteration 6: R=-61.82700000000008\n",
+      "iteration 7: R=-61.56900000000007\n",
+      "iteration 8: R=-60.93600000000008\n",
+      "iteration 9: R=-61.419000000000075\n"
+     ]
+    }
+   ],
    "source": [
     "# Run `train()` n times. Try to repeatedly call this to see rewards increase.\n",
     "# Move on once you see episode rewards > -55.0.\n",
@@ -445,10 +547,71 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "aa7e0d09-e4fa-4657-b602-aa9d6750a33e",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Policy: <ray.rllib.policy.policy_template.PPOTorchPolicy object at 0x7f96a187d2e0>; Observation-space: Box(-1.0, 1.0, (200,), float32); Action-space: Discrete(4)\n",
+      "Model: FullyConnectedNetwork(\n",
+      "  (_logits): SlimFC(\n",
+      "    (_model): Sequential(\n",
+      "      (0): Linear(in_features=256, out_features=4, bias=True)\n",
+      "    )\n",
+      "  )\n",
+      "  (_hidden_layers): Sequential(\n",
+      "    (0): SlimFC(\n",
+      "      (_model): Sequential(\n",
+      "        (0): Linear(in_features=200, out_features=256, bias=True)\n",
+      "        (1): Tanh()\n",
+      "      )\n",
+      "    )\n",
+      "    (1): SlimFC(\n",
+      "      (_model): Sequential(\n",
+      "        (0): Linear(in_features=256, out_features=256, bias=True)\n",
+      "        (1): Tanh()\n",
+      "      )\n",
+      "    )\n",
+      "  )\n",
+      "  (_value_branch_separate): Sequential(\n",
+      "    (0): SlimFC(\n",
+      "      (_model): Sequential(\n",
+      "        (0): Linear(in_features=200, out_features=256, bias=True)\n",
+      "        (1): Tanh()\n",
+      "      )\n",
+      "    )\n",
+      "    (1): SlimFC(\n",
+      "      (_model): Sequential(\n",
+      "        (0): Linear(in_features=256, out_features=256, bias=True)\n",
+      "        (1): Tanh()\n",
+      "      )\n",
+      "    )\n",
+      "  )\n",
+      "  (_value_branch): SlimFC(\n",
+      "    (_model): Sequential(\n",
+      "      (0): Linear(in_features=256, out_features=1, bias=True)\n",
+      "    )\n",
+      "  )\n",
+      ")\n"
+     ]
+    },
+    {
+     "ename": "AttributeError",
+     "evalue": "'numpy.ndarray' object has no attribute 'float'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m-----------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0mTraceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-9-f30e3f21d95b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[0;31m# Generate the Model's output.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate_out\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"obs\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msingle_obs\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m \u001b[0;31m# tf1.x (static graph) -> Need to run this through a tf session.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/rllib/lib/python3.8/site-packages/ray/rllib/models/modelv2.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, input_dict, state, seq_lens)\u001b[0m\n\u001b[1;32m    232\u001b[0m             \u001b[0mrestored\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"obs_flat\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"obs\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    233\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 234\u001b[0;31m             \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrestored\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseq_lens\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    235\u001b[0m         if ((not isinstance(res, list) and not isinstance(res, tuple))\n\u001b[1;32m    236\u001b[0m                 or len(res) != 2):\n",
+      "\u001b[0;32m~/anaconda3/envs/rllib/lib/python3.8/site-packages/ray/rllib/models/torch/fcnet.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_dict, state, seq_lens)\u001b[0m\n\u001b[1;32m    121\u001b[0m                 \u001b[0mstate\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mTensorType\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    122\u001b[0m                 seq_lens: TensorType) -> (TensorType, List[TensorType]):\n\u001b[0;32m--> 123\u001b[0;31m         \u001b[0mobs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"obs_flat\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    124\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_last_flat_in\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    125\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_features\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hidden_layers\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_last_flat_in\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'float'"
+     ]
+    }
+   ],
    "source": [
     "# !LIVE CODING!\n",
     "# Let's actually \"look inside\" our Trainer to see what's in there.\n",