Pong Q-Learning

Settings

In [ ]:
ROOT = r'/content/drive/MyDrive/ml-data/rl/pong-dqn'
MODEL_NAME = r'model'
MODEL_TRAINING = False
    # If True, train a new model (or continue training the existing one); if False, skip training.

Downloads

In [ ]:
# no-verbose, no-clobber (do not download if it already exists)
!wget -nv -nc "https://drive.google.com/uc?export=download&id=1mK8ZSW0uG2aZ4Ubov2cD9fiLq8HR-ucb" -O Roms.rar
!unrar x Roms.rar roms/ > /dev/null # extract to roms/
!python -m atari_py.import_roms roms > /dev/null
2021-09-06 05:02:00 URL:https://doc-0k-7c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/fhnef1ullememhjojdm5rh74cule7nj6/1630904475000/09349901208138295014/*/1mK8ZSW0uG2aZ4Ubov2cD9fiLq8HR-ucb?e=download [11128004] -> "Roms.rar" [1]
2021-09-06 05:02:02 URL:https://raw.githubusercontent.com/openai/baselines/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/atari_wrappers.py [9686/9686] -> "openai_baselines/atari_wrappers.py" [1]
2021-09-06 05:02:02 URL:https://raw.githubusercontent.com/openai/baselines/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/wrappers.py [946/946] -> "openai_baselines/wrappers.py" [1]

Imports

In [ ]:
import time
import random
import pathlib

import numpy as np

import matplotlib.pyplot as plt

import gym
import gym.spaces
from gym.wrappers import Monitor

import tensorflow as tf

Gym Environment

In [ ]:
ENV_NAME = 'PongNoFrameskip-v4'

# Smoke test
gym.make(ENV_NAME)
Out[ ]:
<TimeLimit<AtariEnv<PongNoFrameskip-v4>>>

Wrappers:

  1. MaxAndSkipEnv (Wrapper) -- overrides the step method. The step method:
    1. Returns only every 4th frame.
    2. Merges (max-pools) the two most recent frames.
  2. PongFrameGrayscaleCrop (ObservationWrapper) -- crops the frame to 80x80 and keeps a single color channel (effectively grayscale; see the slice sketch after this list).
  3. FrameStack (ObservationWrapper) -- stacks 4 consecutive frames into one 4-channel image and returns it as the state.
  4. ScaledFloatFrame (ObservationWrapper) -- converts the byte image to floats: [0, 255] → [0, 1].
  5. LeftRightAction (ActionWrapper) -- reduces the action space to two actions: 0 = right, 1 = left.
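A minimal sketch (not from the original notebook) of the slice arithmetic used by PongFrameGrayscaleCrop, assuming the standard 210x160 RGB Atari frame:

In [ ]:
import numpy as np

# Hypothetical raw Atari frame: 210x160 RGB (the standard ALE screen size).
raw_frame = np.zeros((210, 160, 3), dtype=np.uint8)

# The slice used by PongFrameGrayscaleCrop:
#   rows 34:-16:2 -> (194 - 34) / 2 = 80 rows (score bar and borders cropped)
#   cols ::2      -> 160 / 2 = 80 columns
#   chans 2:3     -> a single color channel is kept
cropped = raw_frame[34:-16:2, ::2, 2:3]
assert cropped.shape == (80, 80, 1)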
In [ ]:
#from openai_baselines.atari_wrappers import MaxAndSkipEnv

class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
        self._skip       = skip

    def step(self, action):
        """Repeat action, sum reward, and max over last observations."""
        total_reward = 0.0
        done = None
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if i == self._skip - 2: self._obs_buffer[0] = obs
            if i == self._skip - 1: self._obs_buffer[1] = obs
            total_reward += reward
            if done:
                break
        # Note that the observation on the done=True frame
        # doesn't matter
        max_frame = self._obs_buffer.max(axis=0)

        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)


class PongFrameGrayscaleCrop(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(80, 80, 1), dtype=np.uint8)

    def observation(self, obs):
        return obs[34:-16:2, ::2, 2:3]


class FrameStack(gym.ObservationWrapper):
    def __init__(self, env, n_steps, dtype=np.float32):
        super().__init__(env)
        self.dtype = dtype
        old_space = env.observation_space            # (80, 80, 1)
        self.observation_space = gym.spaces.Box(
            old_space.low.repeat(n_steps, axis=-1),  # (80, 80, 4)
            old_space.high.repeat(n_steps, axis=-1), dtype=dtype)

    def reset(self):
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        return self.observation(self.env.reset())

    def observation(self, observation):
        self.buffer[..., :-1] = self.buffer[..., 1:]
        self.buffer[..., -1:] = observation
        return self.buffer  # (80, 80, 4)


class ScaledFloatFrame(gym.ObservationWrapper):
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0


class LeftRightAction(gym.ActionWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.action_space = gym.spaces.Discrete(2)
    
    def action(self, act):
        """ 0 -> RIGHT (2), 1 -> LEFT (3) """
        return [2, 3][act]


def make_env(env_name):
    env = gym.make(env_name)
    env = MaxAndSkipEnv(env, skip=4)
    env = PongFrameGrayscaleCrop(env)
    env = FrameStack(env, 4)
    env = ScaledFloatFrame(env)
    env = LeftRightAction(env)
    return env
In [ ]:
env = make_env('PongNoFrameskip-v4')

s = env.reset()
for i in range(25):
    s = env.step(i%2)[0]

fig, axes = plt.subplots(ncols=s.shape[-1], figsize=(16, 4))
for i, ax in enumerate(axes):
    ax.imshow(s[..., i], cmap='gray')
s.shape

# State after 25 steps
/usr/local/lib/python3.7/dist-packages/gym/logger.py:30: UserWarning: WARN: Box bound precision lowered by casting to float32
  warnings.warn(colorize('%s: %s'%('WARN', msg % args), 'yellow'))
Out[ ]:
(80, 80, 4)

Deep Q-Network

In [ ]:
INPUT_SHAPE = (80, 80, 4)
NUM_ACTIONS = 2
In [ ]:
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense

def new_model():
    model = tf.keras.models.Sequential([
        Input(INPUT_SHAPE),
        Conv2D(32, (8,8), strides=4, activation='relu'),
        Conv2D(64, (4,4), strides=2, activation='relu'),
        Conv2D(64, (3,3), strides=1, activation='relu'),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(NUM_ACTIONS),
    ])
    return model

model = new_model()
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 19, 19, 32)        8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 8, 8, 64)          32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 6, 6, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 2304)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               1180160   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1026      
=================================================================
Total params: 1,259,170
Trainable params: 1,259,170
Non-trainable params: 0
_________________________________________________________________
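As a quick sanity check (not in the original notebook), the output shapes in the summary follow from the 'valid'-padding formula floor((in - kernel) / stride) + 1:

In [ ]:
def conv_out(size, kernel, stride):
    # 'valid' padding (the Keras default): floor((size - kernel) / stride) + 1
    return (size - kernel) // stride + 1

assert conv_out(80, 8, 4) == 19   # conv2d
assert conv_out(19, 4, 2) == 8    # conv2d_1
assert conv_out(8, 3, 1) == 6     # conv2d_2
assert 6 * 6 * 64 == 2304         # flatten -> input to Dense(512)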
In [ ]:
# Smoke test
test_state = env.observation_space.sample()
model(test_state[np.newaxis, ...])
Out[ ]:
<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[53.27661  ,  0.9582041]], dtype=float32)>

Experience buffer

In [ ]:
class ExperienceBuffer:
    def __init__(self, max_len, state_shape, state_dtype):
        """
        Numpy array based buffer.
        Supported operations: append (push), random sample.
        Unsupported operations: get, pop, insert, remove.

        Stores (state, action, reward, next_state, done) tuples, where
            state0 : ndarray of shape (h, w, c)
            action : int32
            reward : float32
            state1 : ndarray of shape (h, w, c)
            done : bool
        """
        self.states0 = np.zeros((max_len, *state_shape), dtype=state_dtype)
        self.states1 = np.zeros((max_len, *state_shape), dtype=state_dtype)
        self.actions = np.zeros(max_len, dtype=np.int32)
        self.rewards = np.zeros(max_len, dtype=np.float32)
        self.dones = np.zeros(max_len, dtype=bool)
        self.max_len = max_len
        self.len = 0
        self.head = 0

    def __len__(self):
        """ Current buffer length. Never exeeds max_len. """
        return self.len

    def append(self, state, action, reward, next_state, done):
        i = self.head % self.max_len
        self.states0[i] = state
        self.actions[i] = action
        self.rewards[i] = reward
        self.dones[i] = done
        self.states1[i] = next_state
        self.head = i + 1
        self.len = min(self.len + 1, self.max_len)

    def sample(self, batch_size):
        idxs = np.random.choice(range(self.len), batch_size, replace=False)
        return (
            self.states0[idxs],
            self.actions[idxs],
            self.rewards[idxs],
            self.dones[idxs],
            self.states1[idxs],
        )

test_state = np.arange(6).reshape(1, 2, 3)
test_buffer = ExperienceBuffer(10, state_shape=(1, 2, 3), state_dtype=np.float32)
assert test_buffer.len == 0
test_buffer.append(test_state[:], 0, 0., test_state[:], False)
assert test_buffer.len == 1
for i in range(15):
    test_buffer.append(test_state[:], 0, 0., test_state[:], False)
assert test_buffer.len == 10
assert [batch.shape for batch in test_buffer.sample(4)] \
    == [(4, 1, 2, 3), (4,), (4,), (4,), (4, 1, 2, 3)]
assert [batch.dtype for batch in test_buffer.sample(1)] \
    == [np.float32, np.int32, np.float32, bool, np.float32]
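One more illustrative check (hypothetical toy values, not part of the original tests): the buffer is a ring, so once max_len entries have been written, append starts overwriting the oldest slots.

In [ ]:
ring = ExperienceBuffer(3, state_shape=(1,), state_dtype=np.float32)
for step in range(5):
    ring.append([step], 0, 0., [step], False)

# Five appends into a buffer of capacity 3: slots 0 and 1 were overwritten
# by steps 3 and 4, slot 2 still holds step 2.
assert len(ring) == 3
assert ring.states0[:, 0].tolist() == [3.0, 4.0, 2.0]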

Agent

In [ ]:
class Agent:
    """ Class for interactions with environment and replay buffer.
    """
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, model, epsilon=0.0):

        done_reward = None
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            Q_s = model(self.state[np.newaxis, ...])[0]  # (NUM_ACTIONS,)
            action = np.argmax(Q_s)                      # (int)

        next_state, reward, done, _ = self.env.step(action)
        self.total_reward += reward

        self.exp_buffer.append(self.state, action, reward, next_state, done)
        self.state = next_state
        if done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

# Smoke test
test_env = make_env(ENV_NAME)

test_buffer = ExperienceBuffer(10, state_shape=INPUT_SHAPE, state_dtype=np.float32)
test_agent = Agent(test_env, test_buffer)

test_agent.play_step(model)
/usr/local/lib/python3.7/dist-packages/gym/logger.py:30: UserWarning: WARN: Box bound precision lowered by casting to float32
  warnings.warn(colorize('%s: %s'%('WARN', msg % args), 'yellow'))

Training loop

In [ ]:
MEAN_REWARD_BOUND = 19.0        # stop training once the 100-game mean reward exceeds this

gamma = 0.99                    # discount factor
batch_size = 32                 # minibatch size sampled from the replay buffer
replay_size = 10000             # replay buffer capacity
learning_rate = 1e-4            # Adam learning rate
sync_target_frames = 1000       # copy weights to the target network every N frames
replay_start_size = 10000       # frames to collect before training starts

eps_start = 1.0                 # initial exploration rate
eps_decay = .999985             # multiplicative epsilon decay per frame
eps_min = 0.02                  # final exploration rate
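A rough back-of-the-envelope check (not from the original notebook) of how long this epsilon schedule takes to anneal: solving eps_start * eps_decay**n = eps_min gives n of roughly 260,000 frames, which matches the epsilon values printed in the training log below.

In [ ]:
import math

# n such that eps_start * eps_decay**n == eps_min
n_frames = math.log(eps_min / eps_start) / math.log(eps_decay)
print(f"epsilon reaches eps_min after ~{n_frames:,.0f} frames")  # ~260,000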
In [ ]:
root_path = pathlib.Path(ROOT)
model_path = root_path / MODEL_NAME

# The training loop below uses these directly, so define them even when
# continuing from a saved model.
optimizer = tf.optimizers.Adam(learning_rate)
loss = tf.losses.Huber()

if model_path.exists():
    model = tf.keras.models.load_model(model_path)
else:
    model = new_model()
    model.compile(optimizer, loss)
In [ ]:
env = make_env(ENV_NAME)
target_model = tf.keras.models.clone_model(model)  # clone architecture (weights not copied)
target_model.set_weights(model.get_weights())      # clone weights

buffer = ExperienceBuffer(replay_size,
                          state_shape=env.observation_space.shape,
                          state_dtype=env.observation_space.dtype)

agent = Agent(env, buffer)

epsilon = eps_start

total_rewards = []
frame_idx = 0
best_mean_reward = None
/usr/local/lib/python3.7/dist-packages/gym/logger.py:30: UserWarning: WARN: Box bound precision lowered by casting to float32
  warnings.warn(colorize('%s: %s'%('WARN', msg % args), 'yellow'))
In [ ]:
if MODEL_TRAINING:
    while True:
        frame_idx += 1
        epsilon = max(epsilon*eps_decay, eps_min)
        reward = agent.play_step(model, epsilon)

        if reward is not None:
            total_rewards.append(reward)
            mean_reward = np.mean(total_rewards[-100:])
            print("%d: %d games, mean reward %.3f, (epsilon %.2f)" % 
                (frame_idx, len(total_rewards), mean_reward, epsilon))
        
            if best_mean_reward is None or best_mean_reward < mean_reward:
                best_mean_reward = mean_reward
                print("Best mean reward updated %.3f" % best_mean_reward)
                try:
                    model.save(model_path)
                except Exception as e:
                    print(f'Unable to save model to {model_path}.', repr(e))

            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        if len(buffer) < replay_start_size:
            continue

        batch = buffer.sample(batch_size)
        states, actions, rewards, dones, next_states = batch

        # Target-network Q-values for each action in the next state s'
        Q_next = target_model(next_states)          # (batch, NUM_ACTIONS)

        # Select max Q-value for each next state
        next_state_values = np.max(Q_next, axis=1)  # (batch,)
        next_state_values[dones] = 0.0

        expected_state_action_values = rewards + gamma * next_state_values

        with tf.GradientTape() as tape:
            # Get model predictions of Q-values
            out = model(states)  # (batch, NUM_ACTIONS)
            state_action_values = tf.gather_nd(out, actions[:, None], batch_dims=1)  # (batch,)
            loss_value = loss(expected_state_action_values, state_action_values)  # true, pred

        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_idx % sync_target_frames == 0:
            target_model.set_weights(model.get_weights())
Training log (truncated):
1379: 1 games, mean reward -19.000, (epsilon 0.98)
Best mean reward updated -19.000
2325: 2 games, mean reward -20.000, (epsilon 0.97)
3591: 3 games, mean reward -19.000, (epsilon 0.95)
4537: 4 games, mean reward -19.500, (epsilon 0.93)
5483: 5 games, mean reward -19.800, (epsilon 0.92)
6709: 6 games, mean reward -19.833, (epsilon 0.90)
7595: 7 games, mean reward -20.000, (epsilon 0.89)
8541: 8 games, mean reward -20.125, (epsilon 0.88)
9644: 9 games, mean reward -20.111, (epsilon 0.87)
10775: 10 games, mean reward -20.200, (epsilon 0.85)
11723: 11 games, mean reward -20.273, (epsilon 0.84)
12922: 12 games, mean reward -20.167, (epsilon 0.82)
13936: 13 games, mean reward -20.077, (epsilon 0.81)
15134: 14 games, mean reward -20.000, (epsilon 0.80)
16175: 15 games, mean reward -20.000, (epsilon 0.78)
17158: 16 games, mean reward -20.000, (epsilon 0.77)
18317: 17 games, mean reward -20.000, (epsilon 0.76)
19708: 18 games, mean reward -19.833, (epsilon 0.74)
20652: 19 games, mean reward -19.895, (epsilon 0.73)
21814: 20 games, mean reward -19.900, (epsilon 0.72)
23063: 21 games, mean reward -19.952, (epsilon 0.71)
24067: 22 games, mean reward -20.000, (epsilon 0.70)
25388: 23 games, mean reward -19.957, (epsilon 0.68)
26456: 24 games, mean reward -20.000, (epsilon 0.67)
27903: 25 games, mean reward -19.880, (epsilon 0.66)
29232: 26 games, mean reward -19.769, (epsilon 0.65)
30518: 27 games, mean reward -19.778, (epsilon 0.63)
31679: 28 games, mean reward -19.786, (epsilon 0.62)
33396: 29 games, mean reward -19.724, (epsilon 0.61)
34883: 30 games, mean reward -19.767, (epsilon 0.59)
36194: 31 games, mean reward -19.806, (epsilon 0.58)
37380: 32 games, mean reward -19.844, (epsilon 0.57)
39130: 33 games, mean reward -19.758, (epsilon 0.56)
40890: 34 games, mean reward -19.618, (epsilon 0.54)
42678: 35 games, mean reward -19.514, (epsilon 0.53)
44387: 36 games, mean reward -19.528, (epsilon 0.51)
46320: 37 games, mean reward -19.459, (epsilon 0.50)
48096: 38 games, mean reward -19.421, (epsilon 0.49)
49926: 39 games, mean reward -19.256, (epsilon 0.47)
51673: 40 games, mean reward -19.250, (epsilon 0.46)
53114: 41 games, mean reward -19.244, (epsilon 0.45)
54890: 42 games, mean reward -19.214, (epsilon 0.44)
56728: 43 games, mean reward -19.186, (epsilon 0.43)
58424: 44 games, mean reward -19.136, (epsilon 0.42)
60141: 45 games, mean reward -19.111, (epsilon 0.41)
62623: 46 games, mean reward -19.022, (epsilon 0.39)
64128: 47 games, mean reward -19.021, (epsilon 0.38)
66667: 48 games, mean reward -18.979, (epsilon 0.37)
Best mean reward updated -18.979
69179: 49 games, mean reward -18.918, (epsilon 0.35)
Best mean reward updated -18.918
71771: 50 games, mean reward -18.780, (epsilon 0.34)
Best mean reward updated -18.780
75002: 51 games, mean reward -18.627, (epsilon 0.32)
Best mean reward updated -18.627
77848: 52 games, mean reward -18.423, (epsilon 0.31)
Best mean reward updated -18.423
80481: 53 games, mean reward -18.377, (epsilon 0.30)
Best mean reward updated -18.377
83493: 54 games, mean reward -18.222, (epsilon 0.29)
Best mean reward updated -18.222
86431: 55 games, mean reward -18.182, (epsilon 0.27)
Best mean reward updated -18.182
90202: 56 games, mean reward -17.929, (epsilon 0.26)
Best mean reward updated -17.929
93674: 57 games, mean reward -17.807, (epsilon 0.25)
Best mean reward updated -17.807
97405: 58 games, mean reward -17.707, (epsilon 0.23)
Best mean reward updated -17.707
101096: 59 games, mean reward -17.542, (epsilon 0.22)
Best mean reward updated -17.542
104468: 60 games, mean reward -17.450, (epsilon 0.21)
Best mean reward updated -17.450
107665: 61 games, mean reward -17.361, (epsilon 0.20)
Best mean reward updated -17.361
111144: 62 games, mean reward -16.952, (epsilon 0.19)
Best mean reward updated -16.952
115524: 63 games, mean reward -16.746, (epsilon 0.18)
Best mean reward updated -16.746
119284: 64 games, mean reward -16.500, (epsilon 0.17)
Best mean reward updated -16.500
122894: 65 games, mean reward -16.354, (epsilon 0.16)
Best mean reward updated -16.354
127027: 66 games, mean reward -16.167, (epsilon 0.15)
Best mean reward updated -16.167
131446: 67 games, mean reward -15.970, (epsilon 0.14)
Best mean reward updated -15.970
133618: 68 games, mean reward -16.015, (epsilon 0.13)
138033: 69 games, mean reward -15.855, (epsilon 0.13)
Best mean reward updated -15.855
142283: 70 games, mean reward -15.743, (epsilon 0.12)
Best mean reward updated -15.743
147587: 71 games, mean reward -15.507, (epsilon 0.11)
Best mean reward updated -15.507
151038: 72 games, mean reward -15.167, (epsilon 0.10)
Best mean reward updated -15.167
155839: 73 games, mean reward -14.918, (epsilon 0.10)
Best mean reward updated -14.918
158918: 74 games, mean reward -14.500, (epsilon 0.09)
Best mean reward updated -14.500
162679: 75 games, mean reward -14.120, (epsilon 0.09)
Best mean reward updated -14.120
165730: 76 games, mean reward -13.737, (epsilon 0.08)
Best mean reward updated -13.737
168441: 77 games, mean reward -13.714, (epsilon 0.08)
Best mean reward updated -13.714
171174: 78 games, mean reward -13.385, (epsilon 0.08)
Best mean reward updated -13.385
175422: 79 games, mean reward -13.127, (epsilon 0.07)
Best mean reward updated -13.127
179651: 80 games, mean reward -12.887, (epsilon 0.07)
Best mean reward updated -12.887
183152: 81 games, mean reward -12.531, (epsilon 0.06)
Best mean reward updated -12.531
186111: 82 games, mean reward -12.183, (epsilon 0.06)
Best mean reward updated -12.183
190600: 83 games, mean reward -11.952, (epsilon 0.06)
Best mean reward updated -11.952
193937: 84 games, mean reward -11.643, (epsilon 0.05)
Best mean reward updated -11.643
198586: 85 games, mean reward -11.435, (epsilon 0.05)
Best mean reward updated -11.435
201869: 86 games, mean reward -11.163, (epsilon 0.05)
Best mean reward updated -11.163
206503: 87 games, mean reward -11.000, (epsilon 0.05)
Best mean reward updated -11.000
211416: 88 games, mean reward -10.795, (epsilon 0.04)
Best mean reward updated -10.795
214673: 89 games, mean reward -10.528, (epsilon 0.04)
Best mean reward updated -10.528
217600: 90 games, mean reward -10.222, (epsilon 0.04)
Best mean reward updated -10.222
220469: 91 games, mean reward -9.923, (epsilon 0.04)
Best mean reward updated -9.923
223785: 92 games, mean reward -9.674, (epsilon 0.03)
Best mean reward updated -9.674
226832: 93 games, mean reward -9.387, (epsilon 0.03)
Best mean reward updated -9.387
229700: 94 games, mean reward -9.106, (epsilon 0.03)
Best mean reward updated -9.106
...
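For reference, a minimal sketch (with hypothetical toy values) of the target computation and the gather_nd call used in the loop above:

In [ ]:
# Two transitions: target-network Q-values for the next states,
# the rewards received, and whether each episode ended.
Q_next  = np.array([[0.2, 0.5],
                    [0.9, 0.1]], dtype=np.float32)
rewards = np.array([0.0, 1.0], dtype=np.float32)
dones   = np.array([False, True])

next_state_values = Q_next.max(axis=1)         # [0.5, 0.9]
next_state_values[dones] = 0.0                 # terminal next states contribute nothing
targets = rewards + gamma * next_state_values  # [0.495, 1.0]

# gather_nd with batch_dims=1 selects Q(s, a) for the action actually taken.
out = tf.constant([[1.0, 2.0], [3.0, 4.0]])
actions = np.array([1, 0], dtype=np.int32)
q_sa = tf.gather_nd(out, actions[:, None], batch_dims=1)  # [2.0, 3.0]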

AI showmatch

In [ ]:
if 'vdisplay' not in locals():  # run once
  
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !pip install pyvirtualdisplay > /dev/null 2>&1

    from pyvirtualdisplay import Display
    vdisplay = Display(visible=0, size=(1400, 900))
    vdisplay.start()
In [ ]:
def show_video():
    import pathlib
    import base64
    from IPython.display import HTML
    from IPython import display as ipythondisplay

    pwd = pathlib.Path()
    mp4list = list(pwd.glob('video/*.mp4'))
    if len(mp4list) > 0:
        mp4_path = mp4list[0]
        print(mp4_path)
        video = mp4_path.read_bytes()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
            loop controls style="height: 400px;">
            <source src="data:video/mp4;base64,{0}" type="video/mp4" />
            </video>'''.format(encoded.decode('ascii'))))
    else: 
        print("Could not find video")
In [ ]:
model = tf.keras.models.load_model(model_path)

env_sim = Monitor(env, './video', force=True) 
s = env_sim.reset()

totalReward = 0

for _ in range(2000):
    env_sim.render()
    
    a = np.argmax(model(s[None, ...])[0])
    s, r, done, _ = env_sim.step(a)

    totalReward += r
    if r != 0:
        print('New reward = {}'.format(r))
    if done:        
        break
      
env_sim.close()

print('Total reward = {}'.format(totalReward))
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = -1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
Total reward = 17.0
In [ ]:
show_video()
video/openaigym.video.6.61.video000000.mp4

References

Based on the articles (2020-08-15):

  1. Deep Q-Network (DQN)-I
  2. Deep Q-Network (DQN)-II

Notebook with the full code (PyTorch):

  1. GitHub: https://github.com/jorditorresBCN/Deep-Reinforcement-Learning-Explained/blob/master/DRL_15_16_17_DQN_Pong.ipynb
  2. Colab: https://colab.research.google.com/github/jorditorresBCN/Deep-Reinforcement-Learning-Explained/blob/master/DRL_15_16_17_DQN_Pong.ipynb