ROOT = r'/content/drive/MyDrive/ml-data/rl/pong-dqn'
MODEL_NAME = r'model'
MODEL_TRAINING = False
# Train a new model or continue to train the existing model. Skip training if False.
# no-verbose, no-clobber (do not download if already exists)
!wget -nv -nc "https://drive.google.com/uc?export=download&id=1mK8ZSW0uG2aZ4Ubov2cD9fiLq8HR-ucb" -O Roms.rar
!unrar x Roms.rar roms/ > /dev/null # extract to roms/
!python -m atari_py.import_roms roms > /dev/null
2021-09-06 05:02:00 URL:https://doc-0k-7c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/fhnef1ullememhjojdm5rh74cule7nj6/1630904475000/09349901208138295014/*/1mK8ZSW0uG2aZ4Ubov2cD9fiLq8HR-ucb?e=download [11128004] -> "Roms.rar" [1]
2021-09-06 05:02:02 URL:https://raw.githubusercontent.com/openai/baselines/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/atari_wrappers.py [9686/9686] -> "openai_baselines/atari_wrappers.py" [1]
2021-09-06 05:02:02 URL:https://raw.githubusercontent.com/openai/baselines/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/wrappers.py [946/946] -> "openai_baselines/wrappers.py" [1]
import time
import random
import pathlib
import numpy as np
import matplotlib.pyplot as plt
import gym
import gym.spaces
from gym.wrappers import Monitor
import tensorflow as tf
ENV_NAME = 'PongNoFrameskip-v4'
# Smoke test
gym.make(ENV_NAME)
<TimeLimit<AtariEnv<PongNoFrameskip-v4>>>
Wrappers:
- MaxAndSkipEnv (Wrapper) -- overrides the step method. The step method repeats the chosen action for `skip` frames, sums the rewards, and returns the element-wise max of the last two raw frames.
- PongFrameGrayscaleCrop (ObservationWrapper) -- crops the frame to 80x80 and converts it to grayscale (here by keeping a single color channel).
- FrameStack (ObservationWrapper) -- stacks 4 consecutive frames into one 4-channel image and returns it as the state.
- ScaledFloatFrame (ObservationWrapper) -- converts the byte image representation to floats: [0, 255] → [0, 1].
- LeftRightAction (ActionWrapper) -- reduces the action space to two actions: 0 - right, 1 - left.
#from openai_baselines.atari_wrappers import MaxAndSkipEnv
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env, skip=4):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
self._skip = skip
def step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
if i == self._skip - 2: self._obs_buffer[0] = obs
if i == self._skip - 1: self._obs_buffer[1] = obs
total_reward += reward
if done:
break
# Note that the observation on the done=True frame
# doesn't matter
max_frame = self._obs_buffer.max(axis=0)
return max_frame, total_reward, done, info
def reset(self, **kwargs):
return self.env.reset(**kwargs)
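# Toy check of the max-pooling idea above (illustrative, made-up values): Atari
# sprites can flicker between consecutive frames, and the element-wise max over
# the last two raw frames keeps a sprite that is visible in only one of them.
_frame_a = np.array([[0, 255], [0, 0]], dtype=np.uint8)  # sprite visible only here
_frame_b = np.array([[0, 0], [255, 0]], dtype=np.uint8)  # sprite visible only here
assert (np.stack([_frame_a, _frame_b]).max(axis=0) == np.array([[0, 255], [255, 0]])).all()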
class PongFrameGrayscaleCrop(gym.ObservationWrapper):
def __init__(self, env):
super().__init__(env)
self.observation_space = gym.spaces.Box(
low=0, high=255, shape=(80, 80, 1), dtype=np.uint8)
def observation(self, obs):
return obs[34:-16:2, ::2, 2:3]
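# Side check of the slice above: the raw ALE frame is assumed to be (210, 160, 3);
# taking rows 34..193 and every second row/column, plus one color channel, yields
# the declared (80, 80, 1) observation.
assert np.zeros((210, 160, 3), dtype=np.uint8)[34:-16:2, ::2, 2:3].shape == (80, 80, 1)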
class FrameStack(gym.ObservationWrapper):
def __init__(self, env, n_steps, dtype=np.float32):
super().__init__(env)
self.dtype = dtype
old_space = env.observation_space # (80, 80, 1)
self.observation_space = gym.spaces.Box(
old_space.low.repeat(n_steps, axis=-1), # (80, 80, 4)
old_space.high.repeat(n_steps, axis=-1), dtype=dtype)
def reset(self):
self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
return self.observation(self.env.reset())
def observation(self, observation):
self.buffer[..., :-1] = self.buffer[..., 1:]
self.buffer[..., -1:] = observation
return self.buffer # (80, 80, 4)
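# Toy illustration of the rolling update above (illustrative, one "pixel" only):
# the channel axis acts as a sliding window over the most recent frames.
_toy_stack = np.zeros(4, dtype=np.float32)          # stands in for self.buffer
for _toy_frame in (1.0, 2.0, 3.0, 4.0, 5.0):
    _toy_stack[:-1] = _toy_stack[1:]                # shift older frames left
    _toy_stack[-1] = _toy_frame                     # newest frame goes last
assert _toy_stack.tolist() == [2.0, 3.0, 4.0, 5.0]  # the oldest frame is dropped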
class ScaledFloatFrame(gym.ObservationWrapper):
def observation(self, obs):
return np.array(obs).astype(np.float32) / 255.0
class LeftRightAction(gym.ActionWrapper):
def __init__(self, env):
super().__init__(env)
self.action_space = gym.spaces.Discrete(2)
def action(self, act):
""" 0 -> RIGHT (2), 1 -> LEFT (3) """
return [2, 3][act]
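# Optional sanity check (assumes the standard ALE action meanings for Pong, where
# indices 2 and 3 of the full action set are RIGHT and LEFT):
assert gym.make(ENV_NAME).unwrapped.get_action_meanings()[2:4] == ['RIGHT', 'LEFT']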
def make_env(env_name):
env = gym.make(env_name)
env = MaxAndSkipEnv(env, skip=4)
env = PongFrameGrayscaleCrop(env)
env = FrameStack(env, 4)
env = ScaledFloatFrame(env)
env = LeftRightAction(env)
return env
env = make_env('PongNoFrameskip-v4')
s = env.reset()
for i in range(25):
s = env.step(i%2)[0]
fig, axes = plt.subplots(ncols=s.shape[-1], figsize=(16, 4))
for i, ax in enumerate(axes):
ax.imshow(s[..., i], cmap='gray')
s.shape
# State after 25 steps
(80, 80, 4)
INPUT_SHAPE = (80, 80, 4)
NUM_ACTIONS = 2
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense
def new_model():
model = tf.keras.models.Sequential([
Input(INPUT_SHAPE),
Conv2D(32, (8,8), strides=4, activation='relu'),
Conv2D(64, (4,4), strides=2, activation='relu'),
Conv2D(64, (3,3), strides=1, activation='relu'),
Flatten(),
Dense(512, activation='relu'),
Dense(NUM_ACTIONS),
])
return model
model = new_model()
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
conv2d (Conv2D)              (None, 19, 19, 32)        8224
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 8, 8, 64)          32832
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 6, 6, 64)          36928
_________________________________________________________________
flatten (Flatten)            (None, 2304)              0
_________________________________________________________________
dense (Dense)                (None, 512)               1180160
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1026
=================================================================
Total params: 1,259,170
Trainable params: 1,259,170
Non-trainable params: 0
_________________________________________________________________
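The spatial sizes in the summary follow from the usual 'valid' convolution arithmetic; a quick side calculation (illustrative helper, not used elsewhere):
def conv_out_size(size, kernel, stride):
    # Keras Conv2D with default 'valid' padding: floor((size - kernel) / stride) + 1
    return (size - kernel) // stride + 1

assert conv_out_size(80, 8, 4) == 19  # -> (19, 19, 32)
assert conv_out_size(19, 4, 2) == 8   # -> (8, 8, 64)
assert conv_out_size(8, 3, 1) == 6    # -> (6, 6, 64), i.e. 6*6*64 = 2304 after Flatten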
# Smoke test
test_state = env.observation_space.sample()
model(test_state[np.newaxis, ...])
<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[53.27661 , 0.9582041]], dtype=float32)>
class ExperienceBuffer:
def __init__(self, max_len, state_shape, state_dtype):
"""
Numpy array based buffer.
Supported operations: append (push), random sample.
Unsupported operations: get, pop, insert, remove.
Stores (state, action, reward, next_state, done) tuples, where
state0 : ndarray of shape (h, w, c)
action : int32
reward : float32
state1 : ndarray of shape (h, w, c)
done : bool
"""
self.states0 = np.zeros((max_len, *state_shape), dtype=state_dtype)
self.states1 = np.zeros((max_len, *state_shape), dtype=state_dtype)
self.actions = np.zeros(max_len, dtype=np.int32)
self.rewards = np.zeros(max_len, dtype=np.float32)
self.dones = np.zeros(max_len, dtype=np.bool)
self.max_len = max_len
self.len = 0
self.head = 0
def __len__(self):
""" Current buffer length. Never exeeds max_len. """
return self.len
def append(self, state, action, reward, next_state, done):
i = self.head % self.max_len
self.states0[i] = state
self.actions[i] = action
self.rewards[i] = reward
self.dones[i] = done
self.states1[i] = next_state
self.head = i + 1
self.len = min(self.len + 1, self.max_len)
def sample(self, batch_size):
idxs = np.random.choice(range(self.len), batch_size, replace=False)
return (
self.states0[idxs],
self.actions[idxs],
self.rewards[idxs],
self.dones[idxs],
self.states1[idxs],
)
test_state = np.arange(6).reshape(1, 2, 3)
test_buffer = ExperienceBuffer(10, state_shape=(1, 2, 3), state_dtype=np.float32)
assert test_buffer.len == 0
test_buffer.append(test_state[:], 0, 0., test_state[:], False)
assert test_buffer.len == 1
for i in range(15):
    test_buffer.append(test_state[:], 0, 0., test_state[:], False)
assert test_buffer.len == 10
assert [batch.shape for batch in test_buffer.sample(4)] \
== [(4, 1, 2, 3), (4,), (4,), (4,), (4, 1, 2, 3)]
assert [batch.dtype for batch in test_buffer.sample(1)] \
== [np.float32, np.int32, np.float32, np.bool, np.float32]
class Agent:
""" Class for interactions with environment and replay buffer.
"""
def __init__(self, env, exp_buffer):
self.env = env
self.exp_buffer = exp_buffer
self._reset()
def _reset(self):
        self.state = self.env.reset()
self.total_reward = 0.0
def play_step(self, model, epsilon=0.0):
done_reward = None
if np.random.random() < epsilon:
            action = self.env.action_space.sample()
else:
Q_s = model(self.state[np.newaxis, ...])[0] # (NUM_ACTIONS,)
action = np.argmax(Q_s) # (int)
next_state, reward, done, _ = self.env.step(action)
self.total_reward += reward
self.exp_buffer.append(self.state, action, reward, next_state, done)
self.state = next_state
if done:
done_reward = self.total_reward
self._reset()
return done_reward
# Smoke test
test_env = make_env(ENV_NAME)
test_buffer = ExperienceBuffer(10, state_shape=INPUT_SHAPE, state_dtype=np.float32)
test_agent = Agent(test_env, test_buffer)
test_agent.play_step(model)
MEAN_REWARD_BOUND = 19.0
gamma = 0.99
batch_size = 32
replay_size = 10000
learning_rate = 1e-4
sync_target_frames = 1000
replay_start_size = 10000
eps_start = 1.0
eps_decay = .999985
eps_min = 0.02
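# Side calculation (illustrative): with multiplicative decay per frame, epsilon
# falls from eps_start to eps_min after about log(eps_min/eps_start)/log(eps_decay)
# frames, i.e. roughly 260k frames with the values above.
frames_to_eps_min = np.log(eps_min / eps_start) / np.log(eps_decay)
print('epsilon reaches eps_min after ~%d frames' % frames_to_eps_min)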
root_path = pathlib.Path(ROOT)
model_path = root_path / MODEL_NAME
if model_path.exists():
model = tf.keras.models.load_model(model_path)
else:
model = new_model()
optimizer = tf.optimizers.Adam(learning_rate)
loss = tf.losses.Huber()
model.compile(optimizer, loss)
env = make_env(ENV_NAME)
target_model = tf.keras.models.clone_model(model) # clone architecture (weights not copied)
target_model.set_weights(model.get_weights()) # clone weights
buffer = ExperienceBuffer(replay_size,
state_shape=env.observation_space.shape,
state_dtype=env.observation_space.dtype)
agent = Agent(env, buffer)
epsilon = eps_start
total_rewards = []
frame_idx = 0
best_mean_reward = None
if MODEL_TRAINING:
while True:
frame_idx += 1
epsilon = max(epsilon*eps_decay, eps_min)
reward = agent.play_step(model, epsilon)
if reward is not None:
total_rewards.append(reward)
mean_reward = np.mean(total_rewards[-100:])
print("%d: %d games, mean reward %.3f, (epsilon %.2f)" %
(frame_idx, len(total_rewards), mean_reward, epsilon))
if best_mean_reward is None or best_mean_reward < mean_reward:
best_mean_reward = mean_reward
if best_mean_reward is not None:
print("Best mean reward updated %.3f" % (best_mean_reward))
try:
model.save(model_path)
except Exception as e:
print(f'Unable to save model to {model_path}.', repr(e))
if mean_reward > MEAN_REWARD_BOUND:
print("Solved in %d frames!" % frame_idx)
break
if len(buffer) < replay_start_size:
continue
batch = buffer.sample(batch_size)
states, actions, rewards, dones, next_states = batch
        # Q-values for every action in the next state `s'`
Q_next = target_model(next_states) # (batch, NUM_ACTIONS)
# Select max Q-value for each next state
next_state_values = np.max(Q_next, axis=1) # (batch,)
next_state_values[dones] = 0.0
expected_state_action_values = rewards + gamma * next_state_values
with tf.GradientTape() as tape:
# Get model predictions of Q-values
out = model(states) # (batch, NUM_ACTIONS)
state_action_values = tf.gather_nd(out, actions[:, None], batch_dims=1) # (batch,)
loss_value = loss(expected_state_action_values, state_action_values) # true, pred
grads = tape.gradient(loss_value, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
if frame_idx % sync_target_frames == 0:
target_model.set_weights(model.get_weights())
1379: 1 games, mean reward -19.000, (epsilon 0.98) Best mean reward updated -19.000
2325: 2 games, mean reward -20.000, (epsilon 0.97)
3591: 3 games, mean reward -19.000, (epsilon 0.95)
4537: 4 games, mean reward -19.500, (epsilon 0.93)
5483: 5 games, mean reward -19.800, (epsilon 0.92)
6709: 6 games, mean reward -19.833, (epsilon 0.90)
7595: 7 games, mean reward -20.000, (epsilon 0.89)
8541: 8 games, mean reward -20.125, (epsilon 0.88)
9644: 9 games, mean reward -20.111, (epsilon 0.87)
10775: 10 games, mean reward -20.200, (epsilon 0.85)
11723: 11 games, mean reward -20.273, (epsilon 0.84)
12922: 12 games, mean reward -20.167, (epsilon 0.82)
13936: 13 games, mean reward -20.077, (epsilon 0.81)
15134: 14 games, mean reward -20.000, (epsilon 0.80)
16175: 15 games, mean reward -20.000, (epsilon 0.78)
17158: 16 games, mean reward -20.000, (epsilon 0.77)
18317: 17 games, mean reward -20.000, (epsilon 0.76)
19708: 18 games, mean reward -19.833, (epsilon 0.74)
20652: 19 games, mean reward -19.895, (epsilon 0.73)
21814: 20 games, mean reward -19.900, (epsilon 0.72)
23063: 21 games, mean reward -19.952, (epsilon 0.71)
24067: 22 games, mean reward -20.000, (epsilon 0.70)
25388: 23 games, mean reward -19.957, (epsilon 0.68)
26456: 24 games, mean reward -20.000, (epsilon 0.67)
27903: 25 games, mean reward -19.880, (epsilon 0.66)
29232: 26 games, mean reward -19.769, (epsilon 0.65)
30518: 27 games, mean reward -19.778, (epsilon 0.63)
31679: 28 games, mean reward -19.786, (epsilon 0.62)
33396: 29 games, mean reward -19.724, (epsilon 0.61)
34883: 30 games, mean reward -19.767, (epsilon 0.59)
36194: 31 games, mean reward -19.806, (epsilon 0.58)
37380: 32 games, mean reward -19.844, (epsilon 0.57)
39130: 33 games, mean reward -19.758, (epsilon 0.56)
40890: 34 games, mean reward -19.618, (epsilon 0.54)
42678: 35 games, mean reward -19.514, (epsilon 0.53)
44387: 36 games, mean reward -19.528, (epsilon 0.51)
46320: 37 games, mean reward -19.459, (epsilon 0.50)
48096: 38 games, mean reward -19.421, (epsilon 0.49)
49926: 39 games, mean reward -19.256, (epsilon 0.47)
51673: 40 games, mean reward -19.250, (epsilon 0.46)
53114: 41 games, mean reward -19.244, (epsilon 0.45)
54890: 42 games, mean reward -19.214, (epsilon 0.44)
56728: 43 games, mean reward -19.186, (epsilon 0.43)
58424: 44 games, mean reward -19.136, (epsilon 0.42)
60141: 45 games, mean reward -19.111, (epsilon 0.41)
62623: 46 games, mean reward -19.022, (epsilon 0.39)
64128: 47 games, mean reward -19.021, (epsilon 0.38)
66667: 48 games, mean reward -18.979, (epsilon 0.37) Best mean reward updated -18.979
69179: 49 games, mean reward -18.918, (epsilon 0.35) Best mean reward updated -18.918
71771: 50 games, mean reward -18.780, (epsilon 0.34) Best mean reward updated -18.780
75002: 51 games, mean reward -18.627, (epsilon 0.32) Best mean reward updated -18.627
77848: 52 games, mean reward -18.423, (epsilon 0.31) Best mean reward updated -18.423
80481: 53 games, mean reward -18.377, (epsilon 0.30) Best mean reward updated -18.377
83493: 54 games, mean reward -18.222, (epsilon 0.29) Best mean reward updated -18.222
86431: 55 games, mean reward -18.182, (epsilon 0.27) Best mean reward updated -18.182
90202: 56 games, mean reward -17.929, (epsilon 0.26) Best mean reward updated -17.929
93674: 57 games, mean reward -17.807, (epsilon 0.25) Best mean reward updated -17.807
97405: 58 games, mean reward -17.707, (epsilon 0.23) Best mean reward updated -17.707
101096: 59 games, mean reward -17.542, (epsilon 0.22) Best mean reward updated -17.542
104468: 60 games, mean reward -17.450, (epsilon 0.21) Best mean reward updated -17.450
107665: 61 games, mean reward -17.361, (epsilon 0.20) Best mean reward updated -17.361
111144: 62 games, mean reward -16.952, (epsilon 0.19) Best mean reward updated -16.952
115524: 63 games, mean reward -16.746, (epsilon 0.18) Best mean reward updated -16.746
119284: 64 games, mean reward -16.500, (epsilon 0.17) Best mean reward updated -16.500
122894: 65 games, mean reward -16.354, (epsilon 0.16) Best mean reward updated -16.354
127027: 66 games, mean reward -16.167, (epsilon 0.15) Best mean reward updated -16.167
131446: 67 games, mean reward -15.970, (epsilon 0.14) Best mean reward updated -15.970
133618: 68 games, mean reward -16.015, (epsilon 0.13)
138033: 69 games, mean reward -15.855, (epsilon 0.13) Best mean reward updated -15.855
142283: 70 games, mean reward -15.743, (epsilon 0.12) Best mean reward updated -15.743
147587: 71 games, mean reward -15.507, (epsilon 0.11) Best mean reward updated -15.507
151038: 72 games, mean reward -15.167, (epsilon 0.10) Best mean reward updated -15.167
155839: 73 games, mean reward -14.918, (epsilon 0.10) Best mean reward updated -14.918
158918: 74 games, mean reward -14.500, (epsilon 0.09) Best mean reward updated -14.500
162679: 75 games, mean reward -14.120, (epsilon 0.09) Best mean reward updated -14.120
165730: 76 games, mean reward -13.737, (epsilon 0.08) Best mean reward updated -13.737
168441: 77 games, mean reward -13.714, (epsilon 0.08) Best mean reward updated -13.714
171174: 78 games, mean reward -13.385, (epsilon 0.08) Best mean reward updated -13.385
175422: 79 games, mean reward -13.127, (epsilon 0.07) Best mean reward updated -13.127
179651: 80 games, mean reward -12.887, (epsilon 0.07) Best mean reward updated -12.887
183152: 81 games, mean reward -12.531, (epsilon 0.06) Best mean reward updated -12.531
186111: 82 games, mean reward -12.183, (epsilon 0.06) Best mean reward updated -12.183
190600: 83 games, mean reward -11.952, (epsilon 0.06) Best mean reward updated -11.952
193937: 84 games, mean reward -11.643, (epsilon 0.05) Best mean reward updated -11.643
198586: 85 games, mean reward -11.435, (epsilon 0.05) Best mean reward updated -11.435
201869: 86 games, mean reward -11.163, (epsilon 0.05) Best mean reward updated -11.163
206503: 87 games, mean reward -11.000, (epsilon 0.05) Best mean reward updated -11.000
211416: 88 games, mean reward -10.795, (epsilon 0.04) Best mean reward updated -10.795
214673: 89 games, mean reward -10.528, (epsilon 0.04) Best mean reward updated -10.528
217600: 90 games, mean reward -10.222, (epsilon 0.04) Best mean reward updated -10.222
220469: 91 games, mean reward -9.923, (epsilon 0.04) Best mean reward updated -9.923
223785: 92 games, mean reward -9.674, (epsilon 0.03) Best mean reward updated -9.674
226832: 93 games, mean reward -9.387, (epsilon 0.03) Best mean reward updated -9.387
229700: 94 games, mean reward -9.106, (epsilon 0.03) Best mean reward updated -9.106
...
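The tf.gather_nd(..., batch_dims=1) call in the training loop picks, for each transition in the batch, the predicted Q-value of the action that was actually taken; a minimal standalone sketch with made-up numbers:
q_toy = tf.constant([[1.0, 2.0],
                     [3.0, 4.0],
                     [5.0, 6.0]])                  # (batch=3, NUM_ACTIONS=2)
actions_toy = np.array([1, 0, 1], dtype=np.int32)  # action taken in each transition
q_taken = tf.gather_nd(q_toy, actions_toy[:, None], batch_dims=1)
print(q_taken.numpy())                             # [2. 3. 6.]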
if 'vdisplay' not in locals(): # run once
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install pyvirtualdisplay > /dev/null 2>&1
from pyvirtualdisplay import Display
vdisplay = Display(visible=0, size=(1400, 900))
vdisplay.start()
def show_video():
import pathlib
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
pwd = pathlib.Path()
mp4list = list(pwd.glob('video/*.mp4'))
if len(mp4list) > 0:
mp4_path = mp4list[0]
print(mp4_path)
video = mp4_path.read_bytes()
encoded = base64.b64encode(video)
ipythondisplay.display(HTML(data='''<video alt="test" autoplay
loop controls style="height: 400px;">
<source src="data:video/mp4;base64,{0}" type="video/mp4" />
</video>'''.format(encoded.decode('ascii'))))
else:
print("Could not find video")
model = tf.keras.models.load_model(model_path)
env_sim = Monitor(env, './video', force=True)
s = env_sim.reset()
totalReward = 0
for _ in range(2000):
env_sim.render()
a = np.argmax(model(s[None, ...])[0])
s, r, done, _ = env_sim.step(a)
totalReward += r
if r != 0:
print('New reward = {}'.format(r))
if done:
break
env_sim.close()
print('Total reward = {}'.format(totalReward))
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = -1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
New reward = 1.0
Total reward = 17.0
show_video()
video/openaigym.video.6.61.video000000.mp4
Based on articles (2020-08-15):
Notebook with full code (PyTorch):