Describe the bug
Loading a DDPG agent that was trained with normalized observations or normalized returns does not work: the restored agent does not have the correct critic or the correct policy. This is because the non-trainable parameters used for normalization are not saved and loaded (their scopes are "obs_rms" and "ret_rms").
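As a quick way to see the problem (this is my own diagnostic snippet; the model.graph and model.params attributes are what stable-baselines exposes in the versions I tried, so treat the names as approximations), the rms variables exist in the TensorFlow graph but never appear in the parameter list that DDPG serializes:

import tensorflow as tf

# Variables created by the running-mean/std normalizers live under the
# "obs_rms" / "ret_rms" scopes of the model's graph.
with model.graph.as_default():
    rms_vars = [v for v in tf.global_variables()
                if "obs_rms" in v.name or "ret_rms" in v.name]

print([v.name for v in rms_vars])      # non-empty when normalization is enabled
print([p.name for p in model.params])  # parameters that get saved; no obs_rms/ret_rms entries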
Code example
import gym
import numpy as np

from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
from stable_baselines import DDPG

SAVED_MODEL = "Pendulum-v0-normalized"


def evaluate(model, num_steps=1000):
    """
    Evaluate a RL agent

    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward for the last 100 episodes
    """
    episode_rewards = [0.0]
    obs = env.reset()
    for i in range(num_steps):
        # _states are only useful when using LSTM policies
        action, _states = model.predict(obs)
        # here, action, rewards and dones are arrays
        # because we are using vectorized env
        obs, rewards, dones, info = env.step(action)
        # Stats
        episode_rewards[-1] += rewards[0]
        if dones[0]:
            obs = env.reset()
            episode_rewards.append(0.0)
    # Compute mean reward for the last 100 episodes
    mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
    print("Mean reward:", mean_100ep_reward,
          "Num episodes:", len(episode_rewards))
    return mean_100ep_reward


def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    if _locals["done"]:
        print(_locals["step"], np.mean(
            _locals["epoch_episode_rewards"]), _locals["episode_reward"])
    return True


if __name__ == '__main__':
    # Environment
    env = gym.make('Pendulum-v0')
    env = DummyVecEnv([lambda: env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    # Create model
    model = DDPG(MlpPolicy, env, verbose=1,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 normalize_observations=True,
                 normalize_returns=True)

    # Evaluation before training
    print("Before training")
    mean_reward_before_train = evaluate(model, num_steps=10000)

    # Train and save
    model.learn(total_timesteps=500000, callback=callback)
    model.save(SAVED_MODEL)
    del model

    # Evaluation after training
    print("After training")
    model = DDPG.load(SAVED_MODEL)
    evaluate(model, num_steps=10000)
This outputs:
Before training
Mean reward: -1160.2 Num episodes: 51
After training
Mean reward: -1413.1 Num episodes: 51
By comparison, the mean reward after training is -200 if I use normalize_observations=False and normalize_returns=False.
I fixed it by inserting the following in the setup_model method:
self.obs_rms_params = [var for var in tf.global_variables() if "obs_rms" in var.name]
self.ret_rms_params = [var for var in tf.global_variables() if "ret_rms" in var.name]
Then I changed the saving method and the loading method so that these parameters are also written to the saved file and restored when loading.
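I have not reproduced my exact save/load diff here. As a minimal sketch of the idea (assuming the internal _save_to_file helper and the self.params / self.sess attributes used by stable-baselines, so the names are approximations rather than the actual patch), the change amounts to appending the rms variables to the parameter list that gets serialized and later assigned back:

# Sketch only: extend the list of variables that gets saved and restored so
# the normalization statistics travel with the model.
params_to_save = self.params + self.obs_rms_params + self.ret_rms_params

# In save(): evaluate every variable and hand the values to the serializer
# together with the hyperparameter dict.
params = self.sess.run(params_to_save)
self._save_to_file(save_path, data=data, params=params)

# In load(): after setup_model() has rebuilt the graph, assign the stored
# values back to the same, extended list of variables.
restores = [var.assign(value) for var, value in zip(params_to_save, loaded_params)]
self.sess.run(restores)

With the rms variables included, the observation and return statistics after loading match the ones used during training, so the policy and critic see the same normalized inputs as before saving.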