In DDPG's intermittent training print out, the rollout/episodes and total/episodes values are shown in scientific notation, which is harder to read at a glance even though the numbers aren't long enough to need it. The scientific notation actually makes the values longer than they would otherwise be, while a larger number such as total/steps is printed as a plain integer. None of the other RL methods' logs do this. I know this is minor, but it's annoying enough to be worth fixing. Can someone please look into it?
Code to reproduce:
import gym
import numpy as np
import warnings
import os

from stable_baselines.results_plotter import load_results, ts2xy

best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward
    # Print stats every 1000 calls
    if (n_steps + 1) % 1000 == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            # print(x[-1], 'timesteps')
            # print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True

log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)

from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
from stable_baselines import DDPG
from stable_baselines.bench import Monitor

env = gym.make('MountainCarContinuous-v0')
env.seed(42)
# Monitor writes the episode stats that load_results() reads in the callback
env = Monitor(env, log_dir, allow_early_resets=True)

n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise, tensorboard_log='./ddpg_tensorboard')
model.learn(total_timesteps=400000, callback=callback)

# Render the trained agent; as written the loop runs until interrupted, so env.close() is never reached
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()
I think this display may come from this line, where the number of episodes becomes a float.
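If that's the cause, here is a minimal sketch of the suspected mechanism, assuming the human-readable logger formats float values with a '%-8.3g'-style format and passes everything else through str(); format_value below is a hypothetical stand-in for that logic, not the actual stable-baselines code:

def format_value(val):
    # Hypothetical stand-in for the logger's formatting: floats get a
    # 3-significant-digit '%g' format (which switches to scientific
    # notation at 1000 and above), everything else is str()'d as-is.
    if isinstance(val, float):
        return '%-8.3g' % val
    return str(val)

print(format_value(1840.0))       # '1.84e+03' -> a float episode count, as in the DDPG table
print(format_value(1840))         # '1840'     -> an int counter, as in the other algorithms
print(format_value(int(1840.0)))  # '1840'     -> casting to int before logging avoids it

So a fix along these lines would presumably just keep the episode counter as an int (or cast it with int(...)) before it is handed to the logger.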
Feel free to submit a PR if you think this is useful. I consider this really minor and would recommend using SAC or TD3 anyway.
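For reference, a minimal sketch of what the suggested switch to SAC could look like for the reproduce script above; this assumes the stable-baselines SAC class and its MlpPolicy, reuses the env and callback defined earlier, and the tensorboard directory name is arbitrary:

from stable_baselines import SAC
from stable_baselines.sac.policies import MlpPolicy as SacMlpPolicy

# Reuses the Monitor-wrapped MountainCarContinuous env and the callback from
# the reproduce script; SAC's stochastic policy handles exploration on its
# own, so no action noise object is passed here.
model = SAC(SacMlpPolicy, env, verbose=1, tensorboard_log='./sac_tensorboard')
model.learn(total_timesteps=400000, callback=callback)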