araffin closed this issue 3 years ago
After further investigation, loading seems to be fine (even though some details, such as proper seeding, are needed to obtain equivalent runs).
import gym
import torch as th
import numpy as np
from sb3_contrib import TQC
from stable_baselines3 import SAC, TD3
th.set_num_threads(2) # faster run on cpu
SEED_AFTER_LOADING = 42
N_STEPS_BEFORE_SAVING = 5000
N_STEPS_AFTER_SAVING = 5000
MODEL_CLASS = SAC
def create_model():
    model = MODEL_CLASS(
        "MlpPolicy",
        "Pendulum-v0",
        buffer_size=20000,
        train_freq=1,
        seed=1,
        verbose=1,
        learning_rate=1e-3,
        policy_kwargs=dict(net_arch=[64, 64]),
    )
    return model

def create_env():
    env = gym.make("Pendulum-v0")
    return env
# Pendulum has a timelimit of 200 steps
env = create_env()
obs = env.reset()
#### TEST 1: model.learn(800) equivalent to two model.learn(400) ####
model = create_model()
model.learn(total_timesteps=400)
model.learn(total_timesteps=400, reset_num_timesteps=False)
action_1, _ = model.predict(obs, deterministic=True)
model = create_model()
model.learn(total_timesteps=800)
action_2, _ = model.predict(obs, deterministic=True)
print(action_1, action_2)
assert np.allclose(action_1, action_2)
#### END TEST 1 ####
#### TEST 2: two model.learn() equivalent ####
#### to model.learn() and then continue training ####
model1 = create_model()
model1.learn(total_timesteps=N_STEPS_BEFORE_SAVING)
# Save current state of the model
model1.save("sac_pendulum")
model1.save_replay_buffer("sac_pendulum_buffer")
# Seed as it is done when loading a model
model1.set_random_seed(SEED_AFTER_LOADING)
model1.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)
action_3, _ = model1.predict(obs, deterministic=True)
# Different from TEST 1 because of reset_num_timesteps=True
# and the seeding
# print(action_1, action_3)
# Load SAC model and continue training
model = MODEL_CLASS.load("sac_pendulum", env=create_env())
model.load_replay_buffer("sac_pendulum_buffer")
# Check that loading the replay buffer worked
assert np.allclose(
    model1.replay_buffer.observations[:10, 0, 0],
    model.replay_buffer.observations[:10, 0, 0],
)
# NOTE: load(env=...) is different from load() and then set_env()
# because of setting the random seed
# model.set_env(create_env())
model.set_random_seed(SEED_AFTER_LOADING)
model.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)
action_4, _ = model.predict(obs, deterministic=True)
print(action_3, action_4)
# Check that the two runs are the same (same action leads to same observation)
assert np.allclose(model1.replay_buffer.observations, model.replay_buffer.observations)
assert np.allclose(action_3, action_4)
#### END TEST 2 ####
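For reference, here is a minimal sketch of the alternative load path mentioned in the NOTE above (load() without an env, then set_env() plus explicit seeding). It reuses the helpers and constants from the script above; whether action_5 ends up matching action_4 depends on when the env gets seeded relative to loading:

# Sketch only: alternative to load(env=...), i.e. load() then set_env() + explicit seeding
model_alt = MODEL_CLASS.load("sac_pendulum")
model_alt.load_replay_buffer("sac_pendulum_buffer")
model_alt.set_env(create_env())
# The env was not passed to load(), so seed everything explicitly here
model_alt.set_random_seed(SEED_AFTER_LOADING)
model_alt.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)
action_5, _ = model_alt.predict(obs, deterministic=True)
# May or may not match action_4, depending on when the env gets seeded
print(action_4, action_5)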
For BipedalWalkerHardcore-v3:
import gym
import torch as th
import numpy as np
from sb3_contrib import TQC
from stable_baselines3 import SAC, TD3
th.set_num_threads(2) # faster run on cpu
SEED_AFTER_LOADING = 42
N_STEPS_BEFORE_SAVING = 2500
N_STEPS_AFTER_SAVING = 500
MODEL_CLASS = SAC
ENV_ID = "BipedalWalkerHardcore-v3"
def create_model():
    model = MODEL_CLASS(
        "MlpPolicy",
        ENV_ID,
        buffer_size=20000,
        train_freq=1,
        seed=1,
        verbose=1,
        learning_rate=1e-3,
        policy_kwargs=dict(net_arch=[64, 64]),
    )
    return model

def create_env():
    env = gym.make(ENV_ID)
    return env
# BipedalWalkerHardcore-v3 has a time limit of 2000 steps
env = create_env()
obs = env.reset()
#### TEST 1: model.learn(800) equivalent to two model.learn(400) ####
# model = create_model()
# model.learn(total_timesteps=400)
# model.learn(total_timesteps=400, reset_num_timesteps=False)
# action_1, _ = model.predict(obs, deterministic=True)
# model = create_model()
# model.learn(total_timesteps=800)
# action_2, _ = model.predict(obs, deterministic=True)
# print(action_1, action_2)
# assert np.allclose(action_1, action_2)
#### END TEST 1 ####
#### TEST 2: two model.learn() equivalent ####
#### to model.learn() and then continue training ####
model1 = create_model()
model1.learn(total_timesteps=N_STEPS_BEFORE_SAVING)
# Save current state of the model
model1.save("sac_pendulum")
model1.save_replay_buffer("sac_pendulum_buffer")
# Seed as it is done when loading a model
model1.set_env(create_env())
model1.set_random_seed(SEED_AFTER_LOADING)
model1.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)
action_3, _ = model1.predict(obs, deterministic=True)
# Different from TEST 1 because of reset_num_timesteps=True
# and the seeding
# print(action_1, action_3)
# Load SAC model and continue training
model = MODEL_CLASS.load("sac_pendulum", env=create_env())
model.load_replay_buffer("sac_pendulum_buffer")
# Check that loading the replay buffer worked
assert np.allclose(
    model1.replay_buffer.observations[:10, 0, 0],
    model.replay_buffer.observations[:10, 0, 0],
)
# NOTE: load(env=...) is different from load() and then set_env()
# because of setting the random seed
# model.set_env(create_env())
model.set_random_seed(SEED_AFTER_LOADING)
model.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)
action_4, _ = model.predict(obs, deterministic=True)
print(action_3, action_4)
# Check that the two runs are the same (same action leads to same observation)
assert np.allclose(model1.replay_buffer.observations, model.replay_buffer.observations)
assert np.allclose(action_3, action_4)
#### END TEST 2 ####
I could not reproduce the bug on a simple env like Pendulum; I will try later with a harder env like BipedalWalkerHardcore.
Steps to reproduce (using the latest SB3 version and the latest RL Zoo version):
I observed a performance drop after the first gradient updates. The good news is that it recovers the correct performance (more or less quickly) afterwards. I suspect we have an issue with the optimizer state (similar to https://github.com/DLR-RM/stable-baselines3/issues/391), where the optimizer state is loaded but is somehow no longer linked to the correct variables after loading.
Related: https://github.com/DLR-RM/stable-baselines3/issues/29 and https://github.com/DLR-RM/stable-baselines3/issues/51
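One quick way to probe that hypothesis is to check whether the tensors referenced by the loaded optimizers are the very same objects as the live network parameters. A minimal sketch, assuming SB3's SAC exposes the optimizers as model.actor.optimizer and model.critic.optimizer (attribute names may differ between versions):

from stable_baselines3 import SAC

def optimizer_params_linked(module, optimizer):
    # True if every tensor the optimizer updates is one of the module's live parameters
    module_param_ids = {id(p) for p in module.parameters()}
    optim_param_ids = {id(p) for group in optimizer.param_groups for p in group["params"]}
    return optim_param_ids <= module_param_ids

# "sac_pendulum" is the checkpoint saved by the scripts above
model = SAC.load("sac_pendulum")
print(optimizer_params_linked(model.actor, model.actor.optimizer))
print(optimizer_params_linked(model.critic, model.critic.optimizer))

If the subset check fails after loading, the optimizer would be stepping stale copies of the parameters, which would be consistent with the performance drop described above.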
Additional context:
train_freq = (1, "episode") (so the difference after each gradient update is more noticeable)
To try: