DLR-RM / stable-baselines3

PyTorch version of Stable Baselines, reliable implementations of reinforcement learning algorithms.
https://stable-baselines3.readthedocs.io
MIT License

[TO INVESTIGATE] Potential Performance Drop after loading SAC/TQC model #435

Closed: araffin closed this issue 3 years ago

araffin commented 3 years ago

I could not reproduce the bug on a simple env like Pendulum; I will try later with a harder env like BipedalWalkerHardcore.

Steps to reproduce (using the latest SB3 version and the latest RL Zoo version; see the minimal sketch after this list):

  1. Train model
  2. Save model and replay buffer
  3. Load model and replay buffer
  4. Continue training
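
For reference, a minimal sketch of these four steps using only the public SB3 save/load API (env, filenames, and step counts are illustrative placeholders, not the exact RL Zoo setup):

import gym
from stable_baselines3 import SAC

# 1. Train
model = SAC("MlpPolicy", "Pendulum-v0", verbose=1)
model.learn(total_timesteps=5000)
# 2. Save model and replay buffer
model.save("sac_pendulum")
model.save_replay_buffer("sac_pendulum_buffer")
# 3. Load model and replay buffer
model = SAC.load("sac_pendulum", env=gym.make("Pendulum-v0"))
model.load_replay_buffer("sac_pendulum_buffer")
# 4. Continue training
model.learn(total_timesteps=5000, reset_num_timesteps=False)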

I observed a performance drop after the first gradient updates. The good news is that it recovers the correct performance (more or less quickly) afterwards. I suspect an issue with the optimizer state (similar to https://github.com/DLR-RM/stable-baselines3/issues/391): we load the optimizer, but its state is somehow no longer linked to the correct parameters after loading.

Related: https://github.com/DLR-RM/stable-baselines3/issues/29 and https://github.com/DLR-RM/stable-baselines3/issues/51
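
A quick way to sanity-check that suspicion (a hedged sketch, assuming the SB3 SAC/TQC layout where the actor owns its optimizer as model.actor.optimizer):

from stable_baselines3 import SAC

# Load a previously saved model and verify that the tensors tracked by the
# actor optimizer are the actor's current parameters (not stale copies).
model = SAC.load("sac_pendulum")
actor_param_ids = {id(p) for p in model.actor.parameters()}
optimizer_param_ids = {
    id(p) for group in model.actor.optimizer.param_groups for p in group["params"]
}
assert optimizer_param_ids <= actor_param_ids, "optimizer is tracking stale tensors"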

Additional context:

To try:

araffin commented 3 years ago

After further investigation, loading seems to be fine (even though some details like proper seeding are needed to get equivalent runs).

import gym
import torch as th
import numpy as np

from sb3_contrib import TQC
from stable_baselines3 import SAC, TD3

th.set_num_threads(2)  # faster run on cpu
SEED_AFTER_LOADING = 42
N_STEPS_BEFORE_SAVING = 5000
N_STEPS_AFTER_SAVING = 5000
MODEL_CLASS = SAC

def create_model():
    model = MODEL_CLASS(
        "MlpPolicy",
        "Pendulum-v0",
        buffer_size=20000,
        train_freq=1,
        seed=1,
        verbose=1,
        learning_rate=1e-3,
        policy_kwargs=dict(net_arch=[64, 64]),
    )
    return model

def create_env():
    env = gym.make("Pendulum-v0")
    return env

# Pendulum has a timelimit of 200 steps
env = create_env()
obs = env.reset()

#### TEST 1: model.learn(800) equivalent to two model.learn(400) ####

model = create_model()
model.learn(total_timesteps=400)
model.learn(total_timesteps=400, reset_num_timesteps=False)
action_1, _ = model.predict(obs, deterministic=True)

model = create_model()
model.learn(total_timesteps=800)
action_2, _ = model.predict(obs, deterministic=True)

print(action_1, action_2)
assert np.allclose(action_1, action_2)

#### END TEST 1 ####

#### TEST 2: two model.learn() equivalent        ####
#### to model.learn() and then continue training ####

model1 = create_model()
model1.learn(total_timesteps=N_STEPS_BEFORE_SAVING)
# Save current state of the model
model1.save("sac_pendulum")
model1.save_replay_buffer("sac_pendulum_buffer")

# Seed as it is done when loading a model
model1.set_random_seed(SEED_AFTER_LOADING)
model1.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)
action_3, _ = model1.predict(obs, deterministic=True)

# Different to TEST 1 because of reset_num_timesteps=True
# and of the seeding
# print(action_1, action_3)

# Load SAC model and continue training
model = MODEL_CLASS.load("sac_pendulum", env=create_env())
model.load_replay_buffer("sac_pendulum_buffer")

# Check that loading the replay buffer worked
assert np.allclose(
    model1.replay_buffer.observations[:10, 0, 0],
    model.replay_buffer.observations[:10, 0, 0],
)

# NOTE: load(env=...) is different from load() and then set_env()
# because of setting the random seed
# model.set_env(create_env())
model.set_random_seed(SEED_AFTER_LOADING)
model.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)

action_4, _ = model.predict(obs, deterministic=True)

print(action_3, action_4)

# Check that the two runs are the same (same action leads to same observation)
assert np.allclose(model1.replay_buffer.observations, model.replay_buffer.observations)
assert np.allclose(action_3, action_4)

#### END TEST 2 ####

araffin commented 3 years ago

The same script, adapted for BipedalWalkerHardcore-v3:

import gym
import torch as th
import numpy as np

from sb3_contrib import TQC
from stable_baselines3 import SAC, TD3

th.set_num_threads(2)  # faster run on cpu
SEED_AFTER_LOADING = 42
N_STEPS_BEFORE_SAVING = 2500
N_STEPS_AFTER_SAVING = 500
MODEL_CLASS = SAC
ENV_ID = "BipedalWalkerHardcore-v3"

def create_model():
    model = MODEL_CLASS(
        "MlpPolicy",
        ENV_ID,
        buffer_size=20000,
        train_freq=1,
        seed=1,
        verbose=1,
        learning_rate=1e-3,
        policy_kwargs=dict(net_arch=[64, 64]),
    )
    return model

def create_env():
    env = gym.make(ENV_ID)
    return env

# BipedalWalkerHardcore-v3 has a time limit of 2000 steps
env = create_env()
obs = env.reset()

#### TEST 1: model.learn(800) equivalent to two model.learn(400) ####

# model = create_model()
# model.learn(total_timesteps=400)
# model.learn(total_timesteps=400, reset_num_timesteps=False)
# action_1, _ = model.predict(obs, deterministic=True)

# model = create_model()
# model.learn(total_timesteps=800)
# action_2, _ = model.predict(obs, deterministic=True)

# print(action_1, action_2)
# assert np.allclose(action_1, action_2)

#### END TEST 1 ####

#### TEST 2: two model.learn() equivalent        ####
#### to model.learn() and then continue training ####

model1 = create_model()
model1.learn(total_timesteps=N_STEPS_BEFORE_SAVING)
# Save current state of the model
model1.save("sac_bipedal")
model1.save_replay_buffer("sac_bipedal_buffer")

# Seed as it is done when loading a model
model1.set_env(create_env())
model1.set_random_seed(SEED_AFTER_LOADING)
model1.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)
action_3, _ = model1.predict(obs, deterministic=True)

# Different to TEST 1 because of reset_num_timesteps=True
# and of the seeding
# print(action_1, action_3)

# Load SAC model and continue training
model = MODEL_CLASS.load("sac_bipedal", env=create_env())
model.load_replay_buffer("sac_bipedal_buffer")

# Check that loading the replay buffer worked
assert np.allclose(
    model1.replay_buffer.observations[:10, 0, 0],
    model.replay_buffer.observations[:10, 0, 0],
)

# NOTE: load(env=...) is different from load() and then set_env()
# because of setting the random seed
# model.set_env(create_env())
model.set_random_seed(SEED_AFTER_LOADING)
model.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)

action_4, _ = model.predict(obs, deterministic=True)

print(action_3, action_4)

# Check that the two runs are the same (same action leads to same observation)
assert np.allclose(model1.replay_buffer.observations, model.replay_buffer.observations)
assert np.allclose(action_3, action_4)

#### END TEST 2 ####