DLR-RM / stable-baselines3

PyTorch version of Stable Baselines, reliable implementations of reinforcement learning algorithms.
https://stable-baselines3.readthedocs.io
MIT License

[TO INVESTIGATE] Potential Performance Drop after loading SAC/TQC model #435

Closed · araffin closed this issue 3 years ago

araffin commented 3 years ago

I could not reproduce the bug on a simple env like Pendulum; I will try later with a harder env like BipedalWalkerHardcore.

Steps to reproduce (using the latest SB3 version and the latest RL Zoo version); a minimal sketch follows the list:

  1. Train model
  2. Save model and replay buffer
  3. Load model and replay buffer
  4. Continue training
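
A minimal sketch of that workflow (checkpoint names and step counts are placeholders, not the exact RL Zoo setup):

import gym

from stable_baselines3 import SAC

# 1. Train
model = SAC("MlpPolicy", "Pendulum-v0", verbose=1)
model.learn(total_timesteps=5000)

# 2. Save model and replay buffer
model.save("sac_checkpoint")
model.save_replay_buffer("sac_checkpoint_buffer")

# 3. Load model and replay buffer
model = SAC.load("sac_checkpoint", env=gym.make("Pendulum-v0"))
model.load_replay_buffer("sac_checkpoint_buffer")

# 4. Continue training (this is where the apparent performance drop shows up)
model.learn(total_timesteps=5000, reset_num_timesteps=False)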

I observed a performance drop after the first gradient updates. The good news is that it recovers the correct performance (more or less quickly) afterwards. I suspect we have an issue with the optimizer state (similar to https://github.com/DLR-RM/stable-baselines3/issues/391), where we load the optimizer but it is somehow no longer linked to the correct variables after loading.
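
One rough way to probe that suspicion (assuming a SAC model saved to a hypothetical "sac_checkpoint" path) is to check whether the tensors referenced by the loaded actor optimizer are the very same objects as the live actor parameters:

import gym

from stable_baselines3 import SAC

# Hypothetical checkpoint produced by the steps above
model = SAC.load("sac_checkpoint", env=gym.make("Pendulum-v0"))

# Tensors the actor optimizer will actually update
opt_params = {id(p) for group in model.actor.optimizer.param_groups for p in group["params"]}
# Tensors of the live actor network
actor_params = {id(p) for p in model.actor.parameters()}

# If the optimizer state were detached from the live parameters after loading,
# the two sets would not match
print("actor optimizer linked to actor params:", opt_params == actor_params)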

Related: https://github.com/DLR-RM/stable-baselines3/issues/29 and https://github.com/DLR-RM/stable-baselines3/issues/51

Additional context:

To try:

araffin commented 3 years ago

After further investigation, loading seems to be fine (even though some details like proper seeding are needed to get equivalent runs).

import gym
import torch as th
import numpy as np

from sb3_contrib import TQC
from stable_baselines3 import SAC, TD3
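# NOTE: TQC and TD3 are imported so that MODEL_CLASS below can be swapped easily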

th.set_num_threads(2)  # faster run on cpu
SEED_AFTER_LOADING = 42
N_STEPS_BEFORE_SAVING = 5000
N_STEPS_AFTER_SAVING = 5000
MODEL_CLASS = SAC

def create_model():
    model = MODEL_CLASS(
        "MlpPolicy",
        "Pendulum-v0",
        buffer_size=20000,
        train_freq=1,
        seed=1,
        verbose=1,
        learning_rate=1e-3,
        policy_kwargs=dict(net_arch=[64, 64]),
    )
    return model

def create_env():
    env = gym.make("Pendulum-v0")
    return env

# Pendulum has a time limit of 200 steps
env = create_env()
obs = env.reset()

#### TEST 1: model.learn(800) equivalent to two model.learn(400) ####

model = create_model()
model.learn(total_timesteps=400)
model.learn(total_timesteps=400, reset_num_timesteps=False)
action_1, _ = model.predict(obs, deterministic=True)

model = create_model()
model.learn(total_timesteps=800)
action_2, _ = model.predict(obs, deterministic=True)

print(action_1, action_2)
assert np.allclose(action_1, action_2)

#### END TEST 1 ####

#### TEST 2: two model.learn() equivalent        ####
#### to model.learn() and then continue training ####

model1 = create_model()
model1.learn(total_timesteps=N_STEPS_BEFORE_SAVING)
# Save current state of the model
model1.save("sac_pendulum")
model1.save_replay_buffer("sac_pendulum_buffer")

# Seed as it is done when loading a model
model1.set_random_seed(SEED_AFTER_LOADING)
model1.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)
action_3, _ = model1.predict(obs, deterministic=True)

# Differs from TEST 1 because of reset_num_timesteps=True
# and the re-seeding
# print(action_1, action_3)

# Load SAC model and continue training
model = MODEL_CLASS.load("sac_pendulum", env=create_env())
model.load_replay_buffer("sac_pendulum_buffer")

# Check that loading the replay buffer worked
assert np.allclose(
    model1.replay_buffer.observations[:10, 0, 0],
    model.replay_buffer.observations[:10, 0, 0],
)

# NOTE: load(env=...) is different from load() and then set_env()
# because of setting the random seed
# model.set_env(create_env())
model.set_random_seed(SEED_AFTER_LOADING)
model.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)

action_4, _ = model.predict(obs, deterministic=True)

print(action_3, action_4)

# Check that the two runs are the same (same action leads to same observation)
assert np.allclose(model1.replay_buffer.observations, model.replay_buffer.observations)
assert np.allclose(action_3, action_4)

#### END TEST 2 ####
araffin commented 3 years ago

For BipedalWalkerHardcore-v3

import gym
import torch as th
import numpy as np

from sb3_contrib import TQC
from stable_baselines3 import SAC, TD3

th.set_num_threads(2)  # faster run on cpu
SEED_AFTER_LOADING = 42
N_STEPS_BEFORE_SAVING = 2500
N_STEPS_AFTER_SAVING = 500
MODEL_CLASS = SAC
ENV_ID = "BipedalWalkerHardcore-v3"

def create_model():
    model = MODEL_CLASS(
        "MlpPolicy",
        ENV_ID,
        buffer_size=20000,
        train_freq=1,
        seed=1,
        verbose=1,
        learning_rate=1e-3,
        policy_kwargs=dict(net_arch=[64, 64]),
    )
    return model

def create_env():
    env = gym.make(ENV_ID)
    return env

# Create an env instance to get a reference observation
env = create_env()
obs = env.reset()

#### TEST 1: model.learn(800) equivalent to two model.learn(400) ####

# model = create_model()
# model.learn(total_timesteps=400)
# model.learn(total_timesteps=400, reset_num_timesteps=False)
# action_1, _ = model.predict(obs, deterministic=True)

# model = create_model()
# model.learn(total_timesteps=800)
# action_2, _ = model.predict(obs, deterministic=True)

# print(action_1, action_2)
# assert np.allclose(action_1, action_2)

#### END TEST 1 ####

#### TEST 2: two model.learn() equivalent        ####
#### to model.learn() and then continue training ####

model1 = create_model()
model1.learn(total_timesteps=N_STEPS_BEFORE_SAVING)
# Save current state of the model
model1.save("sac_pendulum")
model1.save_replay_buffer("sac_pendulum_buffer")

# Seed as it is done when loading a model
model1.set_env(create_env())
model1.set_random_seed(SEED_AFTER_LOADING)
model1.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)
action_3, _ = model1.predict(obs, deterministic=True)

# Differs from TEST 1 because of reset_num_timesteps=True
# and the re-seeding
# print(action_1, action_3)

# Load SAC model and continue training
model = MODEL_CLASS.load("sac_pendulum", env=create_env())
model.load_replay_buffer("sac_pendulum_buffer")

# Check that loading the replay buffer worked
assert np.allclose(
    model1.replay_buffer.observations[:10, 0, 0],
    model.replay_buffer.observations[:10, 0, 0],
)

# NOTE: load(env=...) is different from load() and then set_env()
# because of setting the random seed
# model.set_env(create_env())
model.set_random_seed(SEED_AFTER_LOADING)
model.learn(total_timesteps=N_STEPS_AFTER_SAVING, reset_num_timesteps=True)

action_4, _ = model.predict(obs, deterministic=True)

print(action_3, action_4)

# Check that the two runs are the same (same action leads to same observation)
assert np.allclose(model1.replay_buffer.observations, model.replay_buffer.observations)
assert np.allclose(action_3, action_4)

#### END TEST 2 ####