DLR-RM / rl-baselines3-zoo

A training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.
https://rl-baselines3-zoo.readthedocs.io
MIT License

ValueError: The parameter loc has invalid values #156

Closed (hh0rva1h closed this 3 years ago)

hh0rva1h commented 3 years ago

Describe the bug
I took the script from https://github.com/optuna/optuna-examples/blob/main/rl/sb3_simple.py as a minimal hyperparameter-tuning script and adapted it for SAC by using the corresponding sampling function from https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/utils/hyperparams_opt.py. However, I cannot get it to run successfully: it fails with the following error, which I simply can't figure out:

[W 2021-08-31 18:03:34,301] Trial 2 failed because of the following error: ValueError('The parameter loc has invalid values')
Traceback (most recent call last):
  File "/home/rl/.local/lib/python3.8/site-packages/optuna/_optimize.py", line 216, in _run_trial
    value_or_values = func(trial)
  File "sac-demo.py", line 138, in objective
    model.learn(N_TIMESTEPS, callback=eval_callback)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/sac/sac.py", line 289, in learn
    return super(SAC, self).learn(
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/off_policy_algorithm.py", line 369, in learn
    self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/sac/sac.py", line 203, in train
    actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/sac/policies.py", line 190, in action_log_prob
    return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/distributions.py", line 244, in log_prob_from_params
    action = self.actions_from_params(mean_actions, log_std)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/distributions.py", line 178, in actions_from_params
    self.proba_distribution(mean_actions, log_std)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/distributions.py", line 210, in proba_distribution
    super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/distributions.py", line 152, in proba_distribution
    self.distribution = Normal(mean_actions, action_std)
  File "/home/rl/.local/lib/python3.8/site-packages/torch/distributions/normal.py", line 50, in __init__
    super(Normal, self).__init__(batch_shape, validate_args=validate_args)
  File "/home/rl/.local/lib/python3.8/site-packages/torch/distributions/distribution.py", line 53, in __init__
    raise ValueError("The parameter {} has invalid values".format(param))
ValueError: The parameter loc has invalid values
Traceback (most recent call last):
  File "sac-demo.py", line 170, in <module>
    study.optimize(objective, n_trials=N_TRIALS-len(df)+1) # timeout=600
  File "/home/rl/.local/lib/python3.8/site-packages/optuna/study.py", line 401, in optimize
    _optimize(
  File "/home/rl/.local/lib/python3.8/site-packages/optuna/_optimize.py", line 65, in _optimize
    _optimize_sequential(
  File "/home/rl/.local/lib/python3.8/site-packages/optuna/_optimize.py", line 162, in _optimize_sequential
    trial = _run_trial(study, func, catch)
  File "/home/rl/.local/lib/python3.8/site-packages/optuna/_optimize.py", line 267, in _run_trial
    raise func_err
  File "/home/rl/.local/lib/python3.8/site-packages/optuna/_optimize.py", line 216, in _run_trial
    value_or_values = func(trial)
  File "sac-demo.py", line 138, in objective
    model.learn(N_TIMESTEPS, callback=eval_callback)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/sac/sac.py", line 289, in learn
    return super(SAC, self).learn(
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/off_policy_algorithm.py", line 369, in learn
    self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/sac/sac.py", line 203, in train
    actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/sac/policies.py", line 190, in action_log_prob
    return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/distributions.py", line 244, in log_prob_from_params
    action = self.actions_from_params(mean_actions, log_std)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/distributions.py", line 178, in actions_from_params
    self.proba_distribution(mean_actions, log_std)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/distributions.py", line 210, in proba_distribution
    super(SquashedDiagGaussianDistribution, self).proba_distribution(mean_actions, log_std)
  File "/home/rl/.local/lib/python3.8/site-packages/stable_baselines3/common/distributions.py", line 152, in proba_distribution
    self.distribution = Normal(mean_actions, action_std)
  File "/home/rl/.local/lib/python3.8/site-packages/torch/distributions/normal.py", line 50, in __init__
    super(Normal, self).__init__(batch_shape, validate_args=validate_args)
  File "/home/rl/.local/lib/python3.8/site-packages/torch/distributions/distribution.py", line 53, in __init__
    raise ValueError("The parameter {} has invalid values".format(param))
ValueError: The parameter loc has invalid values

I would be very glad if someone could help me figure out what's going wrong here.

Code example

# modified from https://github.com/optuna/optuna-examples/blob/main/rl/sb3_simple.py

from typing import Any
from typing import Dict

import gym
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback
import torch
import torch.nn as nn

N_TRIALS = 50 #100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 5
N_TIMESTEPS = int(5e4)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 1

ENV_ID = "Pendulum-v0"

# modified from https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/utils/hyperparams_opt.py
def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for SAC hyperparams.
    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e3), int(1e4), int(1e5), int(1e6)])
    learning_starts = trial.suggest_categorical("learning_starts", [0, 1000, 10000, 20000])
    # train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 300])
    train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])
    # gradient_steps takes too much time
    # gradient_steps = trial.suggest_categorical('gradient_steps', [1, 100, 300])
    gradient_steps = train_freq
    # ent_coef = trial.suggest_categorical('ent_coef', ['auto', 0.5, 0.1, 0.05, 0.01, 0.0001])
    ent_coef = "auto"
    # You can comment that out when not using gSDE
    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    # NOTE: Add "verybig" to net_arch when tuning HER
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])

    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
        # Uncomment for tuning HER
        # "large": [256, 256, 256],
        # "verybig": [512, 512, 512],
    }[net_arch]

    target_entropy = "auto"
    # if ent_coef == 'auto':
    #     # target_entropy = trial.suggest_categorical('target_entropy', ['auto', 5, 1, 0, -1, -5, -10, -20, -50])
    #     target_entropy = trial.suggest_uniform('target_entropy', -10, 10)

    hyperparams = {
        "gamma": gamma,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "learning_starts": learning_starts,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "ent_coef": ent_coef,
        "tau": tau,
        "target_entropy": target_entropy,
        "policy_kwargs": dict(log_std_init=log_std_init, net_arch=net_arch),
    }

    return hyperparams

class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):

        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if needed
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True

def objective(trial: optuna.Trial) -> float:

    # Sample hyperparameters
    kwargs = sample_sac_params(trial)
    # Create the RL model
    env = gym.make(ENV_ID)

    model = SAC("MlpPolicy", env, **kwargs)
    # Create env used for evaluation
    eval_env = gym.make(ENV_ID)
    # Create the callback that will periodically evaluate
    # and report the performance
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(
        sampler=sampler,
        pruner=pruner,
        direction="maximize",
        study_name=f"SAC-{ENV_ID}",
        storage=f"sqlite:///SAC-{ENV_ID}.db",
        load_if_exists=True,
    )
    df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
    print(df)
    try:
        study.optimize(objective, n_trials=N_TRIALS - len(df) + 1)  # timeout=600
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))

System Info
Describe the characteristics of your environment:
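
A quick way to collect these details is to print the versions of the libraries used in the script above (a minimal sketch; the package names are taken from the imports in the code example, nothing else is assumed):

import gym
import optuna
import torch
import stable_baselines3

# Print the versions of the libraries involved in the stack trace above
print("gym:", gym.__version__)
print("optuna:", optuna.__version__)
print("torch:", torch.__version__)
print("stable-baselines3:", stable_baselines3.__version__)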

araffin commented 3 years ago

Hello, this usually happens when the learning rate is too high and generates NaN values. The best option for you would be to catch the error as we do in the zoo: https://github.com/DLR-RM/rl-baselines3-zoo/blob/6cac9487f17dbd00568693366615d510a320d4e7/utils/exp_manager.py#L637-L638
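
For illustration, here is a minimal sketch of how objective() from the script above could be adapted along those lines. It assumes, based on the linked exp_manager.py, that the key change is catching ValueError in addition to AssertionError; the zoo prunes such trials, while this sketch keeps the script's existing convention of reporting NaN:

def objective(trial: optuna.Trial) -> float:
    # Sample hyperparameters and build the model, as in the script above
    kwargs = sample_sac_params(trial)
    env = gym.make(ENV_ID)
    model = SAC("MlpPolicy", env, **kwargs)
    # Create env and callback used for periodic evaluation
    eval_env = gym.make(ENV_ID)
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except (AssertionError, ValueError) as e:
        # Random hyperparams (e.g. a very high learning rate) can produce NaNs,
        # which surface as "The parameter loc has invalid values"
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward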

hh0rva1h commented 3 years ago

Awesome, thank you very much!