Closed: janblumenkamp closed this issue 2 years ago.
It seems to work in Ray 0.8.3
Unfortunately, the problem also occurs in older versions; it just takes longer to show up. I tried different algorithms and both Torch and TensorFlow. My most recent test:
import time
import numpy as np
import yaml
import ray
from ray import tune
from ray.tune.registry import register_env
import gym
from gym.spaces import Box


class DemoEnv2(gym.Env):
    def __init__(self):
        self.action_space = Box(-np.inf, np.inf, shape=(1,), dtype=np.float64)
        self.observation_space = Box(-np.inf, np.inf, shape=(1,), dtype=np.float64)
        self.reset()

    def reset(self):
        self.pos = np.array([0.0], dtype=np.float64)
        self.goal = np.random.uniform(-3, 3, 1)
        self.cnt_timesteps_goal_reached = 0
        return self.step([0.0])[0]

    def step(self, action):
        assert(not np.isnan(action))
        self.pos += action
        reward = 0
        dist_to_goal = self.pos - self.goal
        if dist_to_goal < 0.5:
            self.cnt_timesteps_goal_reached += 1
            reward = 5
        else:
            self.cnt_timesteps_goal_reached = 0
        done = dist_to_goal > 10 or self.cnt_timesteps_goal_reached > 30
        state = self.pos - self.goal
        assert(not np.isnan(state) and not np.isnan(reward))
        return state, reward, done, {}


if __name__ == '__main__':
    register_env("demo", lambda _: DemoEnv2())
    ray.init()

    def train_a3c(use_pytorch):
        tune.run("A3C", config={
            "use_pytorch": use_pytorch,
            "num_workers": 8,
            "rollout_fragment_length": 20,
            "vf_loss_coeff": 0.5,
            "entropy_coeff": 0.01,
            "gamma": 0.99,
            "grad_clip": 40.0,
            "lambda": 1.0,
            "lr": 0.0001,
            "observation_filter": "NoFilter",
            "num_gpus": 1,
            "env": "demo",
            "model": {
                "fcnet_activation": "relu",
                "fcnet_hiddens": [32, 32]
            }
        })

    def train_ppo(use_pytorch):
        tune.run(
            "PPO",
            checkpoint_freq=10,
            config={
                "use_pytorch": use_pytorch,
                "env": "demo",
                "lambda": 0.95,
                "kl_coeff": 0.5,
                "clip_rewards": True,
                "clip_param": 0.2,
                "vf_clip_param": 10.0,
                "entropy_coeff": 0.01,
                "train_batch_size": 5000,
                "sample_batch_size": 100,
                "sgd_minibatch_size": 500,
                "num_sgd_iter": 10,
                "num_workers": 8,
                "num_envs_per_worker": 10,
                "lr": 1e-4,
                "gamma": 0.9,
                "batch_mode": "truncate_episodes",
                "observation_filter": "NoFilter",
                "num_gpus": 1,
                "model": {
                    "fcnet_activation": "relu",
                    "fcnet_hiddens": [32, 32],
                }
            }
        )

    train_ppo(True)      # Trial 1
    # train_ppo(False)   # Trial 2
    # train_a3c(True)    # Trial 3
    # train_a3c(False)   # Trial 4
Trial 1 (PPO, Torch): training doesn't even start:
File "[...]/ray/tune/trial_runner.py", line 467, in _process_trial
result = self.trial_executor.fetch_result(trial)
File "[...]/ray/tune/ray_trial_executor.py", line 381, in fetch_result
result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
File "[...]/ray/worker.py", line 1513, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AttributeError): ray::PPO.__init__() (pid=29023, ip=128.232.69.20)
File "python/ray/_raylet.pyx", line 414, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 449, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 450, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 452, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 407, in ray._raylet.execute_task.function_executor
File "[...]/ray/rllib/agents/trainer_template.py", line 90, in __init__
Trainer.__init__(self, config, env, logger_creator)
File "[...]/ray/rllib/agents/trainer.py", line 455, in __init__
super().__init__(config, logger_creator)
File "[...]/ray/tune/trainable.py", line 174, in __init__
self._setup(copy.deepcopy(self.config))
File "[...]/ray/rllib/agents/trainer.py", line 596, in _setup
self._init(self.config, self.env_creator)
File "[...]/ray/rllib/agents/trainer_template.py", line 117, in _init
self.config["num_workers"])
File "[...]/ray/rllib/agents/trainer.py", line 667, in _make_workers
logdir=self.logdir)
File "[...]/ray/rllib/evaluation/worker_set.py", line 62, in __init__
RolloutWorker, env_creator, policy, 0, self._local_config)
File "[...]/ray/rllib/evaluation/worker_set.py", line 272, in _make_worker
_fake_sampler=config.get("_fake_sampler", False))
File "[...]/ray/rllib/evaluation/rollout_worker.py", line 375, in __init__
policy_dict, policy_config)
File "[...]/ray/rllib/evaluation/rollout_worker.py", line 842, in _build_policy_map
policy_map[name] = cls(obs_space, act_space, merged_conf)
File "[...]/ray/rllib/policy/torch_policy_template.py", line 87, in __init__
framework="torch")
File "[...]/ray/rllib/models/catalog.py", line 354, in get_model_v2
obs_space, action_space, num_outputs, model_config, name)
File "[...]/ray/rllib/models/catalog.py", line 471, in _get_default_torch_model_v2
from ray.rllib.models.torch.fcnet import (FullyConnectedNetwork as
File "[...]/ray/rllib/models/torch/fcnet.py", line 5, in <module>
from ray.rllib.models.torch.misc import normc_initializer, SlimFC, \
File "[...]/ray/rllib/models/torch/misc.py", line 62, in <module>
class SlimConv2d(nn.Module):
File "[...]/ray/rllib/models/torch/misc.py", line 71, in SlimConv2d
initializer=nn.init.xavier_uniform_,
AttributeError: 'NNStub' object has no attribute 'init'
Trial 2 (PPO, TensorFlow): runs for a few iterations, then fails due to nan:
2020-04-09 14:56:38,574 ERROR trial_runner.py:521 -- Trial PPO_demo_00000: Error processing event.
Traceback (most recent call last):
File "[...]/ray/tune/trial_runner.py", line 467, in _process_trial
result = self.trial_executor.fetch_result(trial)
File "[...]/ray/tune/ray_trial_executor.py", line 381, in fetch_result
result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
File "[...]/ray/worker.py", line 1513, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AssertionError): ray::PPO.train() (pid=37031, ip=128.232.69.20)
File "python/ray/_raylet.pyx", line 452, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 407, in ray._raylet.execute_task.function_executor
File "[...]/ray/rllib/agents/trainer.py", line 502, in train
raise e
File "[...]/ray/rllib/agents/trainer.py", line 491, in train
result = Trainable.train(self)
File "[...]/ray/tune/trainable.py", line 261, in train
result = self._train()
File "[...]/ray/rllib/agents/trainer_template.py", line 150, in _train
fetches = self.optimizer.step()
File "[...]/ray/rllib/optimizers/multi_gpu_optimizer.py", line 139, in step self.train_batch_size)
File "[...]/ray/rllib/optimizers/rollout.py", line 25, in collect_samples
next_sample = ray_get_and_free(fut_sample)
File "[...]/ray/rllib/utils/memory.py", line 29, in ray_get_and_free
result = ray.get(object_ids)
ray.exceptions.RayTaskError(AssertionError): ray::RolloutWorker.sample() (pid=37026, ip=128.232.69.20)
File "python/ray/_raylet.pyx", line 452, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 407, in ray._raylet.execute_task.function_executor
File "[...]/ray/rllib/evaluation/rollout_worker.py", line 492, in sample
batches = [self.input_reader.next()]
File "[...]/ray/rllib/evaluation/sampler.py", line 53, in next
batches = [self.get_data()]
File "[...]/ray/rllib/evaluation/sampler.py", line 96, in get_data
item = next(self.rollout_provider)
File "[...]/ray/rllib/evaluation/sampler.py", line 367, in _env_runner
base_env.send_actions(actions_to_send)
File "[...]/ray/rllib/env/base_env.py", line 328, in send_actions
self.vector_env.vector_step(action_vector)
File "[...]/ray/rllib/env/vector_env.py", line 106, in vector_step
obs, r, done, info = self.envs[i].step(actions[i])
File "train_demo.py", line 24, in step
assert(not np.isnan(action))
AssertionError
Trial 3 (A3C, Torch): runs for a few iterations, then fails because the sampling thread dies:
Traceback (most recent call last):
File "[...]/ray/tune/trial_runner.py", line 467, in _process_trial
result = self.trial_executor.fetch_result(trial)
File "[...]/ray/tune/ray_trial_executor.py", line 381, in fetch_result
result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
File "[...]/ray/worker.py", line 1513, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::A3C.train() (pid=38175, ip=128.232.69.20)
File "python/ray/_raylet.pyx", line 452, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 407, in ray._raylet.execute_task.function_executor
File "[...]/ray/rllib/agents/trainer.py", line 502, in train
raise e
File "[...]/ray/rllib/agents/trainer.py", line 491, in train
result = Trainable.train(self)
File "[...]/ray/tune/trainable.py", line 261, in train
result = self._train()
File "[...]/ray/rllib/agents/trainer_template.py", line 142, in _train
return self._train_exec_impl()
File "[...]/ray/rllib/agents/trainer_template.py", line 174, in _train_exec_impl
res = next(self.train_exec_impl)
File "[...]/ray/util/iter.py", line 634, in __next__
return next(self.built_iterator)
File "[...]/ray/util/iter.py", line 644, in apply_foreach
for item in it:
File "[...]/ray/util/iter.py", line 685, in apply_filter
for item in it:
File "[...]/ray/util/iter.py", line 644, in apply_foreach
for item in it:
File "[...]/ray/util/iter.py", line 670, in add_wait_hooks
item = next(it)
File "[...]/ray/util/iter.py", line 644, in apply_foreach
for item in it:
File "[...]/ray/util/iter.py", line 470, in base_iterator
yield ray.get(obj_id)
ray.exceptions.RayTaskError(RuntimeError): ray::RolloutWorker.par_iter_next() (pid=38172, ip=128.232.69.20)
File "python/ray/_raylet.pyx", line 452, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 407, in ray._raylet.execute_task.function_executor
File "[...]/ray/util/iter.py", line 961, in par_iter_next
return next(self.local_it)
File "[...]/ray/util/iter.py", line 644, in apply_foreach
for item in it:
File "[...]/ray/rllib/evaluation/rollout_worker.py", line 251, in gen_rollouts
yield self.sample()
File "[...]/ray/rllib/evaluation/rollout_worker.py", line 492, in sample
batches = [self.input_reader.next()]
File "[...]/ray/rllib/evaluation/sampler.py", line 53, in next
batches = [self.get_data()]
File "[...]/ray/rllib/evaluation/sampler.py", line 198, in get_data
raise RuntimeError("Sampling thread has died")
RuntimeError: Sampling thread has died
Trial 4 (A3C, TensorFlow): same as the previous trial:
File "[...]/ray/tune/trial_runner.py", line 467, in _process_trial
result = self.trial_executor.fetch_result(trial)
File "[...]/ray/tune/ray_trial_executor.py", line 381, in fetch_result
result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
File "[...]/ray/worker.py", line 1513, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::A3C.train() (pid=38826, ip=128.232.69.20)
File "python/ray/_raylet.pyx", line 452, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 407, in ray._raylet.execute_task.function_executor
File "[...]/ray/rllib/agents/trainer.py", line 502, in train
raise e
File "[...]/ray/rllib/agents/trainer.py", line 491, in train
result = Trainable.train(self)
File "[...]/ray/tune/trainable.py", line 261, in train
result = self._train()
File "[...]/ray/rllib/agents/trainer_template.py", line 142, in _train
return self._train_exec_impl()
File "[...]/ray/rllib/agents/trainer_template.py", line 174, in _train_exec_impl
res = next(self.train_exec_impl)
File "[...]/ray/util/iter.py", line 634, in __next__
return next(self.built_iterator)
File "[...]/ray/util/iter.py", line 644, in apply_foreach
for item in it:
File "[...]/ray/util/iter.py", line 685, in apply_filter
for item in it:
File "[...]/ray/util/iter.py", line 644, in apply_foreach
for item in it:
File "[...]/ray/util/iter.py", line 670, in add_wait_hooks
item = next(it)
File "[...]/ray/util/iter.py", line 644, in apply_foreach
for item in it:
File "[...]/ray/util/iter.py", line 470, in base_iterator
yield ray.get(obj_id)
ray.exceptions.RayTaskError(RuntimeError): ray::RolloutWorker.par_iter_next() (pid=38831, ip=128.232.69.20)
File "python/ray/_raylet.pyx", line 452, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 407, in ray._raylet.execute_task.function_executor
File "[...]/ray/util/iter.py", line 961, in par_iter_next
return next(self.local_it)
File "[...]/ray/util/iter.py", line 644, in apply_foreach
for item in it:
File "[...]/ray/rllib/evaluation/rollout_worker.py", line 251, in gen_rollouts
yield self.sample()
File "[...]/ray/rllib/evaluation/rollout_worker.py", line 492, in sample
batches = [self.input_reader.next()]
File "[...]/ray/rllib/evaluation/sampler.py", line 53, in next
batches = [self.get_data()]
File "[...]/ray/rllib/evaluation/sampler.py", line 198, in get_data
raise RuntimeError("Sampling thread has died")
RuntimeError: Sampling thread has died
Given that all four of these tests fail, it seems most likely that something is wrong with my setup or my environment, but I don't see what the problem with the environment could be.
EDIT: I took a look at the progress.csv and it looks like total_loss is suddenly nan, policy_loss is getting very small, vf_loss huge, vf_explained_var 0, and both kl and entropy nan:
progress.zip
I tried to apply #7609 to Ray 0.8.4 and specified bounds for the action space. It doesn't crash anymore, but the reward drops significantly after a few iterations and total_loss, kl, and entropy become nan again (see progress.zip).
@sven1977 Do you have an idea what might cause this? If you have a starting point I'd be happy to look into it.
Sorry for the delay. Taking a look now. ...
Hmm, I think it's simply your +/- inf states (observation space does not matter here b/c you don't use it in your env) that the env produces sometimes. Your state outputs are directly calculated from your actions (without any bounds checking), so there is definitely a lot of opportunity here for such instabilities.
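For illustration, a minimal sketch of what such bounds checking could look like in the toy env above; the clip ranges and the BoundedDemoEnv2 name are arbitrary assumptions, not part of the original report:

import numpy as np

# Sketch only: a bounded variant of DemoEnv2 (defined in the script above).
# With the position clipped, neither the observation nor the reward can
# drift towards +/- inf, regardless of which actions the policy samples.
POS_LIMIT = 10.0

class BoundedDemoEnv2(DemoEnv2):
    def step(self, action):
        action = np.clip(np.asarray(action, dtype=np.float64), -1.0, 1.0)
        self.pos = np.clip(self.pos + action, -POS_LIMIT, POS_LIMIT)
        dist_to_goal = float(np.abs(self.pos - self.goal))
        if dist_to_goal < 0.5:
            self.cnt_timesteps_goal_reached += 1
            reward = 5.0
        else:
            self.cnt_timesteps_goal_reached = 0
            reward = 0.0
        done = dist_to_goal > POS_LIMIT or self.cnt_timesteps_goal_reached > 30
        state = self.pos - self.goal  # always finite now
        return state, reward, done, {}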
No problem, thanks for taking a look! I also tried a bounded action space and I think the same thing happened. The stable-baselines docs claim that it is always best to normalize the action space to [-1, 1] and then map from that range to whatever is needed; does that also make sense for RLlib? Eventually, I used stable-baselines for this problem and it worked there, but maybe also because of the changes I made along the way. I will try the final environment I used there again in RLlib and see if it works then.
Yeah, it absolutely makes sense. We are currently simply clipping actions to the given bounds. There is a related issue that I would like to fix concerning PPO and the difference between action spaces like [0, 1.0] vs [-1.0, 1.0], etc. Our PPO could simply use a SquashedGaussian or Beta distribution for these spaces, which should all work better than clipping. We'll fix this.
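For reference, a minimal sketch of the [-1, 1] normalization idea discussed above, done on the env side with a wrapper. The wrapper below is an illustration, not an RLlib API (newer Gym versions ship a similar RescaleAction wrapper):

import gym
import numpy as np

class RescaleActionWrapper(gym.ActionWrapper):
    """Expose a [-1, 1] action space and map actions back to the wrapped env's bounds."""

    def __init__(self, env):
        super().__init__(env)
        self.low = env.action_space.low
        self.high = env.action_space.high
        self.action_space = gym.spaces.Box(
            low=-1.0, high=1.0, shape=env.action_space.shape, dtype=np.float32)

    def action(self, action):
        # Clip to [-1, 1] for safety, then map linearly to [low, high].
        action = np.clip(action, -1.0, 1.0)
        return self.low + (action + 1.0) * 0.5 * (self.high - self.low)

With an env that has finite action bounds, the only change on the RLlib side would then be to register the wrapped env, e.g. register_env("demo", lambda _: RescaleActionWrapper(SomeBoundedEnv())), where SomeBoundedEnv is a placeholder for such an env.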
I would like to follow up on this since I am having similar trouble now, though with a different environment and a more complex model with differentiable communication channels. I observe that after a seemingly random training time, the log probabilities computed with the logp function of the action distribution become -inf. My action space is bounded to something small like [-1, 1], but for some reason the predicted actions that cause these -inf logps grow to the order of 10e10. Do you have an idea what could cause this? Maybe I am missing something obvious.
Having similar issues with:

LARGE_POSITIVE_VALUE = np.finfo(np.float32).max
self.observation_space = spaces.Box(low=-LARGE_POSITIVE_VALUE,
                                    high=LARGE_POSITIVE_VALUE,
                                    shape=(1,))
self.action_space = spaces.Box(low=0, high=5)
Actually, A3C seems to have the same issue; it seems to be related to setting entropy_coeff > 0. @janblumenkamp I'd reopen this issue; I haven't been able to find the source of the problem. @sven1977
I have this issue primarily with a custom trainer, but I was able to reproduce it with standard PPO. This is my current setup. The issue seems to occur after a random time, sometimes very quickly, sometimes much later, which makes it really hard to debug. The environment consists of two differential drive robots; the action space is a lateral and angular speed for each. Each agent has a fixed start and goal, and the goal for each agent is to move to its own goal. The observation is the agent's own world position, the relative goal position, and the current speed:
import gym
import numpy as np
import ray
import time
import yaml
from ray import tune
from ray.tune.registry import register_env
from scipy.spatial.transform import Rotation as R


class Agent():
    CONFIG = {
        'limits': {
            'forward_speed': (-0.2, 0.2),
            'yaw_rate': (-np.pi/8, np.pi/8),
        },
    }

    def __init__(self, reset_pos, goal_pos):
        self.initial_pos = self.position = np.array(reset_pos)
        self.goal_pos = np.array(goal_pos)
        self.reset()

    def reset(self):
        self.orientation = R.from_euler("z", 90, degrees=True)
        self.position = self.initial_pos.copy()
        self.closest_dist_to_goal = np.sqrt(np.sum((self.position - self.goal_pos)**2))
        self.setpoint_forward_speed = 0
        self.setpoint_yaw_rate = 0

    def set_velocity(self, angular, lateral):
        if np.isnan(angular) or np.isnan(lateral):
            import pdb; pdb.set_trace()
        self.setpoint_forward_speed = np.clip(lateral, *self.CONFIG['limits']['forward_speed'])
        self.setpoint_yaw_rate = np.clip(angular, *self.CONFIG['limits']['yaw_rate'])

    def get_rotation_matrix(self):
        return self.orientation.as_matrix()

    def step(self):
        dt = 1/24
        self.position += self.orientation.apply(np.array([self.setpoint_forward_speed, 0, 0]) * dt)
        self.orientation *= R.from_euler("xyz", np.array([0, 0, -self.setpoint_yaw_rate]) * dt)
        m = self.get_rotation_matrix()
        goal_relative = (self.goal_pos - self.position) @ m
        speed = np.array([self.setpoint_forward_speed, -self.setpoint_yaw_rate])
        obs = np.hstack([self.position, goal_relative, speed])
        dst_goal = np.sqrt(np.sum(goal_relative**2))
        reward = 0
        if dst_goal < self.closest_dist_to_goal:
            reward = self.closest_dist_to_goal - dst_goal
            self.closest_dist_to_goal = dst_goal
        done = dst_goal < 0.1
        if not np.all(np.isfinite(obs)) or not np.isfinite(reward):
            import pdb; pdb.set_trace()
        return obs, reward, done


class SimpleEnv(gym.Env):
    def __init__(self, config):
        self.cfg = config
        n_agents = len(self.cfg['agent_poses'])
        self.action_space = gym.spaces.Box(low=np.array([-np.pi/8, -0.2]*n_agents), high=np.array([np.pi/8, 0.2]*n_agents), shape=(2*n_agents,), dtype=float)
        self.observation_space = gym.spaces.Box(-10000, 10000, shape=(8*n_agents,), dtype=float)
        self.robots = []
        for initial_pose, goal_pose in zip(self.cfg['agent_poses'], self.cfg['agent_goals']):
            self.robots.append(Agent(initial_pose, goal_pose))
        self.reset()

    def reset(self):
        self.timestep = 0
        for robot in self.robots:
            robot.reset()
        return self.step([0, 0]*len(self.robots))[0]

    def step(self, actions):
        self.timestep += 1
        obs, dones = [], []
        reward = 0
        for i, (robot, action) in enumerate(zip(self.robots, np.array(actions).reshape(-1, len(self.robots)))):
            robot.set_velocity(action[0], action[1])
            o, r, d = robot.step()
            obs.append(o)
            dones.append(d)
            reward += r
        done = all(dones) or self.timestep > self.cfg['max_time_steps']
        return np.concatenate(obs), reward, done, {}


if __name__ == '__main__':
    register_env("simple_env", lambda config: SimpleEnv(config))
    ray.init()
    tune.run(
        "PPO",
        checkpoint_freq=1,
        keep_checkpoints_num=2,
        config={
            "framework": "torch",
            "_use_trajectory_view_api": False,
            "env": "simple_env",
            "lambda": 0.95,
            "kl_coeff": 0.5,
            "clip_rewards": True,
            "clip_param": 0.2,
            "entropy_coeff": 0.01,
            "train_batch_size": 100,
            "sgd_minibatch_size": 32,
            "num_sgd_iter": 10,
            "num_workers": 7,
            "num_envs_per_worker": 16,
            "lr": 1e-4,
            "gamma": 0.99,
            "batch_mode": "truncate_episodes",
            "observation_filter": "NoFilter",
            "num_gpus": 1,
            "model": {
                "fcnet_activation": "relu",
                "fcnet_hiddens": [128, 256, 128, 32],
            },
            "env_config": {
                'wall': True,
                'agent_poses': [
                    [-0.3, -0.5, 0],
                    [0.3, -0.5, 0],
                ],
                'agent_goals': [
                    [0.3, 0.5, 0],
                    [-0.3, 0.5, 0]
                ],
                'max_time_steps': 3000,
                'communication_range': 2.0,
                'render': False,
            }})
I have uploaded two logging CSVs here (progress.zip). The training is always interrupted, with the last iteration having a nan loss.
I've noticed that if you set entropy_coeff to 0, the error should disappear.
So, if the entropy coefficient is 0, doesn't that mean that there is no exploration and training might get stuck in local minima?
Right, I only mention it as a possible line of investigation towards a solution.
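A middle ground between entropy_coeff=0.01 and 0 would be to lower or anneal the bonus; RLlib's PPO accepts an entropy_coeff_schedule of [timestep, value] pairs for this. The numbers below are arbitrary examples, not a recommendation:

# Example only: decay the entropy bonus instead of removing exploration entirely.
config_update = {
    "entropy_coeff": 0.01,
    # Linearly anneal from 0.01 to 0 over the first 1M sampled timesteps.
    "entropy_coeff_schedule": [[0, 0.01], [1_000_000, 0.0]],
}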
I see. I ran a few more experiments and unfortunately this does not work for me.
@dmadeka, are you using TensorFlow or PyTorch?
@janblumenkamp did you make any further progress with this? I am having the same issue. Action NaN at a random time using APPO in a multi-agent environment with Python 3.8, PyTorch. Thanks.
I actually don't think this is a systematic error in the implementation; rather, it occurs due to a poor choice of hyperparameters (as @dmadeka suggested, have a look at the entropy coefficient and maybe also at batch sizes). I also found it has something to do with the reward. I changed my reward function from a sparse to a dense/shaped reward and that helped a lot. I guess something like this still shouldn't happen. I also noticed that training in TensorFlow seems to be more stable.
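To make the sparse-vs-dense distinction concrete, here is a small sketch in the spirit of the goal-reaching envs earlier in the thread; the coefficients are made up:

def sparse_reward(dist_to_goal):
    # Signal only when the goal is (almost) reached; long stretches of zero reward.
    return 5.0 if dist_to_goal < 0.5 else 0.0

def dense_reward(dist_to_goal, prev_dist_to_goal):
    # Shaped: reward any progress towards the goal, plus a bonus at the goal.
    progress = prev_dist_to_goal - dist_to_goal
    bonus = 5.0 if dist_to_goal < 0.5 else 0.0
    return progress + bonus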
Thanks. This makes sense to me as my reward is currently very sparse and I get the same action NaN (with varying levels of severity) when trying other continuous action algorithms (A2C, PG, PPO, etc) with default parameters. Seems unlikely that all algorithms would have implementation errors...
I am running into a similar issue: training with SAC on a continuous action space and getting NaNs in the predicted actions after a varying amount of time, which makes me wonder if this is some instability. When training on the same environment with Stable-Baselines3 I did not experience this, which makes me think it is not due to the environment.
A bit different from the above is that this does not seem to happen with PPO (so far). Actions are always in the [-1, 1] range, so I am running with normalize_actions=True. Otherwise the setup is very close to the default configuration (only adapting batch_size, tau, and buffer_size). Rewards are also bounded within [-2, 0]. The only interesting thing I saw in the metrics is a sudden drop in the alpha loss.
I'm having quite a hard time tracking down what could be the cause of this.
I recently encountered this error in my own project. In my case, it seems like it was related to a mismatch between the choice of activation function and the bounds of the action space. I mistakenly used relu (the default for SAC) when some other logic in my project assumed tanh.
I don't think my example is the typical way people would encounter this error, but I'm mentioning it as a heads-up for others.
Just to follow up on this: I have recently worked more on continuous environments and this issue occurred again. I am pretty sure that it originated in NaNs in the reward, which propagated to the model and eventually to the action. I think it would make sense to assert on the RLlib side that rewards are not nan, or is this something that should be up to the user? Perhaps at least throw a warning that can be disabled?
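Until something like that exists in RLlib itself, a minimal user-side sketch would be a wrapper that fails fast on non-finite values before they ever reach the trainer; the wrapper name and behavior here are my own, not an RLlib feature:

import gym
import numpy as np

class FiniteCheckWrapper(gym.Wrapper):
    """Raise immediately if the env emits a non-finite observation or reward."""

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        assert np.all(np.isfinite(obs)), f"non-finite observation: {obs}"
        assert np.isfinite(reward), f"non-finite reward: {reward}"
        return obs, reward, done, info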
Another follow-up: there must be an issue deep in RLlib that causes the gradients to become nan. This patch replaces NaNs in the gradients with zero in Ray 1.13.0:
diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py
index 2b95a2157..5a3cd0d0e 100644
--- a/rllib/policy/torch_policy.py
+++ b/rllib/policy/torch_policy.py
@@ -1106,6 +1106,8 @@ class TorchPolicy(Policy):
         for param_idx, param in enumerate(parameters):
             if param_idx in param_indices:
                 if param.grad is not None:
+                    # Bugfix: Set all nan grads to zero
+                    param.grad = torch.nan_to_num(param.grad)
                     grads.append(param.grad)
                     all_grads[param_idx] = param.grad
Apply it with patch -p1 /path/to/python/dist-packages/ray/rllib/policy/torch_policy.py ./patch.patch
This, at least temporarily, seems to resolve this issue.
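A roughly equivalent workaround that avoids patching the installed RLlib sources would be to register gradient hooks on the policy model's parameters from user code. This is a sketch only; where exactly to call it depends on how the policy is set up, and torch.nan_to_num requires PyTorch >= 1.8:

import torch

def sanitize_gradients(model: torch.nn.Module) -> None:
    # Replace nan entries in each parameter's gradient with zero (infs are
    # clamped to large finite values), mirroring the nan_to_num patch above.
    for param in model.parameters():
        if param.requires_grad:
            param.register_hook(lambda grad: torch.nan_to_num(grad))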
Is this solved permanently? I am also using PPO with sparse rewards and my action distribution receives nans. I am on rllib 2.4.0 but the problem still persists.
I am getting nan with APPO as well - I'm wondering why this issue was closed in the first place.
I am also getting a similar issue with rllib 2.32, where the loc parameter of the torch action distribution is all nans.
What is the problem?
After a few training iterations, the action becomes nan. I am honestly not sure whether I did something wrong in the environment or in the config.
Ray version and other system information (Python version, TensorFlow version, OS):
Reproduction (REQUIRED)
Output:
If I run it on the latest wheel, I get the following error: