Closed: sxzhuang closed this issue 6 months ago.
Was truncation == True?
The value of truncation was False when the reward was 1.0 in the sparse-reward Maze env. I have shared the code and attached a model; I hope this helps reproduce the reported bug.
gymnasium-robotics == 1.2.3
gymnasium == 0.29.1
import gymnasium as gym
import torch
from dataclasses import dataclass
import os
import tyro
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

LOG_STD_MAX = 2
LOG_STD_MIN = -5


@dataclass
class Args:
    exp_name: str = os.path.basename(__file__)[: -len(".py")]
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    torch_deterministic: bool = True
    """if toggled, `torch.backends.cudnn.deterministic=False`"""
    cuda: bool = True
    """if toggled, cuda will be enabled by default"""
    capture_video: bool = False
    """whether to capture videos of the agent performances (check out `videos` folder)"""
    render: bool = False
    env_id: str = "Point-Env"
    """the environment id of the task"""


class Actor(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.fc1 = nn.Linear(np.array(env.observation_space.shape).prod(), 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc_mean = nn.Linear(256, np.prod(env.action_space.shape))
        self.fc_logstd = nn.Linear(256, np.prod(env.action_space.shape))
        # action rescaling
        self.register_buffer(
            "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32).unsqueeze(0)
        )
        self.register_buffer(
            "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32).unsqueeze(0)
        )

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mean = self.fc_mean(x)
        log_std = self.fc_logstd(x)
        log_std = torch.tanh(log_std)
        log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)  # From SpinUp / Denis Yarats
        return mean, log_std

    def get_action(self, x):
        mean, log_std = self(x)
        std = log_std.exp()
        normal = torch.distributions.Normal(mean, std)
        x_t = normal.rsample()  # for reparameterization trick (mean + std * N(0,1))
        y_t = torch.tanh(x_t)
        action = y_t * self.action_scale + self.action_bias
        log_prob = normal.log_prob(x_t)
        # Enforcing Action Bound
        log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6)
        log_prob = log_prob.sum(1, keepdim=True)
        mean = torch.tanh(mean) * self.action_scale + self.action_bias
        return action, log_prob, mean


def evaluate(
    model_path: str,
    seed: int,
    eval_episodes: int,
    Model: torch.nn.Module,
    device: torch.device = torch.device("cpu"),
):
    env = gym.make('PointMaze_Medium_Diverse_G-v3', max_episode_steps=300, render_mode="rgb_array")
    env = gym.wrappers.FlattenObservation(env)  # deal with the Dict observation space
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env.action_space.seed(seed)

    agent = Model(env).to(device)
    agent.load_state_dict(torch.load(model_path, map_location=device))
    agent.eval()

    obs, _ = env.reset()
    episodic_returns = []
    step = 0
    rd_ep = 0.
    while len(episodic_returns) < eval_episodes:
        with torch.no_grad():
            actions, _, _ = agent.get_action(torch.Tensor(obs).unsqueeze(0).to(device))
        step += 1
        next_obs, rewards, terminations, truncations, infos = env.step(actions.cpu().numpy().squeeze())
        rd_ep += rewards
        # if rewards != 0:
        #     print(f"rewards: {rewards}; terminations: {terminations}, truncations: {truncations}")
        if terminations or truncations:
            episodic_returns.append(rd_ep)
            next_obs, _ = env.reset()
            rd_ep = 0.
        obs = next_obs
    return episodic_returns


args = tyro.cli(Args)
device = torch.device("cuda:0" if torch.cuda.is_available() and args.cuda else "cpu")
# Change to your model's path
model_path = 'test2._99999_model'
episodic_returns = evaluate(
    model_path,
    args.seed,
    eval_episodes=10,
    Model=Actor,
    device=device,
)
print(episodic_returns)
Here is the model's link
By default, the continuing_task=True environment argument is used; if you want the episode to terminate when the goal is reached, use continuing_task=False.
https://robotics.farama.org/envs/maze/point_maze/#arguments
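For illustration, a minimal sketch of the flag (random actions stand in for a trained policy here, which is an assumption; the env id and step limit follow the script above):

import gymnasium as gym

# With continuing_task=False the episode ends once the goal is reached,
# so `terminated` is True on the success step.
env = gym.make('PointMaze_Medium_Diverse_G-v3', max_episode_steps=300, continuing_task=False)

obs, _ = env.reset(seed=1)
while True:
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    if terminated or truncated:
        print(f"reward: {reward}, success: {info['success']}, terminated: {terminated}")
        break
env.close()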
The documentation does not specify the default argument values for AntMaze and PointMaze. If you are willing, please make a PR to fix that.
Describe the bug
I have trained a custom map for the PointMaze_UMaze-v3 environment. While testing the performance of my policy, I encountered a bug in this environment. Upon debugging my code, I discovered that when rewards == 1 and infos["success"] == True, terminations == False. Below is my testing code (some details are omitted).

Code example
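A minimal sketch of that check (the full evaluation script appears earlier in this thread; random actions here are a stand-in for the trained policy, which is an assumption):

import gymnasium as gym

env = gym.make('PointMaze_UMaze-v3', max_episode_steps=300)  # continuing_task=True by default
obs, _ = env.reset(seed=1)
for _ in range(300):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    if reward == 1 and info['success']:
        # Observed behavior: the goal is reached but terminated stays False
        # (expected with the default continuing_task=True, per the reply above).
        print(f"terminated: {terminated}, truncated: {truncated}")
        break
env.close()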
System Info
gymnasium == 0.29.1