Farama-Foundation / Gymnasium-Robotics

A collection of robotics simulation environments for reinforcement learning
https://robotics.farama.org/
MIT License

Sparse PointMaze Environment: Rewards Received but Not Terminated Issue #190

Closed sxzhuang closed 6 months ago

sxzhuang commented 7 months ago

Describe the bug
I trained a policy on a custom map for the PointMaze_UMaze-v3 environment. While testing the policy's performance, I encountered a bug: when rewards == 1 and infos["success"] == True, terminations is still False. Below is my testing code (some details are omitted).

Code example

import gymnasium as gym
import torch

def make_env():
    def thunk():
        H_MAZE = [[1, 1, 1, 1, 1],
                  [1, 0, 0, "g", 1],
                  [1, 1, 0, 1, 1],
                  [1, "r", 0, 0, 1],
                  [1, 1, 1, 1, 1]]
        env = gym.make('PointMaze_UMaze-v3', maze_map=H_MAZE, max_episode_steps=200, render_mode="rgb_array")
        env = gym.wrappers.FlattenObservation(env)  # flatten the Dict observation space
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env.action_space.seed(1)
        return env
    return thunk

envs = gym.vector.SyncVectorEnv([make_env()])

# Load the policy (Model, model_path and device are defined elsewhere; details omitted)
agent = Model(envs).to(device)
agent.load_state_dict(torch.load(model_path, map_location=device))
agent.eval()

obs, _ = envs.reset()
while True:
    with torch.no_grad():
        actions, _, _ = agent.get_action(torch.Tensor(obs).to(device))
        next_obs, rewards, terminations, _, infos = envs.step(actions.cpu().numpy())
    obs = next_obs  # carry the new observation into the next step
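
For reference, here is a minimal sketch of how the mismatch described above can be surfaced (it reuses the envs, agent and device objects from the snippet above and assumes the Gymnasium 0.29 vector API, where infos is a dict of arrays); it logs the steps at which the sparse reward of 1.0 arrives while terminated stays False:

obs, _ = envs.reset(seed=1)
for step in range(200):
    with torch.no_grad():
        actions, _, _ = agent.get_action(torch.Tensor(obs).to(device))
    obs, rewards, terminations, truncations, infos = envs.step(actions.cpu().numpy())
    if rewards[0] == 1.0:
        # With the default environment arguments, terminated stays False here
        # even though infos["success"] reports True.
        print(f"step={step}: reward={rewards[0]}, terminated={terminations[0]}, "
              f"truncated={truncations[0]}, success={infos.get('success', [None])[0]}")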

System Info: gymnasium == 0.29.1


Kallinteris-Andreas commented 7 months ago

Was truncation==True?

sxzhuang commented 7 months ago

Was truncation==True?

The value of truncation was also False when the reward was 1.0 in the sparse-reward maze environment. I have shared the code and attached a model; I hope this helps reproduce the reported bug.

gymnasium-robotics == 1.2.3 gymnasium == 0.29.1

import gymnasium as gym
import torch
from dataclasses import dataclass
import os
import tyro
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

LOG_STD_MAX = 2
LOG_STD_MIN = -5

@dataclass
class Args:
    exp_name: str = os.path.basename(__file__)[: -len(".py")]
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    torch_deterministic: bool = True
    """if toggled, `torch.backends.cudnn.deterministic=False`"""
    cuda: bool = True
    """if toggled, cuda will be enabled by default"""
    capture_video: bool = False
    """whether to capture videos of the agent performances (check out `videos` folder)"""
    render: bool = False

    env_id: str = "Point-Env"
    """the environment id of the task"""

class Actor(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.fc1 = nn.Linear(np.array(env.observation_space.shape).prod(), 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc_mean = nn.Linear(256, np.prod(env.action_space.shape))
        self.fc_logstd = nn.Linear(256, np.prod(env.action_space.shape))
        # action rescaling
        self.register_buffer(
            "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32).unsqueeze(0)
        )
        self.register_buffer(
            "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32).unsqueeze(0)
        )

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mean = self.fc_mean(x)
        log_std = self.fc_logstd(x)
        log_std = torch.tanh(log_std)
        log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)  # From SpinUp / Denis Yarats

        return mean, log_std

    def get_action(self, x):
        mean, log_std = self(x)
        std = log_std.exp()
        normal = torch.distributions.Normal(mean, std)
        x_t = normal.rsample()  # for reparameterization trick (mean + std * N(0,1))
        y_t = torch.tanh(x_t)
        action = y_t * self.action_scale + self.action_bias
        log_prob = normal.log_prob(x_t)
        # Enforcing Action Bound
        log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6)
        log_prob = log_prob.sum(1, keepdim=True)
        mean = torch.tanh(mean) * self.action_scale + self.action_bias
        return action, log_prob, mean

def evaluate(
    model_path: str,
    seed: int,
    eval_episodes: int,
    Model: torch.nn.Module,
    device: torch.device = torch.device("cpu"),

):
    env = gym.make('PointMaze_Medium_Diverse_G-v3', max_episode_steps=300, render_mode="rgb_array")
    env = gym.wrappers.FlattenObservation(env)  # flatten the Dict observation space
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env.action_space.seed(seed)    

    agent = Model(env).to(device)
    agent.load_state_dict(torch.load(model_path, map_location=device))
    agent.eval()

    obs, _ = env.reset()
    episodic_returns = []
    step = 0
    rd_ep = 0.
    while len(episodic_returns) < eval_episodes:
        with torch.no_grad():
            actions, _, _  = agent.get_action(torch.Tensor(obs).unsqueeze(0).to(device))
            step += 1
            next_obs, rewards, terminations, truncations, infos = env.step(actions.cpu().numpy().squeeze())
            rd_ep += rewards
            # if rewards != 0:
            #     print(f"rewards: {rewards}; terminations: {terminations}, truncations:{truncations}")
            if terminations or truncations:
                episodic_returns.append(rd_ep)
                next_obs, _ = env.reset()
                rd_ep = 0.
            obs = next_obs

    return episodic_returns

args = tyro.cli(Args)
device = torch.device("cuda:0" if torch.cuda.is_available() and args.cuda else "cpu")

# Change to your model's path
model_path = 'test2._99999_model'

episodic_returns = evaluate(
    model_path,
    args.seed,
    eval_episodes=10,
    Model=Actor,
    device=device,
)
print(episodic_returns)

Here is the model's link

Kallinteris-Andreas commented 7 months ago

By default, the environment is created with continuing_task=True. If you want the episode to terminate when the goal is reached, use continuing_task=False.

https://robotics.farama.org/envs/maze/point_maze/#arguments
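
For example, here is a minimal sketch (assuming gymnasium-robotics 1.2.3 and gymnasium 0.29.1, and using a random policy purely for illustration) in which passing continuing_task=False makes the sparse reward of 1.0 coincide with terminated == True:

import gymnasium as gym
import gymnasium_robotics  # registers the maze environments (may be redundant depending on versions)

env = gym.make(
    'PointMaze_UMaze-v3',
    max_episode_steps=200,
    continuing_task=False,  # terminate the episode once the goal is reached
)

obs, info = env.reset(seed=1)
terminated = truncated = False
while not (terminated or truncated):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    if reward == 1.0:
        print(f"reward={reward}, terminated={terminated}, success={info['success']}")
env.close()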

The documentation does not specify the default argument values for AntMaze and PointMaze; if you are willing, please make a PR to fix that.