qgallouedec / panda-gym

Set of robotic environments based on PyBullet physics engine and gymnasium.
MIT License

`NaN` during training #78

Closed: accuracy-maker closed this issue 10 months ago

accuracy-maker commented 11 months ago

I implemented the A2C algorithm to train the panda-gym `PandaReachDense-v3` environment, but I get NaN values during training. I didn't use SB3 because I wanted to implement the algorithm from scratch. Below is my code:

import gymnasium as gym
import numpy as np
# packages specific to this notebook
import panda_gym  # registers the Panda environments with gymnasium
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from collections import deque
import matplotlib.pyplot as plt
import random
import imageio
from torch.utils.tensorboard import SummaryWriter
%matplotlib inline
env_id = "PandaReachDense-v3"
env = make_vec_env(env_id, n_envs=1)
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

# TensorBoard writer used by Agent.update() and Agent.train() below
writer = SummaryWriter()

obs = env.reset()

# the observation is a Dict of arrays; flatten it to get the networks' input size
achieved_goal = obs['achieved_goal']
desired_goal = obs['desired_goal']
observation = obs['observation']

merged_array = np.concatenate([achieved_goal, desired_goal, observation], axis=-1)
merged_array.shape
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# the actions are continuous, so we model the policy with a Normal distribution
# rather than a Categorical one
from torch.distributions import Normal

# helper used by the Actor/Critic forward passes: flatten a panda-gym Dict
# observation (achieved_goal, desired_goal, observation) and convert it to a float tensor
def t(x):
    if isinstance(x, dict):
        x = np.concatenate([x['achieved_goal'], x['desired_goal'], x['observation']], axis=-1)
    return torch.as_tensor(x, dtype=torch.float32)
# Actor module: outputs the mean and log_std of a Gaussian policy
class Actor(nn.Module):
    def __init__(self, state_space, action_space,lr,device):
        super(Actor,self).__init__()
        self.fc1 = nn.Linear(state_space, 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, action_space * 2)

        for layer in [self.fc1, self.fc2, self.fc3]:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)

        self.optimizer = optim.Adam(self.parameters(),lr=lr)
        self.to(device)

    def forward(self, X):
        # convert/flatten the observation and move it to the same device as the network
        X = t(X).to(self.fc1.weight.device)
        X = F.relu(self.fc1(X))
        X = F.relu(self.fc2(X))
        out = self.fc3(X)
        return out
# Critic module
class Critic(nn.Module):
    def __init__(self, state_space,lr,device):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_space, 64),
            nn.ReLU(),
            nn.Linear(64,32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
        self.optimizer = optim.Adam(self.parameters(),lr=lr)
        self.to(device)

    def forward(self, X):
        return self.model(t(X).to(self.model[0].weight.device))
class Agent():
    def __init__(
        self,
        env: VecNormalize,  # the vectorized (and normalized) environment
        state_space: int,
        action_space: int,
        lr: float,
        device: torch.device,
        gamma: float, # discounted rewards
        n_training_episodes: int,
        n_eval_episodes: int,
        max_t: int
    ):
        # init the variables
        self.env = env
        self.state_space = state_space
        self.action_space = action_space
        self.lr = lr
        self.device = device
        self.gamma = gamma
        self.n_training_episodes = n_training_episodes
        self.n_eval_episodes = n_eval_episodes
        self.max_t = max_t

        # self.actor_critic = ActorCritic(state_space,action_space,
        #                                 lr,device)
        self.actor = Actor(state_space,action_space,lr,device)
        self.critic = Critic(state_space,lr,device)

    def act(self,state):
        # choose an action given a state
        # the state returned by the vectorized env is a Dict with keys
        # 'observation', 'achieved_goal' and 'desired_goal'; t() flattens it
        # and converts it to a tensor before it reaches the actor
        # continuous actions are sampled from a Normal distribution whose mean
        # and log_std come from the actor head
        act_out = self.actor(state)
        mean = act_out[:,:self.action_space]
        log_std = act_out[:,self.action_space:]
        # convert log_std to std
        std = torch.exp(log_std)
        normal = Normal(mean,std)
        action = normal.sample()
        # sum over action dimensions so there is one log-probability per action
        log_prob = normal.log_prob(action).sum(dim=-1)
        return action.cpu().numpy(),log_prob

    def step(self,action):
        # the agent takes the action and observes the next observation and immediate reward
        next_obs,reward,done,info = self.env.step(action)
        return next_obs, reward, done

    def update(self,advantage,log_prob):
        """Update both networks at each timestep; 'advantage' must already be a tensor."""
        # caution: with a single environment 'advantage' holds one element, and
        # .std() of a single-element tensor is NaN
        advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)

        policy_loss = torch.mean(-advantage.detach() * log_prob)

        writer.add_scalar('policy_loss',policy_loss)

        self.actor.optimizer.zero_grad()
        policy_loss.backward()
        self.actor.optimizer.step()

        # note: 'advantage' was computed under torch.no_grad() in train(), so this
        # loss does not backpropagate into the critic's parameters
        critic_loss = advantage.pow(2).mean()

        writer.add_scalar('critic_loss',critic_loss)

        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

    def train(self):
        """The agent plays many episodes and updates its parameters."""
        scores_deque = deque(maxlen=100)
        scores = []
        avg_scores = []

        for episode in range(self.n_training_episodes):
            # store the rewards collected in this episode
            rewards = []
            # we don't keep a list of saved log-probs here: the networks are
            # updated at every timestep from the immediate reward and the
            # critic's value predictions

            # reset the vectorized env (VecEnv.reset() returns only the observation)
            state = self.env.reset()

            # roll out at most max_t timesteps
            for t in range(self.max_t):
                # sample an action
                action,log_prob = self.act(state)
                # step the environment forward
                next_obs,reward,done = self.step(action)
                rewards.append(reward)

                writer.add_scalar('timestep reward', float(reward[0]))

                # compute the advantage: A = r + gamma * V(next) - V(now)
                with torch.no_grad():
                    value = self.critic(state)
                    value_ = self.critic(next_obs)
                advantage = torch.tensor(reward, device=value.device).float() + self.gamma * value_ * (1 - float(done[0])) - value
                advantage.requires_grad = True
                self.update(advantage,log_prob)

                if done:
                    break

                state = next_obs

            scores_deque.append(sum(rewards))
            writer.add_scalar('ep_reward',sum(rewards),episode)
            scores.append(sum(rewards))
            avg_scores.append(np.mean(scores_deque))
            writer.add_scalar('avg_reward',np.mean(scores_deque),episode)

            if episode % 100 == 0:
                print(f'episode: {episode} | average score: {np.mean(scores_deque)}')

        fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
        fig.suptitle('Training Progress')

        ax1.plot(scores, label='Episode Scores', color='b')
        ax1.set_xlabel('Episode')
        ax1.set_ylabel('scores', color='b')
        ax1.tick_params(axis='y', labelcolor='b')

        ax2.plot(avg_scores, label='Episode Average Score', color='r')
        ax2.set_xlabel('Episode')
        ax2.set_ylabel('Average Score', color='r')
        ax2.tick_params(axis='y', labelcolor='r')

    # add the evaluate function
    def evaluate(self):
        episode_rewards = []

        for episode in range(self.n_eval_episodes):
            # init the state
            state = self.env.reset()
            total_rewards_ep = 0

            for t in range(self.max_t):
                action,_ = self.act(state)
                next_obs,reward,done = self.step(action)
                total_rewards_ep += reward
                if done:
                    break

                state = next_obs

            episode_rewards.append(total_rewards_ep)

        # mean and std of rewards
        mean_rewards = np.mean(episode_rewards)
        std_rewards = np.std(episode_rewards)

        return mean_rewards,std_rewards

    def record_video(self,save_path,fps=1):
        imgs = []
        done = False
        state = self.env.reset()
        img = self.env.render()
        imgs.append(img)
        while not done:
            action,_ = self.act(state)
            next_obs,reward,done = self.step(action)
            # advance the state, otherwise the same frame is replayed forever
            state = next_obs
            img = self.env.render()
            imgs.append(img)
        imageio.mimsave(save_path, [np.array(img) for img in imgs], fps=fps)
state_space = merged_array.shape[-1]
action_space = env.action_space.shape[0]
lr = 0.001
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gamma = 0.9
n_training_episodes = 2000
n_eval_episodes = 200
max_t = 10000

RoboticsAgent = Agent(
    env,
    state_space,
    action_space,
    lr,
    device,
    gamma,
    n_training_episodes,
    n_eval_episodes,
    max_t
)
RoboticsAgent.train()

Here is the error: [screenshot of the NaN error]. The whole notebook is here: https://github.com/accuracy-maker/robotics-tutorial/blob/main/Robotics_A2C.ipynb
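A minimal way to locate where the NaN first appears (a sketch, not part of the original notebook) is to enable PyTorch's anomaly detection and fail fast when the actor's outputs stop being finite; `check_finite` is a hypothetical helper and its placement inside `Agent.act()` is only a suggestion.

import torch

# with anomaly detection on, a backward pass that produces NaN raises an error
# whose traceback points at the forward operation responsible
torch.autograd.set_detect_anomaly(True)

def check_finite(name, tensor):
    """Fail fast if a tensor contains NaN or Inf values."""
    if not torch.isfinite(tensor).all():
        raise RuntimeError(f"{name} contains NaN/Inf: {tensor}")

# hypothetical placement inside Agent.act():
#   act_out = self.actor(state)
#   check_finite("actor output", act_out)
#   check_finite("log_std", act_out[:, self.action_space:])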

qgallouedec commented 11 months ago

I'd advise you to reduce your code as much as possible and monitor your agent's internal values to find out what's causing the problem. I don't think it's coming from panda-gym. I think it's coming from your agent. The logs say that std is nan, so it must be that log_std is diverging.
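One common way to follow that advice (a hedged sketch, not code from this thread) is to log the raw log_std at every step and clamp it to a bounded range before exponentiating, as many continuous-control implementations do. `sample_action`, `LOG_STD_MIN`, and `LOG_STD_MAX` are illustrative names, and the `(batch, 2 * action_dim)` head layout is assumed to match the Actor above.

import torch
from torch.distributions import Normal

LOG_STD_MIN, LOG_STD_MAX = -20.0, 2.0  # bounds commonly used for Gaussian policy heads

def sample_action(actor_out, action_dim, writer=None, step=None):
    """Split an actor head into mean / log_std, clamp log_std, and sample an action."""
    mean = actor_out[:, :action_dim]
    # clamping keeps exp(log_std) finite even if the head starts to drift
    log_std = actor_out[:, action_dim:].clamp(LOG_STD_MIN, LOG_STD_MAX)

    if writer is not None and step is not None:
        # logging the raw statistics makes a diverging log_std visible early
        writer.add_scalar("policy/log_std_mean", log_std.mean().item(), step)
        writer.add_scalar("policy/mean_abs", mean.abs().mean().item(), step)

    dist = Normal(mean, log_std.exp())
    action = dist.sample()
    # sum over action dimensions: one log-probability per sampled action
    log_prob = dist.log_prob(action).sum(dim=-1)
    return action, log_prob

Bounding log_std keeps `exp(log_std)` finite and the Normal distribution well defined even if the policy head drifts during early training.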