DLR-RM / stable-baselines3

PyTorch version of Stable Baselines, reliable implementations of reinforcement learning algorithms.
https://stable-baselines3.readthedocs.io
MIT License

[Bug]: unsupported operand for +: 'float' and 'NoneType' during PPO Training with Custom DSSAT Gym Wrapper #1865

Closed: louisreberga closed this issue 6 months ago

louisreberga commented 7 months ago

🐛 Bug

During training of a PPO agent with the Stable Baselines3 library and a custom Gym environment wrapper (GymDssatWrapper) for the DSSAT agriculture simulation, an error occurs while stepping through the environment (env.step(action)). Although the reward values returned by the environment were verified to never be None (they are always a float, and never 0), the following error is raised: TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'.

The issue appeared after introducing a custom wrapper that normalizes actions and observations for the DSSAT environment. The error is raised during the learn call of the PPO algorithm, specifically while the Stable Baselines3 Monitor wrapper accumulates episode rewards, which suggests that a None reward is being processed somewhere, contradicting the verified behavior of the environment, where rewards are always valid floats.

The traceback shows the error occurring inside the Stable Baselines3 collect_rollouts method, when actions are applied to the environment and rewards are collected. It also shows GymDssatWrapper.step calling into Monitor.step, meaning that with make_vec_env the Monitor sits inside the custom wrapper and therefore accumulates the raw environment's rewards before the wrapper ever sees them. The issue thus seems to lie in the interaction between the custom Gym wrapper and the monitoring or vectorization utilities provided by Stable Baselines3, in particular in how rewards are passed back through these layers.

Further investigation is needed to identify the exact cause of this discrepancy and to resolve the TypeError so that the PPO agent trains stably and correctly in this custom DSSAT environment setup.
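
Since the failure happens when the Monitor sums the per-step rewards it has stored, at least one stored reward must be None even though the values observed at the wrapper level are always valid floats. Below is a minimal diagnostic wrapper (a sketch, not part of the training code; it assumes, unconfirmed, that the raw GymDssatPdi-v0 environment may return a None reward, e.g. at the terminal step) that could be placed directly around the raw environment to log the step at which a None appears:

import gym
import logging

class RewardGuard(gym.Wrapper):
    """Diagnostic-only wrapper: logs and neutralizes None rewards."""

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        if reward is None:
            # Surface when/where a None reward is produced by the raw env
            logging.warning('None reward received (done=%s, info=%s)', done, info)
            reward = 0.0  # placeholder so a downstream Monitor can sum the episode
        return obs, reward, done, info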

To Reproduce

import datetime
import os

import gym
import matplotlib.pyplot as plt

import gym_dssat_pdi
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import wandb
from codecarbon import EmissionsTracker
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_util import make_vec_env

from wrappers import GymDssatWrapper
from src.models.agents import NullAgent, ExpertAgent

def main():
    init_folders()
    tracker = init_tracker()

    train_steps = 4000
    eval_steps = 40

    env_args = {
        'mode': 'fertilization',
        'seed': 123,
        'random_weather': True,
    }
    env = init_env(env_args)

    ppo_args = {
        'gamma': 1,
        'learning_rate': 0.0003,
        'seed': 123,
    }
    run = init_run(ppo_args)

    ppo_agent = train_agent(
        env=env,
        ppo_args=ppo_args,
        train_steps=train_steps,
    )

    # evaluate_agents(
    #     env=env,
    #     run=run,
    #     tracker=tracker,
    #     ppo_agent=ppo_agent,
    #     eval_steps=eval_steps
    # )

    tracker.stop()
    env.close()

def init_env(env_args):
    # Create DSSAT env
    # env = GymDssatWrapper(gym.make('GymDssatPdi-v0', **env_args))
    env = make_vec_env('GymDssatPdi-v0', wrapper_class=GymDssatWrapper, env_kwargs=env_args, n_envs=4)
    # env = gym.wrappers.RecordEpisodeStatistics(env)
    # env = VecNormalize(env, norm_obs=True, norm_reward=True)

    return env

def init_run(ppo_args):
    run = wandb.init(
        entity='aqsone-lab',
        project='LAB6-agridrl',
        config=ppo_args,
        sync_tensorboard=True,
        save_code=True
    )

    return run

def init_tracker():
    tracker = EmissionsTracker(log_level='warning')
    tracker.start()

    return tracker

# evaluation and plotting functions
def evaluate(agent, n_episodes=10):
    # Create eval env
    eval_args = {
        'mode': 'fertilization',
        'seed': 456,
        'random_weather': True,
    }

    env = Monitor(GymDssatWrapper(gym.make('GymDssatPdi-v0', **eval_args)))
    returns, _ = evaluate_policy(agent, env, n_eval_episodes=n_episodes, return_episode_rewards=True)

    return returns

def plot_results(labels, returns):
    data_dict = {}
    for label, data in zip(labels, returns):
        data_dict[label] = data
    df = pd.DataFrame(data_dict)

    ax = sns.boxplot(data=df)
    ax.set_xlabel('policy')
    ax.set_ylabel('evaluation output')
    plt.savefig('reports/figures/results_sb3.pdf')
    print('\nThe result is saved in the reports/figures/ directory as "results_sb3.pdf"\n')
    plt.show()

def train_agent(env, ppo_args, train_steps=400000):
    # Create PPO agent
    ppo_agent = PPO('MlpPolicy', env, device='cuda', tensorboard_log='models/runs/ppo', verbose=1, **ppo_args)

    # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    #     ppo_agent.learn(total_timesteps=train_steps)
    #     print(prof.key_averages().table(sort_by="cuda_time_total"))

    # Train our agent
    ppo_agent.learn(total_timesteps=train_steps)

    # Define the current timestamp
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    # Save our agent
    ppo_model_path = f'models/ppo_model_{timestamp}'
    ppo_agent.save(ppo_model_path)

    return ppo_agent

def evaluate_agents(env, run, tracker, ppo_agent, eval_steps):
    # Evaluate agents
    null_agent = NullAgent(env)
    print('Evaluating Null agent...')
    null_returns = evaluate(null_agent, n_episodes=eval_steps)
    print('Done')

    print('Evaluating PPO agent...')
    ppo_returns = evaluate(ppo_agent, n_episodes=eval_steps)
    print('Done')

    expert_agent = ExpertAgent(env)
    print('Evaluating Expert agent...')
    expert_returns = evaluate(expert_agent, n_episodes=eval_steps)
    print('Done')

    # Display results
    labels = ['null', 'ppo', 'expert']
    returns = [null_returns, ppo_returns, expert_returns]

    # Create DataFrame from returns
    data_dict = {label: data for label, data in zip(labels, returns)}
    df = pd.DataFrame(data_dict)

    # Create boxplot using Seaborn
    ax = sns.boxplot(data=df)
    ax.set_xlabel('Policy')
    ax.set_ylabel('Evaluation Output')
    plt.savefig('reports/figures/results_sb3.pdf')  # Save the boxplot as a PDF
    plt.close()  # Close the plot to prevent it from being displayed

    # Convert Seaborn boxplot to Plotly
    fig = go.Figure()
    for i, label in enumerate(labels):
        fig.add_trace(go.Box(y=df[label], name=label))

    # Log Plotly figure
    run.log({'rewards distributions': fig})

    # Log other metrics
    run.log({
        'carbon_emission': tracker.flush(),
        'project_type': 'lab',
        'Project': 'B06-S04',
        'techno_type': 'DS',
        'techno': 'DRL',
        'phase': 'training',
    })
    run.finish()

    codecarbon_run = wandb.init(entity='aqsone-lab', project='CodeCarbon')

    codecarbon_run.log({
        'carbon_emission': tracker.flush(),
        'project_type': 'lab',
        'Project': 'B06-S04',
        'techno_type': 'DS',
        'techno': 'DRL',
        'phase': 'training',
    })
    codecarbon_run.finish()

def init_folders():
    # create models/runs/ppo
    os.makedirs(os.path.join('models', 'runs'), exist_ok=True)
    os.makedirs(os.path.join('models', 'runs', 'ppo'), exist_ok=True)

    # create reports/figures
    os.makedirs('reports', exist_ok=True)
    os.makedirs(os.path.join('reports', 'figures'), exist_ok=True)

if __name__ == '__main__':
    print(gym_dssat_pdi)
    main()

import gym
import numpy as np
import logging

logging.basicConfig(level=logging.DEBUG)

class GymDssatWrapper(gym.Wrapper):
    def __init__(self, env):
        super(GymDssatWrapper, self).__init__(env)

        # Assuming 'anfer' is a single-dimensional action space
        self.action_low, self.action_high = self.get_action_space_bounds()

        # Normalized action space to [-1, 1]
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

        # Reset once to infer the shape of the transformed observation
        obs_shape = self.transform_observation(env.reset()).shape
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32)
        env.reset()  # Reset again so training starts from a fresh episode

    def get_action_space_bounds(self):
        box = self.env.action_space['anfer']
        return box.low, box.high

    def transform_observation(self, observation):
        # Assuming observation is a dictionary and needs to be flattened into a numeric array
        # Adapt this example based on the actual structure of your observations
        if isinstance(observation, dict):
            transformed_observation = np.concatenate([np.array(v).flatten() for v in observation.values()])
        else:
            transformed_observation = np.array(observation)
        return transformed_observation

    def normalize_action(self, action):
        normalized_action = ((action + 1) / 2) * (self.action_high - self.action_low) + self.action_low
        return float(normalized_action)  # Ensure the output is a scalar float

    def step(self, action):
        action = self.normalize_action(action)
        obs, reward, done, info = self.env.step({'anfer': action})
        transformed_obs = self.transform_observation(obs)

        return transformed_obs, reward, done, info

    def reset(self, **kwargs):
        initial_obs = self.env.reset(**kwargs)
        return self.transform_observation(initial_obs)

    def seed(self, seed=None):
        self.env.seed(seed)
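
As a point of comparison, the evaluation environment above is built as Monitor(GymDssatWrapper(gym.make(...))), i.e. with the Monitor outermost, whereas the traceback below shows GymDssatWrapper.step calling into monitor.py, so in the make_vec_env setup the Monitor sits inside the custom wrapper and records the raw environment's rewards directly. The following is a sketch (untested with gym-dssat) of building the vectorized training environment with the same wrapping order as the evaluation environment, so that rewards only reach the Monitor after passing through the wrapper:

import gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

from wrappers import GymDssatWrapper

def make_monitored_env(env_args):
    # Factory for DummyVecEnv: Monitor outermost, matching the evaluation setup
    def _init():
        return Monitor(GymDssatWrapper(gym.make('GymDssatPdi-v0', **env_args)))
    return _init

# usage: env = DummyVecEnv([make_monitored_env(env_args) for _ in range(4)])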

Relevant log output / Error message

Traceback (most recent call last):
  File "/home/louisreberga/LAB6-agridrl/src/models/ppo-sb3/train_model.py", line 215, in <module>
    main()
  File "/home/louisreberga/LAB6-agridrl/src/models/ppo-sb3/train_model.py", line 43, in main
    ppo_agent = train_agent(
  File "/home/louisreberga/LAB6-agridrl/src/models/ppo-sb3/train_model.py", line 128, in train_agent
    ppo_agent.learn(total_timesteps=train_steps)
  File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/ppo/ppo.py", line 310, in learn
    return super().learn(
  File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/on_policy_algorithm.py", line 247, in learn
    continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
  File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/on_policy_algorithm.py", line 175, in collect_rollouts
    new_obs, rewards, dones, infos = env.step(clipped_actions)
  File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/vec_env/base_vec_env.py", line 162, in step
    return self.step_wait()
  File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/vec_env/dummy_vec_env.py", line 43, in step_wait
    obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] = self.envs[env_idx].step(
  File "/home/louisreberga/LAB6-agridrl/src/models/ppo-sb3/wrappers.py", line 42, in step
    obs, reward, done, info = self.env.step({'anfer': action})
  File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/monitor.py", line 94, in step
    ep_rew = sum(self.rewards)
TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'

System Info

No response

Checklist

araffin commented 6 months ago

> If code there is, it is minimal and working

Closing because the minimum requirements for seeking help are not met. You should also use the env checker.
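
For reference, the checker mentioned here is stable_baselines3.common.env_checker.check_env; a minimal usage sketch with the wrapper and env_args from the report above:

import gym
from stable_baselines3.common.env_checker import check_env

from wrappers import GymDssatWrapper

env_args = {'mode': 'fertilization', 'seed': 123, 'random_weather': True}
env = GymDssatWrapper(gym.make('GymDssatPdi-v0', **env_args))
check_env(env)  # warns or raises if the wrapper violates the Gym API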