🐛 Bug
During training of a PPO agent with the Stable Baselines3 library and a custom Gym environment wrapper (GymDssatWrapper) for the DSSAT agriculture simulation, an error occurs when stepping through the environment (env.step(action)). Although the reward values returned by the environment were confirmed never to be None (they are always a float, and never 0), the following TypeError is raised: TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'.
The issue appeared after implementing a custom wrapper that normalizes actions and observations for the DSSAT environment. The bug manifests during the learn method of the PPO algorithm, specifically while rewards are being accumulated inside Stable Baselines3's Monitor wrapper, which implies that a None reward is somehow being processed, contradicting the verified behavior of the environment, where rewards are always valid floats.
The traceback shows the error occurring inside Stable Baselines3's collect_rollouts, at the point where actions are applied to the environment and rewards are collected. It also shows that GymDssatWrapper.step calls Monitor.step, i.e. make_vec_env has placed Monitor beneath the custom wrapper, so the problem may lie in how rewards are handled or reported back through the interaction between the custom Gym wrapper and the monitoring and vectorization utilities.
Further investigation is needed to identify the exact cause of this discrepancy and to resolve the TypeError, so that the PPO agent trains stably and correctly in this custom DSSAT environment setup.
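The failing statement is ep_rew = sum(self.rewards) in Monitor.step: Monitor appends one reward per step and sums the list at episode end, so a single None recorded anywhere in an episode is enough to raise exactly this error. A minimal sketch of the failure mode (illustrative values only):

rewards = [0.5, 1.2, None]  # one None reward recorded mid-episode
sum(rewards)  # TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'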
To Reproduce
import datetime
import os
import gym
import matplotlib.pyplot as plt
import gym_dssat_pdi
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import wandb
from codecarbon import EmissionsTracker
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_util import make_vec_env
from wrappers import GymDssatWrapper
from src.models.agents import NullAgent, ExpertAgent
def main():
init_folders()
tracker = init_tracker()
train_steps = 4000
eval_steps = 40
env_args = {
'mode': 'fertilization',
'seed': 123,
'random_weather': True,
}
env = init_env(env_args)
ppo_args = {
'gamma': 1,
'learning_rate': 0.0003,
'seed': 123,
}
run = init_run(ppo_args)
ppo_agent = train_agent(
env=env,
ppo_args=ppo_args,
train_steps=train_steps,
)
# evaluate_agents(
# env=env,
# run=run,
# tracker=tracker,
# ppo_agent=ppo_agent,
# eval_steps=eval_steps
# )
tracker.stop()
env.close()
def init_env(env_args):
# Create DSSAT env
# env = GymDssatWrapper(gym.make('GymDssatPdi-v0', **env_args))
env = make_vec_env('GymDssatPdi-v0', wrapper_class=GymDssatWrapper, env_kwargs=env_args, n_envs=4)
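    # NOTE: make_vec_env wraps each environment with Monitor *before* applying
    # wrapper_class, so each worker env is effectively
    # GymDssatWrapper(Monitor(gym.make('GymDssatPdi-v0', **env_args))),
    # which matches the wrapper order visible in the traceback below.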
# env = gym.wrappers.RecordEpisodeStatistics(env)
# env = VecNormalize(env, norm_obs=True, norm_reward=True)
return env
def init_run(ppo_args):
run = wandb.init(
entity='aqsone-lab',
project='LAB6-agridrl',
config=ppo_args,
sync_tensorboard=True,
save_code=True
)
return run
def init_tracker():
tracker = EmissionsTracker(log_level='warning')
tracker.start()
return tracker
# evaluation and plotting functions
def evaluate(agent, n_episodes=10):
# Create eval env
eval_args = {
'mode': 'fertilization',
'seed': 456,
'random_weather': True,
}
env = Monitor(GymDssatWrapper(gym.make('GymDssatPdi-v0', **eval_args)))
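    # Note: Monitor wraps GymDssatWrapper here, the reverse of the
    # Monitor-inside ordering that make_vec_env produces during training.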
returns, _ = evaluate_policy(agent, env, n_eval_episodes=n_episodes, return_episode_rewards=True)
return returns
def plot_results(labels, returns):
data_dict = {}
for label, data in zip(labels, returns):
data_dict[label] = data
df = pd.DataFrame(data_dict)
ax = sns.boxplot(data=df)
ax.set_xlabel('policy')
ax.set_ylabel('evaluation output')
plt.savefig('reports/figures/results_sb3.pdf')
print('\nThe result is saved in the reports/figures/ directory as "results_sb3.pdf"\n')
plt.show()
def train_agent(env, ppo_args, train_steps=400000):
# Create PPO agent
ppo_agent = PPO('MlpPolicy', env, device='cuda', tensorboard_log='models/runs/ppo', verbose=1, **ppo_args)
# with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
# ppo_agent.learn(total_timesteps=train_steps)
# print(prof.key_averages().table(sort_by="cuda_time_total"))
# # Train our agent
ppo_agent.learn(total_timesteps=train_steps)
# Define the current timestamp
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Save our agent
ppo_model_path = f'models/ppo_model_{timestamp}'
ppo_agent.save(ppo_model_path)
return ppo_agent
def evaluate_agents(env, run, tracker, ppo_agent, eval_steps):
# Evaluate agents
null_agent = NullAgent(env)
print('Evaluating Null agent...')
null_returns = evaluate(null_agent, n_episodes=eval_steps)
print('Done')
print('Evaluating PPO agent...')
ppo_returns = evaluate(ppo_agent, n_episodes=eval_steps)
print('Done')
expert_agent = ExpertAgent(env)
print('Evaluating Expert agent...')
expert_returns = evaluate(expert_agent, n_episodes=eval_steps)
print('Done')
# Display results
labels = ['null', 'ppo', 'expert']
returns = [null_returns, ppo_returns, expert_returns]
# Create DataFrame from returns
data_dict = {label: data for label, data in zip(labels, returns)}
df = pd.DataFrame(data_dict)
# Create boxplot using Seaborn
ax = sns.boxplot(data=df)
ax.set_xlabel('Policy')
ax.set_ylabel('Evaluation Output')
plt.savefig('reports/figures/results_sb3.pdf') # Save the boxplot as a PDF
plt.close() # Close the plot to prevent it from being displayed
# Convert Seaborn boxplot to Plotly
fig = go.Figure()
for i, label in enumerate(labels):
fig.add_trace(go.Box(y=df[label], name=label))
# Log Plotly figure
run.log({'rewards distributions': fig})
# Log other metrics
run.log({
'carbon_emission': tracker.flush(),
'project_type': 'lab',
'Project': 'B06-S04',
'techno_type': 'DS',
'techno': 'DRL',
'phase': 'training',
})
run.finish()
codecarbon_run = wandb.init(entity='aqsone-lab', project='CodeCarbon')
codecarbon_run.log({
'carbon_emission': tracker.flush(),
'project_type': 'lab',
'Project': 'B06-S04',
'techno_type': 'DS',
'techno': 'DRL',
'phase': 'training',
})
codecarbon_run.finish()
def init_folders():
# create models/runs/ppo
os.makedirs(os.path.join('models', 'runs'), exist_ok=True)
os.makedirs(os.path.join('models', 'runs', 'ppo'), exist_ok=True)
# create reports/figures
os.makedirs('reports', exist_ok=True)
os.makedirs(os.path.join('reports', 'figures'), exist_ok=True)
if __name__ == '__main__':
print(gym_dssat_pdi)
main()
import gym
import numpy as np
import logging
logging.basicConfig(level=logging.DEBUG)
class GymDssatWrapper(gym.Wrapper):
def __init__(self, env):
super(GymDssatWrapper, self).__init__(env)
# Assuming 'anfer' is a single-dimensional action space
self.action_low, self.action_high = self.get_action_space_bounds()
# Normalized action space to [-1, 1]
self.action_space = gym.spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
# Assuming the environment is reset here to fetch the initial observation's shape after transformation
obs_shape = self.transform_observation(env.reset()).shape
self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32)
env.reset() # Reset the environment again after initial observation to ensure proper start
def get_action_space_bounds(self):
box = self.env.action_space['anfer']
return box.low, box.high
def transform_observation(self, observation):
# Assuming observation is a dictionary and needs to be flattened into a numeric array
# Adapt this example based on the actual structure of your observations
if isinstance(observation, dict):
transformed_observation = np.concatenate([np.array(v).flatten() for v in observation.values()])
else:
transformed_observation = np.array(observation)
return transformed_observation
def normalize_action(self, action):
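        # Rescale the agent's action from the normalized [-1, 1] range back to
        # the environment's raw [action_low, action_high] fertilization range.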
normalized_action = ((action + 1) / 2) * (self.action_high - self.action_low) + self.action_low
return float(normalized_action) # Ensure the output is a scalar float
def step(self, action):
action = self.normalize_action(action)
obs, reward, done, info = self.env.step({'anfer': action})
transformed_obs = self.transform_observation(obs)
return transformed_obs, reward, done, info
def reset(self, **kwargs):
initial_obs = self.env.reset(**kwargs)
return self.transform_observation(initial_obs)
    def seed(self, seed=None):
        return self.env.seed(seed)
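To localize which layer first produces a None reward, a minimal probe wrapper along these lines could be inserted between any two layers of the stack (a debugging sketch only; RewardProbe is not part of the project code):

import gym
import logging

class RewardProbe(gym.Wrapper):
    """Hypothetical debugging helper: passes everything through unchanged
    and logs any step at which the wrapped layer returns a None reward."""

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        if reward is None:
            logging.error('None reward from %s (done=%s, info=%s)',
                          type(self.env).__name__, done, info)
        return obs, reward, done, info

Wrapping the raw environment (RewardProbe(gym.make('GymDssatPdi-v0', **env_args))) and, separately, the monitored environment would show whether the None originates in the simulator itself or is introduced by one of the wrapper layers.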
Relevant log output / Error message
Traceback (most recent call last):
File "/home/louisreberga/LAB6-agridrl/src/models/ppo-sb3/train_model.py", line 215, in <module>
main()
File "/home/louisreberga/LAB6-agridrl/src/models/ppo-sb3/train_model.py", line 43, in main
ppo_agent = train_agent(
File "/home/louisreberga/LAB6-agridrl/src/models/ppo-sb3/train_model.py", line 128, in train_agent
ppo_agent.learn(total_timesteps=train_steps)
File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/ppo/ppo.py", line 310, in learn
return super().learn(
File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/on_policy_algorithm.py", line 247, in learn
continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/on_policy_algorithm.py", line 175, in collect_rollouts
new_obs, rewards, dones, infos = env.step(clipped_actions)
File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/vec_env/base_vec_env.py", line 162, in step
return self.step_wait()
File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/vec_env/dummy_vec_env.py", line 43, in step_wait
obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] = self.envs[env_idx].step(
File "/home/louisreberga/LAB6-agridrl/src/models/ppo-sb3/wrappers.py", line 42, in step
obs, reward, done, info = self.env.step({'anfer': action})
File "/opt/gym_dssat_pdi/lib/python3.8/site-packages/stable_baselines3/common/monitor.py", line 94, in step
ep_rew = sum(self.rewards)
TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'
System Info
No response
Checklist
[X] My issue does not relate to a custom gym environment. (Use the custom gym env template instead)
[X] I have checked that there is no similar issue in the repo