DLR-RM / stable-baselines3

PyTorch version of Stable Baselines, reliable implementations of reinforcement learning algorithms.
https://stable-baselines3.readthedocs.io
MIT License

[Question] Magnitude of action values suddenly changes after message from stable-baselines3 #1820

Closed PBerit closed 8 months ago

PBerit commented 8 months ago

❓ Question

Hi all,

I have the following code of a gymnasium environment in combination with stable-baselines 3:

import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Tuple, MultiDiscrete

import numpy as np
import pandas as pd

from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3 import A2C
import stable_baselines3 as sb3

import os

#Define parameters of the battery model BYD B-BOX 10.0
battery_capacity = 10      # Unit: [kWh]
charging_efficiency = 95   # Unit: [%]
maximum_charging_power = 10000 #Unit: [W]
pv_peak_power = 2.5 #Unit: [kW]
time_resolution = 15 * 60  #Unit: [s]

print_information_during_training = True

class RL_Env(Env):
    def __init__(self):
        # Read the CSV file into a DataFrame
        file_path = 'Data_Training_Exercise_6_ESHL_May_2020.csv'  # Replace with the actual file path
        df = pd.read_csv(file_path, sep=';')

        # Extract the values from the DataFrame into an array with the dimensionality (31,96) for the second (pv_generation) and third column (electricity consumption)
        self.pv_generation_data = df.iloc[:, 1].values.reshape((31, 96))
        self.electricity_consumption_data = df.iloc[:, 2].values.reshape((31, 96))

        # Create a Box for the action space
        self.action_space = gym.spaces.Box(low=-1 * maximum_charging_power, high=maximum_charging_power, shape=(1,))

        # Define observation space
        low = np.array([0, 0, 0], dtype=np.float64)
        high = np.array([3500, 3500, 1], dtype=np.float64)
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float64)

        self.battery_state_of_charge = 0

        self.index_current_day = 0
        self.index_current_time_slot_of_the_day = 0

    #Reset the environment
    def reset(self, **kwargs):

        #Call the super method
        super().reset(**kwargs)

        self.battery_state_of_charge = 0

        #Choose a random day from the training data
        random_integer_day_index = self.np_random.integers(0, 20)
        self.index_current_day = random_integer_day_index

        #Reset the index for the time slot counter of the day
        self.index_current_time_slot_of_the_day = 0

        observation = np.array([self.electricity_consumption_data[self.index_current_day, self.index_current_time_slot_of_the_day], self.pv_generation_data[self.index_current_day, self.index_current_time_slot_of_the_day], self.battery_state_of_charge])

        info = {}

        return observation, info

    def render(self):
        pass

    def step(self, action):

        # Execute the action
        action_battery_charging = action[0]
        help_value_battery_charging_before_adjustment = action_battery_charging

        #Adjust the action due to technical constraint: not enough energy in the battery for discharging with the chosen action
        if action_battery_charging * time_resolution < ((-1) * self.battery_state_of_charge * battery_capacity):
            action_battery_charging = ((-1) * self.battery_state_of_charge * battery_capacity) / time_resolution

        # Adjust the action due to technical constraint: not enough pv generated for charging with the chosen action
        if action_battery_charging > self.pv_generation_data [self.index_current_day, self.index_current_time_slot_of_the_day]:
            action_battery_charging = self.pv_generation_data [self.index_current_day, self.index_current_time_slot_of_the_day]

        self.battery_state_of_charge = self.battery_state_of_charge + (action_battery_charging * time_resolution * charging_efficiency) / (battery_capacity * 3600000)

        energy_balance =  self.electricity_consumption_data [self.index_current_day, self.index_current_time_slot_of_the_day] + action_battery_charging - self.pv_generation_data [self.index_current_day, self.index_current_time_slot_of_the_day]

        required_power_from_the_grid = energy_balance

        if required_power_from_the_grid < 0:
            required_power_from_the_grid = 0

        # calculate reward
        reward = 0
        if energy_balance < 0:
            reward = -1 * energy_balance
        if energy_balance >= 0:
            reward = energy_balance

        #Define the observation
        observation = np.array([self.electricity_consumption_data [self.index_current_day, self.index_current_time_slot_of_the_day], self.pv_generation_data [self.index_current_day, self.index_current_time_slot_of_the_day], self.battery_state_of_charge])

        #Update index counters
        self.index_current_time_slot_of_the_day = self.index_current_time_slot_of_the_day + 1

        #Check end of the day
        if self.index_current_time_slot_of_the_day >= 96 - 1:
            terminated = True
            truncated = True
            self.index_current_time_slot_of_the_day = 0
        else:
            terminated = False
            truncated = False

        #Print information
        if print_information_during_training == True:
            print(f"index_current_time_slot_of_the_day: {self.index_current_time_slot_of_the_day}")
            print(f"index_current_day : {self.index_current_day}")
            print(f"electricity_consumption: {round(self.electricity_consumption_data [self.index_current_day, self.index_current_time_slot_of_the_day],1)}")
            print(f"pv_generation_data: {round(self.pv_generation_data [self.index_current_day, self.index_current_time_slot_of_the_day],1)}")
            print(f"battery_state_of_charge: {round(self.battery_state_of_charge,2)}")
            print(f"action_battery_charging_before_adjustment: {np.round(help_value_battery_charging_before_adjustment, 1)}")
            print(f"action_battery_charging: {np.round(action_battery_charging, 1)}")
            print(f"energy_balance: {round(energy_balance,1)}")
            print(f"reward: {round(reward,1)}")
            print("")

        info = {}

        return observation, reward, terminated, truncated, info

#Create gymnasium environment
gym.register("battery-env-v0", lambda: RL_Env())
env = gym.make("battery-env-v0")

#Check the environment
check_environment = True
if check_environment == True:
    #from gymnasium.utils.env_checker import check_env
    #check_env(env.unwrapped)
    from stable_baselines3.common.env_checker import check_env
    check_env(env)

#Define the model directory (PPO, A2C, TD3, DQN)
string_run_name = "test_1"
models_dir = "Trained_RL_Models/" + string_run_name +  "_A2C"
logdir = "Trained_RL_Models/" + string_run_name +  "_A2C"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
if not os.path.exists(logdir):
    os.makedirs(logdir)

#Define the model directory (PPO, A2C, TD3, DQN)
model = A2C('MlpPolicy', env, verbose=1, learning_rate= 0.0003, ent_coef= 0.2) #Default values: ent_coef= 0.0, learning_rate= 0.0003

#train and save the model
model.learn(total_timesteps=100)
model.save(os.path.join(models_dir, 'trained_A2C_model'))

The action space ranges from -10000 to 10000. I print the output during training, and in the first few iterations the magnitudes of the actions cover the full range. Then suddenly I get a strange message from stable-baselines3:

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

And afterwards the magnitude of the action values becomes extremely small (between -2 and 2), which does not make sense for my environment. Can someone explain why stable-baselines3 prints these messages and then suddenly changes the magnitude of the actions? And of course, I would like to know how to stop stable-baselines3 from doing that.


araffin commented 8 months ago

https://stable-baselines3.readthedocs.io/en/master/guide/rl_tips.html#tips-and-tricks-when-creating-a-custom-environment -> Why should I normalize the action space?

also in https://youtu.be/Ikngt0_DXJg?si=VBohQYo0nnpBT9vP&t=780

Please use the env checker and mind its warnings, and please use the "custom gym env" issue template next time.

Duplicate of https://github.com/hill-a/stable-baselines/issues/473 and others

For the behavior, you probably used SAC or another off-policy algorithm; they use uniform sampling to warm start the replay buffer.
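For illustration, a minimal sketch of the normalization advice linked above: keep the agent-facing Box in [-1, 1] and rescale to physical units inside step(). Only maximum_charging_power is taken from the code in the question; the rest of this environment body is a placeholder.

import gymnasium as gym
import numpy as np

maximum_charging_power = 10000  # Unit: [W], from the code in the question


class NormalizedBatteryEnv(gym.Env):
    """Toy sketch: the agent acts in [-1, 1]; step() rescales to watts."""

    def __init__(self):
        super().__init__()
        # Symmetric, normalized action space as recommended by the env checker
        self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(3,), dtype=np.float64)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        return np.zeros(3, dtype=np.float64), {}

    def step(self, action):
        # Rescale the normalized action back to the physical range before using it
        charging_power_w = float(action[0]) * maximum_charging_power
        observation = np.zeros(3, dtype=np.float64)
        reward = 0.0  # placeholder
        return observation, reward, False, False, {}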

PBerit commented 8 months ago

@araffin: Thanks for your answer. Unfortunately, I have to admit that I don't see how it answers my question. The core of my question is why stable-baselines3 prints those messages ("Using cpu device", "Wrapping the env with a Monitor wrapper", "Wrapping the env in a DummyVecEnv.") and why the algorithm uses very small actions afterwards. I am using the environment checker of both stable-baselines3 and gymnasium, and I don't get a real warning, just " warnings.warn(".

araffin commented 8 months ago

The core of my question is why stable-baselines3 prints those messages

These messages are printed at the very beginning of training to give some info about what is happening and whether PyTorch is using the GPU; you can remove them by setting verbose=0.

https://github.com/DLR-RM/stable-baselines3/blob/620e58e61f649d0f415b7796386d6fe405778026/stable_baselines3/common/base_class.py#L128-L129

and

https://github.com/DLR-RM/stable-baselines3/blob/620e58e61f649d0f415b7796386d6fe405778026/stable_baselines3/common/base_class.py#L169
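As a small sketch (reusing env and the hyperparameters from the code in the question), setting verbose=0 suppresses those startup lines; the Monitor/DummyVecEnv wrapping itself still happens:

from stable_baselines3 import A2C

# verbose=0 suppresses "Using cpu device" and the "Wrapping the env ..." lines;
# the env is still wrapped in Monitor and DummyVecEnv internally.
model = A2C('MlpPolicy', env, verbose=0, learning_rate=0.0003, ent_coef=0.2)
model.learn(total_timesteps=100)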

I don't get a real warning, just " warnings.warn("

Could you provide a minimal example to reproduce that? Is that really all you got in the output?

(we have tests to check those warnings: https://github.com/DLR-RM/stable-baselines3/blob/620e58e61f649d0f415b7796386d6fe405778026/tests/test_envs.py#L151-L166 )
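One way to surface the full warning text, as a sketch assuming the env object from the code in the question:

import warnings

from stable_baselines3.common.env_checker import check_env

# Record the env checker warnings instead of relying on truncated console output
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    check_env(env)

for w in caught:
    print(f"{w.category.__name__}: {w.message}")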

PBerit commented 8 months ago

@araffin: Thanks for your answer. Actually, when reducing the magnitude of the action space to a Box ranging from -1 to 1, the problem does not occur anymore.

Still, it is really strange that with self.action_space = gym.spaces.Box(low=-1 * maximum_charging_power, high=maximum_charging_power, shape=(1,)) the magnitude of the actions changes right after stable-baselines3 prints those messages about the environment. Here is an exemplary output:

2024-01-31 10:25:25.338805: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2024-01-31 10:25:25.338999: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
C:\Users\wi9632\Anaconda3\lib\site-packages\stable_baselines3\common\env_checker.py:441: UserWarning: We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) cf. https://stable-baselines3.readthedocs.io/en/master/guide/rl_tips.html
  warnings.warn(
index_current_time_slot_of_the_day: 1
action_battery_charging_before_adjustment: 6882.5
action_battery_charging: 0.3

index_current_time_slot_of_the_day: 2
action_battery_charging_before_adjustment: -3526.800048828125
action_battery_charging: 0.0

index_current_time_slot_of_the_day: 3
action_battery_charging_before_adjustment: 4588.2998046875
action_battery_charging: 0.3

index_current_time_slot_of_the_day: 4
action_battery_charging_before_adjustment: 4089.10009765625
action_battery_charging: 0.3

index_current_time_slot_of_the_day: 5
action_battery_charging_before_adjustment: 6322.5
action_battery_charging: 0.4

index_current_time_slot_of_the_day: 6
action_battery_charging_before_adjustment: -5229.60009765625
action_battery_charging: -0.0

index_current_time_slot_of_the_day: 7
action_battery_charging_before_adjustment: 5790.2001953125
action_battery_charging: 0.4

index_current_time_slot_of_the_day: 8
action_battery_charging_before_adjustment: 965.2000122070312
action_battery_charging: 0.4

index_current_time_slot_of_the_day: 9
action_battery_charging_before_adjustment: -6037.2998046875
action_battery_charging: -0.0

index_current_time_slot_of_the_day: 10
action_battery_charging_before_adjustment: 5256.39990234375
action_battery_charging: 0.3

index_current_time_slot_of_the_day: 11
action_battery_charging_before_adjustment: 5291.0
action_battery_charging: 0.4

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
index_current_time_slot_of_the_day: 12
action_battery_charging_before_adjustment: -0.6000000238418579
action_battery_charging: 0.0

index_current_time_slot_of_the_day: 13
action_battery_charging_before_adjustment: 0.5
action_battery_charging: 0.4

index_current_time_slot_of_the_day: 14
action_battery_charging_before_adjustment: -0.10000000149011612
action_battery_charging: -0.0

index_current_time_slot_of_the_day: 15
action_battery_charging_before_adjustment: 0.800000011920929
action_battery_charging: 0.4

index_current_time_slot_of_the_day: 16
action_battery_charging_before_adjustment: -0.5
action_battery_charging: -0.0

index_current_time_slot_of_the_day: 17
action_battery_charging_before_adjustment: 0.5
action_battery_charging: 0.3

index_current_time_slot_of_the_day: 18
action_battery_charging_before_adjustment: 1.600000023841858
action_battery_charging: 0.4

But I now see that there is in fact a full warning regarding the action space. However, I still can't explain why the magnitudes change drastically after the output of the messages.

araffin commented 8 months ago

But I now see that there is in fact a full warning regarding the action space. However, I still can't explain why the magnitudes change drastically after the output of the messages.

There are two different things there: the env checker, which samples the action space uniformly, and then the agent training (after the messages).
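A rough illustration of that difference, assuming the env and model objects from the code in the question: the env checker draws uniformly from the Box, so its actions span the full [-10000, 10000] range, while the untrained A2C policy samples from a Gaussian centred near 0, which is roughly where the [-2, 2] values come from.

import numpy as np

# Uniform samples from the action space (what the env checker uses)
uniform_actions = [env.action_space.sample()[0] for _ in range(5)]
print(np.round(uniform_actions, 1))  # values anywhere in [-10000, 10000]

# Actions from the (untrained) policy, as used once model.learn() starts
obs, _ = env.reset()
policy_action, _ = model.predict(obs, deterministic=False)
print(np.round(policy_action, 1))  # small values around 0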