Grid2op / grid2op

Grid2Op is a testbed platform to model sequential decision making in power systems.
https://grid2op.readthedocs.io/
Mozilla Public License 2.0

Updating multi agent training script #535

Closed AvisP closed 9 months ago

AvisP commented 1 year ago

os: Mac OS X
grid2op: dev_multiagent
ray[rllib]: 2.6.3

I have been trying to get the first script in ray_multiagent.py working with the latest version of the ray library. After making some changes, I am now stuck on the line where the agent is created, agent = PPO(**{"config":config, "env":SELECT_ENV(env_for_cls)}). The new wrapper class initializes properly with ray_ma_env = MAEnvWrapper(env_for_cls), but the agent creation raises the following error:

Exception has occurred: ValueError       (note: full exception trace is shown but execution is paused at: _run_module_as_main)
<__main__.MAEnvWrapper object at 0x2a69cf050> is an invalid env specifier. You can specify a custom env as either a class (e.g., YourEnvCls) or a registered env id (e.g., "your_env").
  File "/...../lib/python3.11/site-packages/ray/rllib/algorithms/algorithm.py", line 2360, in _get_env_id_and_creator
    raise ValueError(
  File "/...../lib/python3.11/site-packages/ray/rllib/algorithms/algorithm.py", line 443, in __init__
    self._env_id, self.env_creator = self._get_env_id_and_creator(
                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/...../RLLIB_MultiAgent/train_multi_agent.py", line 196, in <module>
    agent = PPO(**{"config":config,
            ^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.11/3.11.4_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/opt/homebrew/Cellar/python@3.11/3.11.4_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/runpy.py", line 198, in _run_module_as_main (Current frame)
    return _run_code(code, main_globals, None,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: <__main__.MAEnvWrapper object at 0x2a69cf050> is an invalid env specifier. You can specify a custom env as either a class (e.g., YourEnvCls) or a registered env id (e.g., "your_env").
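
If I read the error correctly, RLlib only accepts the environment class itself or an env id registered with ray tune, not an already-constructed instance. A minimal sketch of the two accepted forms (assuming a wrapper whose constructor takes only an env_config dict, which is not yet what my script below does) would be something like:

from ray.tune.registry import register_env
from ray.rllib.algorithms.ppo import PPOConfig

# option 1: pass the class itself; RLlib instantiates it with the env_config given here
config = PPOConfig().environment(env=MAEnvWrapper, env_config={})

# option 2: register a creator function under a name ("ma_grid2op" is arbitrary) and use that name
register_env("ma_grid2op", lambda env_config: MAEnvWrapper(env_config))
config = PPOConfig().environment(env="ma_grid2op", env_config={})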

The updated script is

"""example with centralized observation and local actions"""
import warnings
import numpy as np

from gymnasium.spaces import Discrete, Box

from grid2op.multi_agent.multiAgentEnv import MultiAgentEnv as MAEnv
from ray.rllib.policy.policy import PolicySpec, Policy

import grid2op
from grid2op.Action.PlayableAction import PlayableAction
from grid2op.multi_agent.multiAgentEnv import MultiAgentEnv
from grid2op.gym_compat import GymEnv, BoxGymObsSpace, DiscreteActSpace

from lightsim2grid import LightSimBackend

ENV_NAME = "l2rpn_case14_sandbox"
DO_NOTHING_EPISODES = -1  # 200

ACTION_DOMAINS = {
        'agent_0' : [0, 1, 2, 3, 4],
        'agent_1' : [5, 6, 7, 8, 9, 10, 11, 12, 13]
    }

env_for_cls = grid2op.make(ENV_NAME,
                           action_class=PlayableAction,
                           backend=LightSimBackend())
# ma_env_for_cls = MultiAgentEnv(env_for_cls, ACTION_DOMAINS)

# wrapper for gym env
class MAEnvWrapper(MAEnv):
    def __init__(self, env, env_config=None):
        super().__init__(env, ACTION_DOMAINS)

        # env = grid2op.make(ENV_NAME,
        #                    action_class=PlayableAction,
        #                    backend=LightSimBackend())  

        self.ma_env = MultiAgentEnv(env, ACTION_DOMAINS)
        self._agent_ids = set(self.ma_env.agents)
        self.ma_env.seed(0)
        self._agent_ids = self.ma_env.agents

        # see the grid2op doc on how to customize the observation space
        # with the grid2op / gym interface.
        self._gym_env = GymEnv(env)
        self._gym_env.observation_space.close()
        self._gym_env.observation_space = BoxGymObsSpace(env.observation_space,
                                                         attr_to_keep=["gen_p",
                                                                       "rho"],
                                                         replace_nan_by_0=True  # replace Nan by 0.
                                                         )

        # we did not experiment yet with the "partially observable" setting
        # so for now we suppose all agents see the same observation
        # which is the full grid                                
        self.observation_space = Box(shape=self._gym_env.observation_space.shape,
                                     high=self._gym_env.observation_space.high,
                                     low=self._gym_env.observation_space.low,
                                     dtype=np.float32
                                     )

        # we represent the action as discrete action for now. 
        # It should work to encode then differently using the 
        # gym_compat module for example
        self._conv_action_space = {
            agent_id : DiscreteActSpace(self.ma_env.action_spaces[agent_id])
            for agent_id in self.ma_env.agents
        }

        # to avoid "weird" pickle issues
        self.action_space = {
            agent_id : Discrete(n=self.ma_env.action_spaces[agent_id].n)
            for agent_id in self.ma_env.agents
        }

    def reset(self):
        # reset the underlying multi agent environment
        obs = self.ma_env.reset()

        return self._format_obs(obs)

    def _format_obs(self, grid2op_obs):
        # NB: here we rely heavily on the fact that all agents see the same things
        # grid2op_obs is a dictionary representing a "multi agent grid2op observation"

        # convert the observation to a gym one (remember we suppose all agents see
        # all the grid)
        gym_obs = self._gym_env.observation_space.to_gym(grid2op_obs[next(iter(self.ma_env.agents))])

        # return the proper dictionary
        return {
            agent_id : gym_obs.copy()
            for agent_id in self.ma_env.agents
        }

    def step(self, actions):       
        # convert the action to grid2op
        if actions:
            grid2op_act = {
                agent_id : self._conv_action_space[agent_id].from_gym(actions[agent_id])
                for agent_id in self.ma_env.agents
            }
        else:
            grid2op_act = {
                agent_id : self._conv_action_space[agent_id].from_gym(0)
                for agent_id in self.ma_env.agents
            }

        # just to retrieve the first agent id...
        first_agent_id = next(iter(self.ma_env.agents))

        # do a step in the underlying multi agent environment
        obs, r, done, info = self.ma_env.step(grid2op_act)

        # all agents have the same flag "done"
        done['__all__'] = done[first_agent_id]

        # now retrieve the observation in the proper form
        gym_obs =  self._format_obs(obs)

        # ignored for now
        info = {}
        return gym_obs, r, done, info

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    return agent_id

if __name__ == "__main__":
    import ray
    from ray.rllib.algorithms.ppo import PPO, PPOConfig
    import json
    import os
    import shutil

    ray_ma_env = MAEnvWrapper(env_for_cls)

    checkpoint_root = "./ma_ppo_test"

    # Where checkpoints are written:
    shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None)

    # Where some data will be written and used by Tensorboard below:
    ray_results = f'{os.getenv("HOME")}/ray_results/'
    shutil.rmtree(ray_results, ignore_errors=True, onerror=None)

    info = ray.init(ignore_reinit_error=True)
    print("Dashboard URL: http://{}".format(info.address_info["webui_url"]))

    #Configs (see ray's doc for more information)
    SELECT_ENV = MAEnvWrapper                            # Environment (wrapper) class used for training
    N_ITER = 1000                                     # Number of training runs.

    config = PPOConfig()             # PPO's default configuration. See the next code cell.
    config["log_level"] = "WARN"                    # Suppress too many messages, but try "INFO" to see what can be printed.

    # Other settings we might adjust:
    config["num_rollout_workers"] = 1                       # Use > 1 for using more CPU cores, including over a cluster
    config["num_sgd_iter"] = 10                     # Number of SGD (stochastic gradient descent) iterations per training minibatch.
                                                    # I.e., for each minibatch of data, do this many passes over it to train. 
    config["sgd_minibatch_size"] = 64              # The amount of data records per minibatch
    config["model"]["fcnet_hiddens"] = [100, 50]    #
    config["num_cpus_per_worker"] = 0  # This avoids running out of resources in the notebook environment when this cell is re-executed
    config["vf_clip_param"] = 100
    config.multi_agent(policies={
            "agent_0" : PolicySpec(
                action_space=ray_ma_env.action_space["agent_0"]
            ),
            "agent_1" : PolicySpec(
                action_space=ray_ma_env.action_space["agent_1"]
            )
            }, 
                       policy_mapping_fn = policy_mapping_fn, 
                       policies_to_train= ["agent_0", "agent_1"])

    #Trainer
    # agent = ppo.PPOTrainer(config, env=SELECT_ENV)
    agent = PPO(**{"config":config,
                   "env":SELECT_ENV(env_for_cls)})

    results = []
    episode_data = []
    episode_json = []

    for n in range(N_ITER):
        result = agent.train()
        results.append(result)

        episode = {'n': n, 
                   'episode_reward_min': result['episode_reward_min'], 
                   'episode_reward_mean': result['episode_reward_mean'], 
                   'episode_reward_max': result['episode_reward_max'],  
                   'episode_len_mean': result['episode_len_mean']
                  }

        episode_data.append(episode)
        episode_json.append(json.dumps(episode))
        file_name = agent.save(checkpoint_root)

        print(f'{n:3d}: Min/Mean/Max reward: {result["episode_reward_min"]:8.4f}/{result["episode_reward_mean"]:8.4f}/{result["episode_reward_max"]:8.4f}. Checkpoint saved to {file_name}')

        with open(f'{ray_results}/rewards.json', 'w') as outfile:
            json.dump(episode_json, outfile)

Additionally, I had to change gym to gymnasium in the following files to make it work:

gymenv
gym_obs_space
gym_space_converter
gym_act_space
scalar_attr_converter
multi_to_tuple_converter
continuous_to_discrete
box_gym_obsspace
box_gym_actspace
base_gym_attr_converter
utils
multidiscrete_gym_actspace
discrete_gym_actspace

Any help in resolving this would be appreciated. I would also like to be able to pass an env_config separately to MAEnvWrapper, containing backend_class, reward_class, obs_attr_to_keep and act_attr_to_keep.

AvisP commented 11 months ago

@BDonnot Is it possible to have a look at making it work? I need to test out the multi-agent setting and would really appreciate it if you could spend some time on it. Thanks!

BDonnot commented 10 months ago

I modified the example script; this works with the latest ray to date on my machine (grid2op installed from source).

These are all questions totally unrelated to grid2op. I had to spend some time looking at ray (as I told you, I'm not a regular user of it) to make it work.

As for the customization, this is a general Python question. I suggest you look at Python courses to learn how to create a class that can be customized through the arguments passed when it is created.

Just put whatever you want in the "env_config" and then use whatever you put there inside the MAEnvWrapper definition however you want. I made an example with the backend and the action_space, I think.
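
Something along these lines, as a rough sketch of the idea only (the exact keys are up to you):

class MAEnvWrapper(MAEnv):
    def __init__(self, env_config=None):
        super().__init__()
        env_config = env_config or {}
        # read whatever was put in the config, fall back to a default otherwise
        backend_cls = env_config.get("backend_cls", LightSimBackend)
        env_name = env_config.get("env_name", ENV_NAME)
        env = grid2op.make(env_name,
                           action_class=PlayableAction,
                           backend=backend_cls())
        # ... then build the action / observation spaces as before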

But honestly, if this is blocking you, I really suggest you take some time with some general Python courses (plenty are available online). There is no difficulty involving ray or grid2op here, just the definition of a class.

AvisP commented 9 months ago

Hi Benjamin, thanks a lot for looking into it and fixing the issue. I understand this is not your top priority, but you still took the time to study the ray documentation and fix it. I verified that the scripts run properly.

I did try to follow the backend class example you provided and, with my understanding of Python classes, came up with the following implementation. The code is almost the same as yours; I moved the class into a separate file so that it can take an env_config, similar to what you did for the backend class.

The issue happens on the line agent = PPO(config=config, env=MAEnvWrapper), as I am not able to pass the env_config. I tried creating a class variable env_config_class inside MAEnvWrapper to store the config when ray_ma_env = MAEnvWrapper(env_config) is called, but that didn't work. I am a bit lost here as to what to do.
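
What I was hoping to do is something like the following (a sketch only, I have not managed to verify that this is the right way to pass env_config to the algorithm):

from ray.rllib.algorithms.ppo import PPO, PPOConfig
from multiagentgrid2op import MAEnvWrapper

env_config = {"env_name": ENV_NAME,
              "action_domains": ACTION_DOMAINS}

# pass the wrapper class plus its config; each rollout worker would then
# build its own MAEnvWrapper(env_config) instead of reusing a single instance
config = (PPOConfig()
          .environment(env=MAEnvWrapper, env_config=env_config)
          .rollouts(num_rollout_workers=1))
agent = PPO(config=config)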

One other aspect of this implementation is that it executes the environment creation command env = grid2op.make(ENV_NAME, action_class=PlayableAction, backend=backend) twice. Although this is fine for a smaller environment, for bigger ones it takes up a lot of memory, and the second copy is only needed to get the observation_space and action_space.

Python class file, named multiagentgrid2op.py:

# wrapper for gym env
from grid2op.Action import PlayableAction
import grid2op
from grid2op.gym_compat import GymEnv, BoxGymObsSpace, DiscreteActSpace
from ray.rllib.env.multi_agent_env import MultiAgentEnv as MAEnv
from grid2op.multi_agent.multiAgentEnv import MultiAgentEnv
from lightsim2grid import LightSimBackend
from gymnasium.spaces import Discrete, Box
import copy

from ray.rllib.policy.policy import PolicySpec, Policy

class MAEnvWrapper(MAEnv):
    env_config_class = {}

    def __init__(self, env_config=None):
        super().__init__()
        if env_config is None:
            env_config = {}

        # you can customize stuff by using the "env config" if you want
        backend = LightSimBackend()
        if not MAEnvWrapper.env_config_class and "backend_cls" in env_config:
            backend = env_config["backend_cls"]
        # you can do the same for other attribute to the environment

        if not MAEnvWrapper.env_config_class and "env_name" in env_config:
            ENV_NAME = env_config["env_name"]
        elif "env_name" in MAEnvWrapper.env_config_class:
            ENV_NAME = MAEnvWrapper.env_config_class["env_name"]
        if not MAEnvWrapper.env_config_class and "action_domains" in env_config:
            ACTION_DOMAINS = env_config["action_domains"]
        elif "action_domains" in MAEnvWrapper.env_config_class:
            ACTION_DOMAINS = MAEnvWrapper.env_config_class["action_domains"]

        backend = LightSimBackend()

        env = grid2op.make(ENV_NAME,
                           action_class=PlayableAction,
                           backend=backend)  

        self.ma_env = MultiAgentEnv(env, ACTION_DOMAINS)
        self._agent_ids = set(self.ma_env.agents)
        self.ma_env.seed(0)
        self._agent_ids = self.ma_env.agents

        # see the grid2op doc on how to customize the observation space
        # with the grid2op / gym interface.
        self._gym_env = GymEnv(env)
        self._gym_env.observation_space.close()

        obs_attr_to_keep = ["gen_p", "rho"]
        if "obs_attr_to_keep" in env_config:
            obs_attr_to_keep = copy.deepcopy(env_config["obs_attr_to_keep"])
        self._gym_env.observation_space = BoxGymObsSpace(env.observation_space,
                                                         attr_to_keep=obs_attr_to_keep,
                                                         replace_nan_by_0=True  # replace Nan by 0.
                                                         )

        # we did not experiment yet with the "partially observable" setting
        # so for now we suppose all agents see the same observation
        # which is the full grid                                
        self._aux_observation_space = {
            agent_id : BoxGymObsSpace(self.ma_env.observation_spaces[agent_id],
                                      attr_to_keep=obs_attr_to_keep,
                                      replace_nan_by_0=True  # replace Nan by 0.
                                      )
            for agent_id in self.ma_env.agents
        }
        # to avoid "weird" pickle issues
        self.observation_space = {
            agent_id : Box(low=self._aux_observation_space[agent_id].low,
                           high=self._aux_observation_space[agent_id].high,
                           dtype=self._aux_observation_space[agent_id].dtype)
            for agent_id in self.ma_env.agents
        }

        # we represent the action as discrete action for now. 
        # It should work to encode then differently using the 
        # gym_compat module for example
        act_type = "discrete"
        if "act_type" in env_config:
            act_type = env_config["act_type"]

        # for discrete actions
        if act_type == "discrete":
            self._conv_action_space = {
                agent_id : DiscreteActSpace(self.ma_env.action_spaces[agent_id])
                for agent_id in self.ma_env.agents
            }

            # to avoid "weird" pickle issues
            self.action_space = {
                agent_id : Discrete(n=self.ma_env.action_spaces[agent_id].n)
                for agent_id in self.ma_env.agents
            }
        else:
            raise NotImplementedError("Make the implementation in this case")

    def reset(self, *, seed=None, options=None):
        if seed is not None:
            self.seed(seed)

        # reset the underlying multi agent environment
        obs = self.ma_env.reset()

        return self._format_obs(obs), {}

    def seed(self, seed):
        return self.ma_env.seed(seed)

    def _format_obs(self, grid2op_obs):
        # NB: here we rely heavily on the fact that all agents see the same things
        # grid2op_obs is a dictionary representing a "multi agent grid2op observation"

        # convert the observation to a gym one (remember we suppose all agents see
        # all the grid)
        gym_obs = self._gym_env.observation_space.to_gym(grid2op_obs[next(iter(self.ma_env.agents))])

        # return the proper dictionary
        return {
            agent_id : gym_obs.copy()
            for agent_id in self.ma_env.agents
        }

    def step(self, actions):       
        # convert the action to grid2op
        if actions:
            grid2op_act = {
                agent_id : self._conv_action_space[agent_id].from_gym(actions[agent_id])
                for agent_id in self.ma_env.agents
            }
        else:
            grid2op_act = {
                agent_id : self._conv_action_space[agent_id].from_gym(0)
                for agent_id in self.ma_env.agents
            }

        # just to retrieve the first agent id...
        first_agent_id = next(iter(self.ma_env.agents))

        # do a step in the underlying multi agent environment
        obs, r, done, info = self.ma_env.step(grid2op_act)

        # all agents have the same flag "done"
        done['__all__'] = done[first_agent_id]

        # now retrieve the observation in the proper form
        gym_obs =  self._format_obs(obs)

        # ignored for now
        info = {}
        truncateds = {k: False for k in self.ma_env.agents}
        truncateds['__all__'] = truncateds[first_agent_id]
        return gym_obs, r, done, truncateds, info

Training script

import warnings
import numpy as np
import copy
import grid2op
from multiagentgrid2op import MAEnvWrapper
from ray.rllib.policy.policy import PolicySpec, Policy

ENV_NAME = "l2rpn_case14_sandbox"
DO_NOTHING_EPISODES = -1  # 200

ACTION_DOMAINS = {
        'agent_0' : [0, 1, 2, 3, 4],
        'agent_1' : [5, 6, 7, 8, 9, 10, 11, 12, 13]
    }

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    return agent_id

if __name__ == "__main__":
    import ray
    # from ray.rllib.agents.ppo import ppo
    from ray.rllib.algorithms.ppo import PPO, PPOConfig
    import json
    import os
    import shutil

    env_config = {"env_name":ENV_NAME,
                  "action_domains":ACTION_DOMAINS}
    ray_ma_env = MAEnvWrapper(env_config)
    # ray_ma_env.create_env(env_config)

    checkpoint_root = "./ma_ppo_test"

    # Where checkpoints are written:
    shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None)

    # Where some data will be written and used by Tensorboard below:
    ray_results = f'{os.getenv("HOME")}/ray_results/'
    shutil.rmtree(ray_results, ignore_errors=True, onerror=None)

    info = ray.init(ignore_reinit_error=True)
    print("Dashboard URL: http://{}".format(info.address_info["webui_url"]))

    #Configs (see ray's doc for more information)
    SELECT_ENV = MAEnvWrapper                       # Environment (wrapper) class used for training
    N_ITER = 5#1000                                     # Number of training runs.
    MAEnvWrapper.env_config_class = env_config
    # see ray doc for this...
    # syntax changes every ray major version apparently...
    config = PPOConfig()
    config = config.training(gamma=0.9, lr=0.01, kl_coeff=0.3, train_batch_size=128)
    config = config.resources(num_gpus=0)
    config = config.rollouts(num_rollout_workers=1)

    # multi agent parts
    config.multi_agent(policies={
        "agent_0" : PolicySpec(
            action_space=ray_ma_env.action_space["agent_0"],
            observation_space=ray_ma_env.observation_space["agent_0"]
        ),
        "agent_1" : PolicySpec(
            action_space=ray_ma_env.action_space["agent_1"],
            observation_space=ray_ma_env.observation_space["agent_1"],
        )
        }, 
                    policy_mapping_fn = policy_mapping_fn, 
                    policies_to_train= ["agent_0", "agent_1"])

    #Trainer
    agent = PPO(config=config, env=MAEnvWrapper(env_config))

    results = []
    episode_data = []
    episode_json = []

    for n in range(N_ITER):
        result = agent.train()
        results.append(result)

        episode = {'n': n, 
                   'episode_reward_min': result['episode_reward_min'], 
                   'episode_reward_mean': result['episode_reward_mean'], 
                   'episode_reward_max': result['episode_reward_max'],  
                   'episode_len_mean': result['episode_len_mean']
                  }

        episode_data.append(episode)
        episode_json.append(json.dumps(episode))
        file_name = agent.save(checkpoint_root)

        print(f'{n:3d}: Min/Mean/Max reward: {result["episode_reward_min"]:8.4f}/{result["episode_reward_mean"]:8.4f}/{result["episode_reward_max"]:8.4f}. Checkpoint saved to {file_name}')

        with open(f'{ray_results}/rewards.json', 'w') as outfile:
            json.dump(episode_json, outfile)

As these Python issues are not related to grid2op in any way, it's fine if you can't respond, and I will continue working with the script.

BDonnot commented 9 months ago

Hello,

Just so that you can start to look for the answer yourself (or ask anyone with some knowledge of Python, or find the answer in almost any Python course dealing with "OOP" or classes): when you write this:

class MAEnvWrapper(MAEnv):
    env_config_class = {}

    def __init__(self, env_config=None):
       ...

you tell Python that there is a class called MAEnvWrapper. An object of this class should be created like this:

   maenv = MAEnvWrapper()  # in this case `env_config` is None, that's the default behaviour
OR
   whatever_its_name = {...}  # a dictionary representing the config of your MAEnvWrapper
   maenv = MAEnvWrapper(whatever_its_name)
OR
   maenv = MAEnvWrapper(env_config=whatever_its_name)

Any other way to create a "MAEnvWrapper" will most likely fail with a "TypeError".

In your second script, you create a "MAEnvWrapper" object and pass it as the env argument when you do this:

 agent = PPO(config=config, env=MAEnvWrapper(env_config))

So you are using the second way above with


whatever_its_name = {"env_name":ENV_NAME,
                                        "action_domains":ACTION_DOMAINS}
# NB in your code you called this configuration dictionary `env_config`
... # bunch of unrelated code
 agent = PPO(config=config, env=MAEnvWrapper(env_config=whatever_its_name))
... # another bunch of unrelated code

So if you want to customize your environment, you should modify the variable that you called env_config (I called it whatever_its_name) and only this variable.

I really suggest you take some Python courses. This is very common Python syntax. If you don't know what any of this does, or why it works the way it does, it's unlikely that you'll be able to train a PPO on a multi-agent environment.

For example, why do you use an "env_config_class"? This makes no sense here. Just pass the information that you want to give to your agent in the "env_config" dictionary.

For example (leaving out all the unrelated code):

class MAEnvWrapper(MAEnv):
    def __init__(self, env_config=None):
        # say you want to configure: type of backend, name of the environment,
        # so you can have:
        # whatever_its_name = {"backend_cls" : ... , # type of backend
        #                      "env_name" : ... , # name of the environment
        #                      "act_type": ..., # action type (discrete, box etc.)
        #                      "act_attr_to_keep": ...,
        #                      "obs_type": ...,
        #                      "obs_attr_to_keep": ...,
        #                      "act_domain": ...,
        #                      "obs_domain": ...
        #                       }
        super().__init__()
        if env_config is None:
            env_config = {}

        # handle the backend
        if "backend_cls" in env_config:
            # user specified a type of backend to use
            backend = env_config["backend_cls"]()
        else:
            backend = LightSimBackend()

        # handle env_name
        if "env_name" in env_config:
            # user specified the name of the environment
            env_name = str(env_config["env_name"])
        else:
            # user did not specify anything, use the default name stored in the variable ENV_NAME
            env_name = ENV_NAME

        # create the grid2op environment
        env = grid2op.make(env_name,  # use the `env_name` variable created above
                           action_class=PlayableAction,  # use this otherwise it will not work
                           backend=backend  # use the `backend` variable created above
                           )    

        # now let's deal with the multi agent grid2op environment   
        if "act_domain" in env_config:
            # user wanted to modify the action domain
            act_domain = copy.deepcopy(env_config["act_domain"])
        else:
            # user did not specify anything in this case => use the default
            # in this script the default value is stored in the ACTION_DOMAINS variable
            act_domain = copy.deepcopy(ACTION_DOMAINS)

        if "obs_domain" in env_config:
            # user wanted to modify the action domain
            obs_domain = copy.deepcopy(env_config["obs_domain"])
        else:
            # user did not specify anything in this case => use the default
            # in this script the default value is stored in the OBSERVATION_DOMAINS variable
            obs_domain = copy.deepcopy(OBSERVATION_DOMAINS)

        ## now create the multi agent grid2op environment
        self.ma_env = MultiAgentEnv(env,
                                    action_domains=act_domain,
                                    observation_domains=obs_domain)

        ... # lots of unrelated code

        # create the observation space
        ## first let's read the configuration
        if "obs_attr_to_keep" in env_config:
            # user passed a custom configuration
            obs_attr_to_keep = copy.deepcopy(env_config["obs_attr_to_keep"])
        else:
            # user did not want to modify the "default" value
            obs_attr_to_keep = ["gen_p", "rho"]

        if "obs_type" in env_config:
            # user wanted a specific type of observation space (eg. Box or Dict or whatever)
            obs_type = env_config["obs_type"]
        else:
            # use the default value as the user did not specify it in its config
            obs_type = "box" 

        ## now let's create the observation space
        if obs_type == "box":
            self._aux_observation_space = {
                agent_id : BoxGymObsSpace(self.ma_env.observation_spaces[agent_id],
                                          attr_to_keep=obs_attr_to_keep,
                                          replace_nan_by_0=True  # replace NaN by 0.
                                          )
                for agent_id in self.ma_env.agents
            }

            # to avoid "weird" pickle issues
            self.observation_space = {
                agent_id : Box(low=self._aux_observation_space[agent_id].low,
                            high=self._aux_observation_space[agent_id].high,
                            dtype=self._aux_observation_space[agent_id].dtype)
                for agent_id in self.ma_env.agents
            }
        else:
            raise NotImplementedError(f"Observation space of type {obs_type} is not implemented yet")
        # when the code reaches here, you should have defined self._aux_observation_space and self.observation_space

        ## now let's create the action space in the same fashion
        if "act_attr_to_keep" in env_config:
            # user passed a custom configuration
            act_attr_to_keep = copy.deepcopy(env_config["act_attr_to_keep"])
        else:
            # user did not want to modify the "default" value
            act_attr_to_keep = ["set_bus"]

        if "act_type" in env_config:
            # user wanted a specific type of observation space (eg. Box or Dict or whatever)
            act_type = env_config["act_type"]
        else:
            # use the default value as the user did not specify it in its config
            act_type = "discrete" 

        # for discrete actions
        if act_type == "discrete":
            self._aux_action_space = {
                agent_id : DiscreteActSpace(self.ma_env.action_spaces[agent_id], attr_to_keep=act_attr_to_keep)
                for agent_id in self.ma_env.agents
            }

            # to avoid "weird" pickle issues
            self.action_space = {
                agent_id : Discrete(n=self._aux_action_space[agent_id].n)
                for agent_id in self.ma_env.agents
            }
        else:
            raise NotImplementedError(f"Action space of type {act_type} is not implemented yet")     
        # when the code reaches here, you should have defined self._aux_action_space and self.action_space

    def _format_obs(self, grid2op_obs):
        # this function converts a multi-agent grid2op observation into the
        # gym observation for each agent
        return {
            agent_id : self._aux_observation_space[agent_id].to_gym(grid2op_obs[agent_id])
            for agent_id in self.ma_env.agents
        }

    def step(self, actions):       
        # convert the actions: one action for each agent, each action
        # being represented as a gym structure
        # to valid grid2op action
        # then performs a step in the multi-agent grid2op environment
        # and converts back the observation (represented first as grid2op observation)
        # to gym observation

        # 1. converts the actions
        if actions:
            grid2op_act = {
                agent_id : self._aux_action_space[agent_id].from_gym(actions[agent_id])
                for agent_id in self.ma_env.agents
            }
        else:
            grid2op_act = {
                agent_id : self._aux_action_space[agent_id].from_gym(0)
                for agent_id in self.ma_env.agents
            }

        # just to retrieve the first agent id...
        first_agent_id = next(iter(self.ma_env.agents))

        # do a step in the underlying multi agent environment
        obs, r, done, info = self.ma_env.step(grid2op_act)

        # all agents have the same flag "done"
        done['__all__'] = done[first_agent_id]

        # now retrieve the observation in the proper form (shaped like a multi agent gym)
        gym_obs =  self._format_obs(obs)

        # ignored for now (unrelated code)
        info = {}
        truncateds = {k: False for k in self.ma_env.agents}
        truncateds['__all__'] = truncateds[first_agent_id]
        return gym_obs, r, done, truncateds, info

    # of course you need to implement the other functions

And now once you have that, you can customize your "MultiAgentEnv" (for ray) with:

whatever_its_name = {"backend_cls" : ... , # type of backend
                     "env_name" : ... , # name of the environment
                     "act_type": ..., # action type (discrete, box etc.)
                     "act_attr_to_keep": ...,
                     "obs_type": ...,
                     "obs_attr_to_keep": ...,
                     "act_domain": ...,
                     "obs_domain": ...
                      }

All the keys above are optional, so this works too:

# keep everything as default
whatever_its_name = {}  
OR if you just want to customize the class of backend and use PandaPowerBackend (don't do this !)
whatever_its_name = {"backend_cls" : PandaPowerBackend}
OR if you want to change the environment name and what is observed by the agent
whatever_its_name = {"env_name" : "l2rpn_case14_sandbox",
                     "obs_attr_to_keep": ["rho"]}
OR if you want to change env name and the action / observation domains
whatever_its_name = {"env_name" : "l2rpn_case14_sandbox",
                     "act_domain": {
                         'agent_0' : [0, 2, 4, 6, 8],
                         'agent_1' : [1, 3, 5, 7, 9],
                         'agent_2': [10, 11, 12, 13]
                         },
                     "obs_domain": {
                         'agent_0' : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'agent_1' : [2, 3, 8, 9, 10, 11, 12, 13],
                         'agent_2': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
                         }
                     }

I will not write out the billions (literally) of possible configurations here, but I hope you get the idea.

And then later in the code, you create the environment like this:

env_for_ppo_rllib = MAEnvWrapper(env_config=whatever_its_name)
agent = PPO(config=config, env=env_for_ppo_rllib)

If at this stage you can't make it work, I more than strongly suggest you take a few weeks to learn object-oriented programming in Python, or ask someone with more knowledge of this language than you how to do it.

I can't do more than this very detailed "sketch of implementation" unfortunately.