rte-france / l2rpn-baselines

L2RPN Baselines is a repository to host baselines for L2RPN competitions.
https://l2rpn-baselines.readthedocs.io/en/stable/
Mozilla Public License 2.0

Issue when evaluating trained PPO_RLLIB agent #47

Open AvisP opened 1 year ago

AvisP commented 1 year ago

System information

Bug description

When I evaluate a trained PPO_RLLIB agent, the total score for each chronic is printed as 0. Even if the PPO_RLLIB agent was not trained properly, the total score should still be non-zero.

The output I am getting is:

Evaluation summary:
chronics at: 0000       total score: 0.000000   time steps: 1091/8064
chronics at: 0001       total score: 0.000000   time steps: 807/8064
chronics at: 0002       total score: 0.000000   time steps: 3001/8064
chronics at: 0003       total score: 0.000000   time steps: 3/8064
chronics at: 0004       total score: 0.000000   time steps: 804/8064
Evaluation summary for Do Nothing Agent:
chronics at: 0000       total score: 622.306925 time steps: 1091/8064
chronics at: 0001       total score: 464.387165 time steps: 807/8064
chronics at: 0002       total score: 1759.294096        time steps: 3001/8064
chronics at: 0003       total score: 1.020729   time steps: 3/8064
chronics at: 0004       total score: 479.332989 time steps: 804/8064

How to reproduce

The training script I used:

import grid2op
from grid2op.gym_compat import GymEnv, BoxGymObsSpace, BoxGymActSpace
from grid2op.Backend import PandaPowerBackend
from lightsim2grid import LightSimBackend
from l2rpn_baselines.PPO_RLLIB import PPO_RLLIB, train
from l2rpn_baselines.PPO_RLLIB.rllibagent import RLLIBAgent
from grid2op.Reward import LinesCapacityReward  # or any other rewards
from grid2op.Chronics import MultifolderWithCache  # highly recommended
import copy
import re
import ray

env_name = "l2rpn_case14_sandbox"  # or any other name
obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q",
                    "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line",
                    "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status",
                    "storage_power", "storage_charge"]
act_attr_to_keep = ["change_line_status", "change_bus", "redispatch"]

env = grid2op.make(env_name, backend=LightSimBackend())

ray.init()

train(env,
        iterations=100,  # any number of iterations you want
        learning_rate=1e-4, # set learning rate
        save_path="./saved_model/PPO_RLLIB3",  # where the NN weights will be saved
        # load_path="./saved_model/PPO_RLLIB/test", # resuming from previous saved training
        name="test",  # name of the baseline
        net_arch=[100, 100, 100],  # architecture of the NN
        save_every_xxx_steps=10,  # save the NN every 10 training steps
        env_kwargs={"reward_class": LinesCapacityReward,
                    "chronics_class": MultifolderWithCache,  # highly recommended
                    "data_feeding_kwargs": {
                        'filter_func': lambda x: re.match(".*00$", x) is not None  # use only one in every 100 chronics for training (for speed)
                        }
        },
        obs_attr_to_keep=copy.deepcopy(obs_attr_to_keep),
        act_attr_to_keep=copy.deepcopy(act_attr_to_keep),
        verbose=True)

env.close()
ray.shutdown()

The evaluation script I used:

import grid2op
from grid2op.Reward import LinesCapacityReward  # or any other rewards
from lightsim2grid import LightSimBackend  # highly recommended !
from l2rpn_baselines.PPO_RLLIB import evaluate
from grid2op.Runner import Runner

nb_episode = 5
nb_process = 1
verbose = True
env_name = "l2rpn_case14_sandbox"
env = grid2op.make(env_name,
                reward_class=LinesCapacityReward,
                backend=LightSimBackend()
                )
try:
    evaluate(env,
            nb_episode=nb_episode,
            load_path="./saved_model/PPO_RLLIB3",  # should be the same as what has been called in the train function !
            name="test",  # should be the same as what has been called in the train function !
            logs_path = "./logs/PPO_RLLIB3/",
            nb_process=1,
            verbose=verbose,
            )
    # you can also compare your agent with the do nothing agent relatively
    # easily
    runner_params = env.get_params_for_runner()
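    # no agent is passed to the Runner, so it defaults to the do-nothing agent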
    runner = Runner(**runner_params)
    res = runner.run(nb_episode=nb_episode,
                    nb_process=nb_process
                    )
    # Print summary
    if verbose:
        print("Evaluation summary for Do Nothing Agent:")
        for _, chron_name, cum_reward, nb_time_step, max_ts in res:
            msg_tmp = "chronics at: {}".format(chron_name)
            msg_tmp += "\ttotal score: {:.6f}".format(cum_reward)
            msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts)
            print(msg_tmp)
finally:
    env.close()
AvisP commented 1 year ago

I realized this issue was happening because I was providing discrete actions (change_line_status, change_bus) to the train function of PPO_RLLIB. Inside this function, the grid2op action space is converted to a gym space with BoxGymActSpace, which only handles continuous actions. For discrete actions it would need MultiDiscreteActSpace or DiscreteActSpace instead. However, I am not sure what the solution should be when there is a mixture of discrete and continuous actions.
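
For reference, here is a minimal sketch (outside the PPO_RLLIB train function, and not a tested fix) of how grid2op's gym_compat classes encode the two kinds of action attributes; the attribute lists are the ones from my scripts above:

import grid2op
from grid2op.gym_compat import GymEnv, BoxGymActSpace, MultiDiscreteActSpace
from lightsim2grid import LightSimBackend

env = grid2op.make("l2rpn_case14_sandbox", backend=LightSimBackend())
gym_env = GymEnv(env)

# continuous attributes only: this is the kind of space BoxGymActSpace can represent
gym_env.action_space = BoxGymActSpace(env.action_space,
                                      attr_to_keep=["redispatch"])

# discrete attributes would need MultiDiscreteActSpace (or DiscreteActSpace) instead
gym_env.action_space = MultiDiscreteActSpace(env.action_space,
                                             attr_to_keep=["change_line_status", "change_bus"])

If I understand the documentation correctly, MultiDiscreteActSpace can also bin continuous attributes such as redispatch (via its nb_bins argument), which might be one way to handle a mixed action space, but I have not verified this.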