hill-a / stable-baselines

A fork of OpenAI Baselines, implementations of reinforcement learning algorithms
http://stable-baselines.readthedocs.io/
MIT License

#question _on_step method in custom callback #1179

Open vrige opened 1 year ago

vrige commented 1 year ago

Question about the _on_step method in a custom callback, used together with a custom wrapper

I am new to stable_baselines and I was wondering whether it is normal that the _on_step method of a callback runs after the env has been reset. Isn't it supposed to run right after the step method of the env (or of the wrapper, in my case)? I know how I could work around the issue, but I was wondering whether there is a way to have the callback called exactly after the env's step method. The env is the custom snake env from the tutorial. The code further below should show that the callback always prints episode_length and episode_return equal to zero, because the env has already been reset.
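For reference, this is roughly the order of calls I would have expected inside the training loop. It is only a sketch of my expectation, not the actual stable-baselines code; env, model, callback and TIMESTEPS refer to the objects defined further below, and a single non-vectorized env is assumed:

# Sketch of the call order I expected (NOT the real implementation, just my mental model)
obs = env.reset()
for _ in range(TIMESTEPS):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)  # wrapper.step() updates episode_length / episode_return here
    callback.on_step()                          # so I expected the stats to still be non-zero at this point
    if done:
        obs = env.reset()                       # and the reset to happen only after the callback has run

My actual wrapper, callback and training code: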

import gym
import numpy as np
# Imports added for completeness; assuming stable-baselines3 here, since PPO (not PPO2) is used below.
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

# SnekEnv (the custom snake env from the tutorial) and logdir are assumed to be defined elsewhere.

class WrapperStatistics(gym.Wrapper):
    def __init__(self, env: gym.Env, size: int = 250, verbose: int = 0):
        super(WrapperStatistics, self).__init__(env)
        self.verbose = verbose
        self.episode_count = 0
        self.steps_count = 0
        self.episode_rewards = np.zeros(size, dtype=float)
        self.episodes_rewards = []
        self.episode_return = 0
        self.episodes_returns = np.empty(0, dtype=float)
        self.episode_length = 0
        self.episodes_lengths = np.empty(0, dtype=float)

    def reset(self, **kwargs):
        obs = super().reset(**kwargs)
        self.episode_rewards = np.zeros(self.episode_rewards.size, dtype=float)
        self.episode_length = 0
        self.episode_return = 0
        print("I am resetting")
        return obs

    def step(self, action):
        obs, reward, done, info = self.env.step(action)

        self.episode_length += 1
        self.steps_count += 1

        # grow the per-episode reward buffer if it is full
        if self.episode_length > self.episode_rewards.size:
            tmp = np.zeros(self.episode_rewards.size, dtype=float)
            self.episode_rewards = np.concatenate((self.episode_rewards, tmp), axis=None)

        self.episode_rewards[self.episode_length - 1] = reward
        self.episode_return += reward

        if done:
            if self.verbose != 0:
                print('Episode: {}, len episode: {}, return episode: {}'.format(self.episode_count, self.episode_length, self.episode_return))
            self.episode_count += 1
            self.episodes_rewards.append(self.episode_rewards[:self.episode_length].copy())  # keep only the filled part
            self.episodes_returns = np.concatenate((self.episodes_returns, [self.episode_return]), axis=None)
            self.episodes_lengths = np.concatenate((self.episodes_lengths, [self.episode_length]), axis=None)
            if self.verbose == 2:
                print("rewards: " + str(self.episodes_returns))
                print("lengths: " + str(self.episodes_lengths))
            print("REWARD: " + str(self.get_episode_length()) )

        return obs, reward, done, info

    def get_episode_lengths(self):
        return self.episodes_lengths

    def get_episode_length(self):
        return self.episode_length

    def get_episode_rewards(self):
        return self.episodes_returns

    def get_episode_return(self):
        return self.episode_return

    def get_total_steps(self):
        return self.steps_count

    def get_total_episodes(self):
        return self.episode_count

class example(BaseCallback):
    def __init__(self, model, verbose=0):
        super(example, self).__init__(verbose)
        self.model = model
        self.training_env = model.get_env()

    def _on_training_start(self) -> None:
        pass

    def _on_step(self) -> bool:
        if self.locals["dones"]:
            print('Episode: {}, len episode: {}, return episode: {}'.format(
                *self.training_env.env_method("get_total_episodes"),
                *self.training_env.env_method("get_episode_length"),
                *self.training_env.env_method("get_episode_return")))
            print( type(*self.training_env.env_method("get_episode_length") ))

        return True

env = SnekEnv()
env.reset()

wrapper = WrapperStatistics(env, 250, verbose=0)
wrapper.reset()

model = PPO('MlpPolicy', wrapper, verbose=1, tensorboard_log=logdir)

TIMESTEPS = 1000000

evalcallback = example(model)
callbacks=[evalcallback]

model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO", callback=callbacks)