danijar / dreamerv3

Mastering Diverse Domains through World Models
https://danijar.com/dreamerv3
MIT License

Issue with Reward Function in CartPole-v1 Environment #145

Closed toshima1051 closed 1 month ago

toshima1051 commented 1 month ago

Hi DreamerV3 team,

I'm currently using DreamerV3 to train an agent on the CartPole-v1 environment using the Gymnasium interface. However, I've encountered an issue where the rewards do not seem to be correctly processed, resulting in poor training performance. Here are the details:

Environment:

Code:

train_dreamerv3.py:

import warnings
from functools import partial as bind
import dreamerv3
import embodied
import gymnasium as gym

warnings.filterwarnings('ignore', '.*truncated to dtype int32.*')

def main():
    config = embodied.Config(dreamerv3.Agent.configs['defaults'])
    config = config.update({
        **dreamerv3.Agent.configs['size100m'],
        'logdir': f'~/logdir/{embodied.timestamp()}-cartpole',
        'run.train_ratio': 32,
    })
    config = embodied.Flags(config).parse()

    print('Logdir:', config.logdir)
    logdir = embodied.Path(config.logdir)
    logdir.mkdir()
    config.save(logdir / 'config.yaml')

    def make_agent(config):
        env = make_env(config)
        agent = dreamerv3.Agent(env.obs_space, env.act_space, config)
        env.close()
        return agent

    def make_logger(config):
        logdir = embodied.Path(config.logdir)
        return embodied.Logger(embodied.Counter(), [
            embodied.logger.TerminalOutput(config.filter),
            embodied.logger.JSONLOutput(logdir, 'metrics.jsonl'),
            embodied.logger.TensorBoardOutput(logdir),
        ])

    def make_replay(config):
        return embodied.replay.Replay(
            length=config.batch_length,
            capacity=config.replay.size,
            directory=embodied.Path(config.logdir) / 'replay',
            online=config.replay.online)

    def make_env(config, env_id=0):
        from embodied.envs import from_gym
        env = gym.make('CartPole-v1')
        env = from_gym.FromGym(env)
        env = dreamerv3.wrap_env(env, config)
        return env

    args = embodied.Config(
        **config.run,
        logdir=config.logdir,
        batch_size=config.batch_size,
        batch_length=config.batch_length,
        batch_length_eval=config.batch_length_eval,
        replay_context=config.replay_context,
    )

    embodied.run.train(
        bind(make_agent, config),
        bind(make_replay, config),
        bind(make_env, config),
        bind(make_logger, config), args)

if __name__ == '__main__':
    main()

from_gym.py:

import functools
import gymnasium as gym
import numpy as np
import embodied

class FromGym(embodied.Env):

    def __init__(self, env, obs_key='image', act_key='action', **kwargs):
        if isinstance(env, str):
            self._env = gym.make(env, **kwargs)
        else:
            assert not kwargs, kwargs
            self._env = env
        self._obs_dict = hasattr(self._env.observation_space, 'spaces')
        self._act_dict = hasattr(self._env.action_space, 'spaces')
        self._obs_key = obs_key
        self._act_key = act_key
        self._done = True
        self._info = None

    @property
    def env(self):
        return self._env

    @property
    def info(self):
        return self._info

    @functools.cached_property
    def obs_space(self):
        if self._obs_dict:
            spaces = self._flatten(self._env.observation_space.spaces)
        else:
            spaces = {self._obs_key: self._env.observation_space}
        spaces = {k: self._convert(v) for k, v in spaces.items()}
        return {
            **spaces,
            'reward': embodied.Space(np.float32),
            'is_first': embodied.Space(bool),
            'is_last': embodied.Space(bool),
            'is_terminal': embodied.Space(bool),
        }

    @functools.cached_property
    def act_space(self):
        if self._act_dict:
            spaces = self._flatten(self._env.action_space.spaces)
        else:
            spaces = {self._act_key: self._env.action_space}
        spaces = {k: self._convert(v) for k, v in spaces.items()}
        spaces['reset'] = embodied.Space(bool)
        return spaces

    def step(self, action):
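        # A new episode starts when an explicit reset is requested or the
        # previous Gymnasium episode ended on the last step.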
        if action['reset'] or self._done:
            self._done = False
            obs, info = self._env.reset()
            return self._obs(obs, 0.0, is_first=True)
        if self._act_dict:
            action = self._unflatten(action)
        else:
            action = action[self._act_key]
        obs, reward, terminated, truncated, info = self._env.step(action)
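        # Gymnasium splits episode end into terminated (reached a terminal
        # state) and truncated (hit the time limit); either one ends the episode.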
        self._done = terminated or truncated
        self._info = info
        return self._obs(
            obs, reward,
            is_last=bool(self._done),
            is_terminal=bool(info.get('is_terminal', terminated)))

    def _obs(self, obs, reward, is_first=False, is_last=False, is_terminal=False):
        if not self._obs_dict:
            obs = {self._obs_key: obs}
        obs = self._flatten(obs)
        obs = {k: np.asarray(v) for k, v in obs.items()}
        obs.update(
            reward=np.float32(reward),
            is_first=is_first,
            is_last=is_last,
            is_terminal=is_terminal)
        return obs

    def render(self):
        image = self._env.render()
        assert image is not None
        return image

    def close(self):
        try:
            self._env.close()
        except Exception:
            pass

    def _flatten(self, nest, prefix=None):
        result = {}
        for key, value in nest.items():
            key = prefix + '/' + key if prefix else key
            if isinstance(value, gym.spaces.Dict):
                value = value.spaces
            if isinstance(value, dict):
                result.update(self._flatten(value, key))
            else:
                result[key] = value
        return result

    def _unflatten(self, flat):
        result = {}
        for key, value in flat.items():
            parts = key.split('/')
            node = result
            for part in parts[:-1]:
                if part not in node:
                    node[part] = {}
                node = node[part]
            node[parts[-1]] = value
        return result

    def _convert(self, space):
        if hasattr(space, 'n'):
            return embodied.Space(np.int32, (), 0, space.n)
        return embodied.Space(space.dtype, space.shape, space.low, space.high)

Issue: Despite following the standard implementation, the reward seems to be consistently zero, leading to poor learning performance. Below are the TensorBoard logs showing the metrics during training:

[Screenshot: TensorBoard training metrics, 2024-07-22]

Steps to Reproduce:

1. Execute train_dreamerv3.py with the provided code.
2. Observe the reward metrics in TensorBoard.

Expected Behavior: The agent should receive and optimize based on the reward values provided by the environment, resulting in improved episode lengths and scores over time.

Actual Behavior: The rewards remain zero, and the agent does not seem to learn effectively.
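A minimal way to check whether the wrapper itself passes rewards through (a sketch based only on the from_gym.py above; the 'action' and 'reset' keys follow that code) is to step the wrapped environment directly and print the reward field:

import gymnasium as gym
import numpy as np
from embodied.envs import from_gym

# Step the wrapped CartPole-v1 environment directly and inspect the rewards.
env = from_gym.FromGym(gym.make('CartPole-v1'))
obs = env.step({'action': np.int32(0), 'reset': True})   # reset starts a new episode
for _ in range(5):
    obs = env.step({'action': np.int32(1), 'reset': False})
    print('reward:', obs['reward'], 'is_last:', obs['is_last'])
env.close()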

Any guidance or suggestions on resolving this issue would be greatly appreciated.

Thank you!

danijar commented 1 month ago

The episode/score summary shows the return and it seems to be learning well!

The epstats/reward_rate summary measures how dense the reward is and is unrelated to the return.
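To illustrate the distinction (a rough sketch; the exact definitions in the codebase may differ): episode/score is the sum of an episode's rewards, while a reward rate measures what fraction of steps carried a nonzero reward.

import numpy as np

# Illustrative only: per-step rewards of one CartPole-style episode.
rewards = np.ones(200, dtype=np.float32)       # +1 reward on every step until the pole falls

score = float(rewards.sum())                   # what episode/score reports: the return
reward_rate = float(np.mean(rewards != 0.0))   # reward density, akin to epstats/reward_rate
print(score, reward_rate)                      # 200.0 1.0 -> high return and dense reward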

toshima1051 commented 1 month ago

Thank you very much! I'm glad the agent is learning successfully; I had no idea that epstats/reward_rate was unrelated to the return. I will use this success as a springboard for further machine learning with DreamerV3. I'm still a beginner and may have more questions, and I would appreciate your help when that happens. Thank you very much!