danijar / dreamerv2

Mastering Atari with Discrete World Models
https://danijar.com/dreamerv2
MIT License

Reward different on evaluation #52

Closed ipsec closed 1 year ago

ipsec commented 1 year ago

Hi Danijar,

I'm training with dreamerv2 successfully, and I'm getting this result:

[5847513] return 6.12 / length 151 / total_steps 5.8e6 / total_episodes 3.9e4 / loaded_steps 1e5 / loaded_episodes 662
Save checkpoint with 85 tensors and 32333580 parameters.
[5847536] kl_loss 0.67 / image_loss 1.1e4 / reward_loss 0.92 / discount_loss 0.06 / model_kl 0.67 / prior_ent 1.92 / post_ent 1.14 / model_loss 1.1e4 / model_grad_norm 133.28 / actor_loss -1.5e-5 / actor_grad_norm 1.5e-3 / critic_loss 0.82 / critic_grad_norm 0.07 / reward_mean 0.04 / reward_std 0.03 / reward_normed_mean 0.04 / reward_normed_std 0.03 / critic_slow 1.37 / critic_target 1.35 / actor_ent 2e-3 / actor_ent_scale 2e-3 / critic 1.37 / fps 44.95
Episode has 151 steps and return 6.1.

Is the return of 6.1 the cumulative sum of rewards over the episode?

After training, I run the code below and receive a cumulative reward of 2.153186.


I'm using this code to evaluate.

import re
import warnings
import gym
import random
from absl import logging
from typing import Mapping
import numpy as np
import tensorflow as tf
import dreamerv2.api as dv2
from dreamerv2 import common
from dreamerv2.agent import Agent

from pathlib import Path
from agents import BaseAgent

# logger = logging.getLogger('root')
# warnings.filterwarnings('ignore', '.*box bound precision lowered.*')

class Dreamerv2Agent(BaseAgent):
    def __init__(self,
                 conf_file: Path,
                 env: str,
                 test_mode: bool,
                 prefix: str,
                 batch: int,
                 model_path: Path = Path("~/logdir/trader"),
                 seed: bool = False):
        super().__init__(env, test_mode, prefix, batch, model_path, seed)

        if self.seed:
            random.seed(0)
            np.random.seed(0)
            tf.random.set_seed(0)

        model_path = model_path.expanduser().absolute()
        logging.error(f"Model Path: {model_path}")

        logging.error("Loading config.")
        config_path = (model_path / 'config.yaml')
        config = common.Config.load(config_path)
        self.config = config

        logging.error("Loading config. Done")

        env = gym.make(env)

        replay = common.Replay(
            model_path / 'train_episodes',
            **config.replay
        )
        step = common.Counter(replay.stats['total_steps'])
        env = self.wrapper(env)

        def per_episode(ep):
            length = len(ep['reward']) - 1
            score = float(ep['reward'].astype(np.float64).sum())
            logging.error(f'Episode has {length} steps and return {score:.1f}.')
            # logger.scalar('return', score)
            # logger.scalar('length', length)
            for key, value in ep.items():
                if re.match(config.log_keys_sum, key):
                    logging.error(f'sum_{key}: {value.sum()}')
                if re.match(config.log_keys_mean, key):
                    logging.error(f'mean_{key}: {value.mean()}')
                if re.match(config.log_keys_max, key):
                    logging.error(f'max_{key}: {value.max(0).mean()}')
            # logger.add(replay.stats)
            # logger.write()

        driver = common.Driver([env])
        driver.on_episode(per_episode)
        driver.on_step(lambda tran, worker: step.increment())
        driver.on_step(replay.add_step)
        driver.on_reset(replay.add_step)

        prefill = max(0, config.prefill - replay.stats['total_steps'])
        if prefill:
            print(f'Prefill dataset ({prefill} steps).')
            random_agent = common.RandomAgent(env.act_space)
            driver(random_agent, steps=prefill, episodes=1)
            driver.reset()

        logging.error(f'Create agent (step: {step.value}).')
        logging.error(f"Action Space: {env.act_space}")
        logging.error(f"Observation Space: {env.obs_space}")
        self.agent = Agent(config, env.obs_space, env.act_space, step)
        dataset = iter(replay.dataset(**config.dataset))
        train_agent = common.CarryOverState(self.agent.train)
        train_agent(next(dataset))
        logging.error('Create agent. Done!')

        logging.error('Loading checkpoint.')
        checkpoint = (model_path / 'variables.pkl').absolute()
        if checkpoint.exists():
            self.agent.load(checkpoint)
        logging.error('Loading checkpoint. Done!')

    def wrapper(self, env):
        env = common.GymWrapper(env)
        env = common.ResizeImage(env)
        if hasattr(env.act_space['action'], 'n'):
            env = common.OneHotAction(env)
        else:
            env = common.NormalizeAction(env)
        env = common.TimeLimit(env, self.config.time_limit)
        return env

    def get_action(self, observation: Mapping):
        # Add a batch dimension to each observation entry and query the policy.
        obs = {k: np.expand_dims(v, 0) for k, v in observation.items()}
        output, _ = self.agent.policy(obs, mode='eval')
        output['action'] = tf.squeeze(output['action'])
        return output
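
Roughly, the evaluation is driven like this (a simplified sketch; the env id "Trader-v0", the config path, and the stepping loop are placeholders, not the actual script):

# Illustrative evaluation driver for Dreamerv2Agent (sketch only).
agent = Dreamerv2Agent(
    conf_file=Path("~/logdir/trader/config.yaml"),  # placeholder path
    env="Trader-v0",                                # placeholder env id
    test_mode=True,
    prefix="eval",
    batch=1,
)
env = agent.wrapper(gym.make("Trader-v0"))

obs = env.reset()  # dreamerv2's GymWrapper returns dict observations
total_reward = 0.0
while not obs['is_last']:
    out = agent.get_action(obs)
    obs = env.step({'action': out['action'].numpy()})
    total_reward += obs['reward']
print(f'Cumulative reward: {total_reward:.6f}')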

My config.

action_repeat: 1
actor: {act: elu, dist: auto, layers: 4, min_std: 0.1, norm: none, units: 400}
actor_ent: 0.002
actor_grad: auto
actor_grad_mix: 0.1
actor_opt: {clip: 100, eps: 1e-05, lr: 8e-05, opt: adam, wd: 1e-06}
atari_grayscale: false
clip_rewards: tanh
critic: {act: elu, dist: mse, layers: 4, norm: none, units: 400}
critic_opt: {clip: 100, eps: 1e-05, lr: 0.0002, opt: adam, wd: 1e-06}
dataset: {batch: 16, length: 50}
decoder:
  act: elu
  cnn_depth: 48
  cnn_kernels: [5, 5, 6, 6]
  cnn_keys: .*
  mlp_keys: .*
  mlp_layers: [400, 400, 400, 400]
  norm: none
disag_action_cond: true
disag_log: false
disag_models: 10
disag_offset: 1
disag_target: stoch
discount: 0.99
discount_head: {act: elu, dist: binary, layers: 4, norm: none, units: 400}
discount_lambda: 0.95
dmc_camera: -1
encoder:
  act: elu
  cnn_depth: 48
  cnn_kernels: [4, 4, 4, 4]
  cnn_keys: .*
  mlp_keys: .*
  mlp_layers: [400, 400, 400, 400]
  norm: none
envs: 1
envs_parallel: none
eval_eps: 1
eval_every: 1000.0
eval_noise: 0.0
eval_state_mean: false
expl_behavior: greedy
expl_extr_scale: 0.0
expl_head: {act: elu, dist: mse, layers: 4, norm: none, units: 400}
expl_intr_scale: 1.0
expl_model_loss: kl
expl_noise: 0.0
expl_opt: {clip: 100, eps: 1e-05, lr: 0.0003, opt: adam, wd: 1e-06}
expl_reward_norm: {eps: 1e-08, momentum: 1.0, scale: 1.0}
expl_until: 0
grad_heads: [decoder, reward, discount]
imag_horizon: 15
jit: true
kl: {balance: 0.8, forward: false, free: 0.0, free_avg: true}
log_every: 10000.0
log_keys_max: ^$
log_keys_mean: ^$
log_keys_sum: ^$
log_keys_video: [image]
logdir: ~/logdir/trader
loss_scales: {discount: 1.0, kl: 1.0, proprio: 1.0, reward: 1.0}
model_opt: {clip: 100, eps: 1e-05, lr: 0.0001, opt: adam, wd: 1e-06}
precision: 16
pred_discount: true
prefill: 10000
pretrain: 1
render_size: [64, 64]
replay: {capacity: 100000.0, maxlen: 50, minlen: 50, ongoing: false, prioritize_ends: true}
reward_head: {act: elu, dist: mse, layers: 4, norm: none, units: 400}
reward_norm: {eps: 1e-08, momentum: 1.0, scale: 1.0}
rssm: {act: elu, deter: 1024, discrete: 32, ensemble: 1, hidden: 1024, min_std: 0.1,
  norm: none, std_act: sigmoid2, stoch: 32}
seed: 0
slow_baseline: true
slow_target: true
slow_target_fraction: 1
slow_target_update: 100
steps: 100000000.0
task: dmc_walker_walk
time_limit: 0
train_every: 1000
train_steps: 1

Is there something I'm missing to get a better evaluation?

Best regards, Fernando Ribeiro

ipsec commented 1 year ago

I found the problem. The policy method needs the state parameter, carried over between steps, to predict properly.
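
For reference, a minimal sketch of the fix (assuming the policy(obs, state, mode) signature from dreamerv2/agent.py, which returns both the outputs and the new recurrent state):

class Dreamerv2Agent(BaseAgent):
    # ... __init__ as above, plus:
    #     self._state = None   # reset to None at the start of each episode

    def get_action(self, observation: Mapping):
        obs = {k: np.expand_dims(v, 0) for k, v in observation.items()}
        # Feed the previous recurrent state back in and keep the new one, so
        # the world model's latent state persists across steps instead of
        # being re-initialized on every call.
        output, self._state = self.agent.policy(obs, self._state, mode='eval')
        output['action'] = tf.squeeze(output['action'])
        return output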