Prediction returning the same action from different observations

Hi again,

Congrats by excellent work.

My model is improving.

I'm loading the checkpoint with success and trying to predict (calling the policy function) the get an action using this observation format:

{'image': array([[[161, 255,   0],
        [161, 255,   0],
        [161, 255,   0],
        ...,
        [155, 255,   0],
        [155, 255,   0],
        [155, 255,   0]],

       [[161, 255,   0],
        [161, 255,   0],
        [161, 255,   0],
        ...,
        [155, 255,   0],
        [155, 255,   0],
        [155, 255,   0]],

       [[161, 255,   0],
        [161, 255,   0],
        [161, 255,   0],
        ...,
        [155, 255,   0],
        [155, 255,   0],
        [155, 255,   0]],

       ...,

       [[182, 255,   0],
        [182, 255,   0],
        [182, 255,   0],
        ...,
        [183, 255,   0],
        [183, 255,   0],
        [183, 255,   0]],

       [[182, 255,   0],
        [182, 255,   0],
        [182, 255,   0],
        ...,
        [183, 255,   0],
        [183, 255,   0],
        [183, 255,   0]],

       [[182, 255,   0],
        [182, 255,   0],
        [182, 255,   0],
        ...,
        [183, 255,   0],
        [183, 255,   0],
        [183, 255,   0]]], dtype=uint8), 'reward': 0.0, 'is_first': True, 'is_last': False, 'is_terminal': False}

My code:

import re
import warnings
import gym
import logging
import random
from typing import Sequence
import numpy as np
import tensorflow as tf
import dreamerv2.api as dv2
from dreamerv2 import common
from dreamerv2.agent import Agent

from pathlib import Path
from agents import BaseAgent

logger = logging.getLogger('root')
warnings.filterwarnings('ignore', '.*box bound precision lowered.*')

class Dreamerv2Agent(BaseAgent):
    def __init__(self,
                 conf_file: Path,
                 env: str,
                 test_mode: bool,
                 prefix: str,
                 batch: int,
                 model_path: Path = Path("~/logdir/trader"),
                 seed: bool = False):
        super().__init__(env, test_mode, prefix, batch, model_path, seed)

        if self.seed:
            random.seed(0)
            np.random.seed(0)
            tf.random.set_seed(0)

        model_path = model_path.expanduser().absolute()
        print(f"Model Path: {model_path}")

        print("Loading config.")
        config_path = (model_path / 'config.yaml')
        config = common.Config.load(config_path)
        self.config = config

        print("Loading config. Done")

        env = gym.make(env)

        replay = common.Replay(
            model_path / 'train_episodes',
            **config.replay
        )
        step = common.Counter(replay.stats['total_steps'])
        env = self.wrapper(env)

        def per_episode(ep):
            length = len(ep['reward']) - 1
            score = float(ep['reward'].astype(np.float64).sum())
            print(f'Episode has {length} steps and return {score:.1f}.')
            logger.scalar('return', score)
            logger.scalar('length', length)
            for key, value in ep.items():
                if re.match(config.log_keys_sum, key):
                    logger.scalar(f'sum_{key}', ep[key].sum())
                if re.match(config.log_keys_mean, key):
                    logger.scalar(f'mean_{key}', ep[key].mean())
                if re.match(config.log_keys_max, key):
                    logger.scalar(f'max_{key}', ep[key].max(0).mean())
            logger.add(replay.stats)
            logger.write()

        driver = common.Driver([env])
        driver.on_episode(per_episode)
        driver.on_step(lambda tran, worker: step.increment())
        driver.on_step(replay.add_step)
        driver.on_reset(replay.add_step)

        prefill = max(0, config.prefill - replay.stats['total_steps'])
        if prefill:
            print(f'Prefill dataset ({prefill} steps).')
            random_agent = common.RandomAgent(env.act_space)
            driver(random_agent, steps=prefill, episodes=1)
            driver.reset()

        print(f'Create agent (step: {step.value}).')
        print(f"Action Space: {env.act_space}")
        print(f"Observation Space: {env.obs_space}")
        self.agent = Agent(config, env.obs_space, env.act_space, step)
        dataset = iter(replay.dataset(**config.dataset))
        train_agent = common.CarryOverState(self.agent.train)
        train_agent(next(dataset))
        print('Create agent. Done!')

        print('Loading checkpoint.')
        vars = (model_path / 'variables.pkl').absolute()
        if vars.exists():
            self.agent.load(vars)
        print('Loading checkpoint. Done!')

    def wrapper(self, env):
        env = common.GymWrapper(env)
        env = common.ResizeImage(env)
        if hasattr(env.act_space['action'], 'n'):
            env = common.OneHotAction(env)
        else:
            env = common.NormalizeAction(env)
        env = common.TimeLimit(env, self.config.time_limit)
        return env

    def get_action(self, observation: Sequence):
        obs = {k: np.expand_dims(v, 0) for k, v in observation.items()}
        output, _ = self.agent.policy(obs, mode='eval')
        output['action'] = tf.squeeze(output['action'])
        return output

After calling get_action and getting a the action to pass to step from my gym environment (wrapped by the dreamerv2) and this works inside the loop.

But I'm getting always the same action from different observations.

Is something missing from my evaluation method?

Thanks in advanced.

danijar / dreamerv2

Prediction returning the same action from different observations #48