Closed ipsec closed 1 year ago
Hi again,
Congrats by excellent work.
My model is improving.
I'm loading the checkpoint with success and trying to predict (calling the policy function) the get an action using this observation format:
{'image': array([[[161, 255, 0], [161, 255, 0], [161, 255, 0], ..., [155, 255, 0], [155, 255, 0], [155, 255, 0]], [[161, 255, 0], [161, 255, 0], [161, 255, 0], ..., [155, 255, 0], [155, 255, 0], [155, 255, 0]], [[161, 255, 0], [161, 255, 0], [161, 255, 0], ..., [155, 255, 0], [155, 255, 0], [155, 255, 0]], ..., [[182, 255, 0], [182, 255, 0], [182, 255, 0], ..., [183, 255, 0], [183, 255, 0], [183, 255, 0]], [[182, 255, 0], [182, 255, 0], [182, 255, 0], ..., [183, 255, 0], [183, 255, 0], [183, 255, 0]], [[182, 255, 0], [182, 255, 0], [182, 255, 0], ..., [183, 255, 0], [183, 255, 0], [183, 255, 0]]], dtype=uint8), 'reward': 0.0, 'is_first': True, 'is_last': False, 'is_terminal': False}
My code:
import re import warnings import gym import logging import random from typing import Sequence import numpy as np import tensorflow as tf import dreamerv2.api as dv2 from dreamerv2 import common from dreamerv2.agent import Agent from pathlib import Path from agents import BaseAgent logger = logging.getLogger('root') warnings.filterwarnings('ignore', '.*box bound precision lowered.*') class Dreamerv2Agent(BaseAgent): def __init__(self, conf_file: Path, env: str, test_mode: bool, prefix: str, batch: int, model_path: Path = Path("~/logdir/trader"), seed: bool = False): super().__init__(env, test_mode, prefix, batch, model_path, seed) if self.seed: random.seed(0) np.random.seed(0) tf.random.set_seed(0) model_path = model_path.expanduser().absolute() print(f"Model Path: {model_path}") print("Loading config.") config_path = (model_path / 'config.yaml') config = common.Config.load(config_path) self.config = config print("Loading config. Done") env = gym.make(env) replay = common.Replay( model_path / 'train_episodes', **config.replay ) step = common.Counter(replay.stats['total_steps']) env = self.wrapper(env) def per_episode(ep): length = len(ep['reward']) - 1 score = float(ep['reward'].astype(np.float64).sum()) print(f'Episode has {length} steps and return {score:.1f}.') logger.scalar('return', score) logger.scalar('length', length) for key, value in ep.items(): if re.match(config.log_keys_sum, key): logger.scalar(f'sum_{key}', ep[key].sum()) if re.match(config.log_keys_mean, key): logger.scalar(f'mean_{key}', ep[key].mean()) if re.match(config.log_keys_max, key): logger.scalar(f'max_{key}', ep[key].max(0).mean()) logger.add(replay.stats) logger.write() driver = common.Driver([env]) driver.on_episode(per_episode) driver.on_step(lambda tran, worker: step.increment()) driver.on_step(replay.add_step) driver.on_reset(replay.add_step) prefill = max(0, config.prefill - replay.stats['total_steps']) if prefill: print(f'Prefill dataset ({prefill} steps).') random_agent = common.RandomAgent(env.act_space) driver(random_agent, steps=prefill, episodes=1) driver.reset() print(f'Create agent (step: {step.value}).') print(f"Action Space: {env.act_space}") print(f"Observation Space: {env.obs_space}") self.agent = Agent(config, env.obs_space, env.act_space, step) dataset = iter(replay.dataset(**config.dataset)) train_agent = common.CarryOverState(self.agent.train) train_agent(next(dataset)) print('Create agent. Done!') print('Loading checkpoint.') vars = (model_path / 'variables.pkl').absolute() if vars.exists(): self.agent.load(vars) print('Loading checkpoint. Done!') def wrapper(self, env): env = common.GymWrapper(env) env = common.ResizeImage(env) if hasattr(env.act_space['action'], 'n'): env = common.OneHotAction(env) else: env = common.NormalizeAction(env) env = common.TimeLimit(env, self.config.time_limit) return env def get_action(self, observation: Sequence): obs = {k: np.expand_dims(v, 0) for k, v in observation.items()} output, _ = self.agent.policy(obs, mode='eval') output['action'] = tf.squeeze(output['action']) return output
After calling get_action and getting a the action to pass to step from my gym environment (wrapped by the dreamerv2) and this works inside the loop.
But I'm getting always the same action from different observations.
Is something missing from my evaluation method?
Thanks in advanced.
Hi again,
Congrats by excellent work.
My model is improving.
I'm loading the checkpoint with success and trying to predict (calling the policy function) the get an action using this observation format:
My code:
After calling get_action and getting a the action to pass to step from my gym environment (wrapped by the dreamerv2) and this works inside the loop.
But I'm getting always the same action from different observations.
Is something missing from my evaluation method?
Thanks in advanced.