Hi Danijar,
I'm training with dreamerv2 successfully, and I'm getting this result:
```
[5847513] return 6.12 / length 151 / total_steps 5.8e6 / total_episodes 3.9e4 / loaded_steps 1e5 / loaded_episodes 662
Save checkpoint with 85 tensors and 32333580 parameters.
[5847536] kl_loss 0.67 / image_loss 1.1e4 / reward_loss 0.92 / discount_loss 0.06 / model_kl 0.67 / prior_ent 1.92 / post_ent 1.14 / model_loss 1.1e4 / model_grad_norm 133.28 / actor_loss -1.5e-5 / actor_grad_norm 1.5e-3 / critic_loss 0.82 / critic_grad_norm 0.07 / reward_mean 0.04 / reward_std 0.03 / reward_normed_mean 0.04 / reward_normed_std 0.03 / critic_slow 1.37 / critic_target 1.35 / actor_ent 2e-3 / actor_ent_scale 2e-3 / critic 1.37 / fps 44.95
Episode has 151 steps and return 6.1.
```
Is the return of 6.1 the cumulative sum of rewards over the episode?
After training, I run the code below and receive a cumulative reward of only 2.153186.

This is the code I'm using to evaluate:
```python
import random
import re
import warnings
from pathlib import Path
from typing import Sequence

import gym
import numpy as np
import tensorflow as tf
from absl import logging

import dreamerv2.api as dv2
from dreamerv2 import common
from dreamerv2.agent import Agent

from agents import BaseAgent

# logger = logging.getLogger('root')
# warnings.filterwarnings('ignore', '.*box bound precision lowered.*')


class Dreamerv2Agent(BaseAgent):

    def __init__(self,
                 conf_file: Path,
                 env: str,
                 test_mode: bool,
                 prefix: str,
                 batch: int,
                 model_path: Path = Path("~/logdir/trader"),
                 seed: bool = False):
        super().__init__(env, test_mode, prefix, batch, model_path, seed)
        if self.seed:
            random.seed(0)
            np.random.seed(0)
            tf.random.set_seed(0)

        model_path = model_path.expanduser().absolute()
        logging.error(f"Model Path: {model_path}")

        logging.error("Loading config.")
        config_path = model_path / 'config.yaml'
        config = common.Config.load(config_path)
        self.config = config
        logging.error("Loading config. Done")

        env = gym.make(env)
        replay = common.Replay(model_path / 'train_episodes', **config.replay)
        step = common.Counter(replay.stats['total_steps'])
        env = self.wrapper(env)

        def per_episode(ep):
            length = len(ep['reward']) - 1
            score = float(ep['reward'].astype(np.float64).sum())
            logging.error(f'Episode has {length} steps and return {score:.1f}.')
            # logger.scalar('return', score)
            # logger.scalar('length', length)
            for key, value in ep.items():
                if re.match(config.log_keys_sum, key):
                    logging.error(f'sum_{key}: {ep[key].sum()}')
                if re.match(config.log_keys_mean, key):
                    logging.error(f'mean_{key}: {ep[key].mean()}')
                if re.match(config.log_keys_max, key):
                    logging.error(f'max_{key}: {ep[key].max(0).mean()}')
            # logger.add(replay.stats)
            # logger.write()

        driver = common.Driver([env])
        driver.on_episode(per_episode)
        driver.on_step(lambda tran, worker: step.increment())
        driver.on_step(replay.add_step)
        driver.on_reset(replay.add_step)

        prefill = max(0, config.prefill - replay.stats['total_steps'])
        if prefill:
            print(f'Prefill dataset ({prefill} steps).')
            random_agent = common.RandomAgent(env.act_space)
            driver(random_agent, steps=prefill, episodes=1)
            driver.reset()

        logging.error(f'Create agent (step: {step.value}).')
        logging.error(f"Action Space: {env.act_space}")
        logging.error(f"Observation Space: {env.obs_space}")
        self.agent = Agent(config, env.obs_space, env.act_space, step)
        dataset = iter(replay.dataset(**config.dataset))
        train_agent = common.CarryOverState(self.agent.train)
        train_agent(next(dataset))
        logging.error('Create agent. Done!')

        logging.error('Loading checkpoint.')
        vars = (model_path / 'variables.pkl').absolute()
        if vars.exists():
            self.agent.load(vars)
        logging.error('Loading checkpoint. Done!')

    def wrapper(self, env):
        env = common.GymWrapper(env)
        env = common.ResizeImage(env)
        if hasattr(env.act_space['action'], 'n'):
            env = common.OneHotAction(env)
        else:
            env = common.NormalizeAction(env)
        env = common.TimeLimit(env, self.config.time_limit)
        return env

    def get_action(self, observation: Sequence):
        obs = {k: np.expand_dims(v, 0) for k, v in observation.items()}
        output, _ = self.agent.policy(obs, mode='eval')
        output['action'] = tf.squeeze(output['action'])
        return output
```
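For reference, this is roughly how I drive `get_action` from the outer loop (a simplified sketch, not my exact harness; it assumes `agent` is a constructed `Dreamerv2Agent` and `env` went through the same `wrapper()` stack, so `step()` takes a `{'action': ...}` dict and returns a flat obs dict containing `'reward'` and `'is_last'`):

```python
# Simplified evaluation loop (sketch; assumes the dreamerv2 wrapper stack above).
obs = env.reset()
total_reward = 0.0
while not obs['is_last']:
    output = agent.get_action(obs)
    # The wrapped env expects a dict action and returns the next obs dict.
    obs = env.step({'action': output['action'].numpy()})
    total_reward += obs['reward']
print(f'Episode return: {total_reward:.6f}')
```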
My config:
```yaml
action_repeat: 1
actor: {act: elu, dist: auto, layers: 4, min_std: 0.1, norm: none, units: 400}
actor_ent: 0.002
actor_grad: auto
actor_grad_mix: 0.1
actor_opt: {clip: 100, eps: 1e-05, lr: 8e-05, opt: adam, wd: 1e-06}
atari_grayscale: false
clip_rewards: tanh
critic: {act: elu, dist: mse, layers: 4, norm: none, units: 400}
critic_opt: {clip: 100, eps: 1e-05, lr: 0.0002, opt: adam, wd: 1e-06}
dataset: {batch: 16, length: 50}
decoder:
  act: elu
  cnn_depth: 48
  cnn_kernels: [5, 5, 6, 6]
  cnn_keys: .*
  mlp_keys: .*
  mlp_layers: [400, 400, 400, 400]
  norm: none
disag_action_cond: true
disag_log: false
disag_models: 10
disag_offset: 1
disag_target: stoch
discount: 0.99
discount_head: {act: elu, dist: binary, layers: 4, norm: none, units: 400}
discount_lambda: 0.95
dmc_camera: -1
encoder:
  act: elu
  cnn_depth: 48
  cnn_kernels: [4, 4, 4, 4]
  cnn_keys: .*
  mlp_keys: .*
  mlp_layers: [400, 400, 400, 400]
  norm: none
envs: 1
envs_parallel: none
eval_eps: 1
eval_every: 1000.0
eval_noise: 0.0
eval_state_mean: false
expl_behavior: greedy
expl_extr_scale: 0.0
expl_head: {act: elu, dist: mse, layers: 4, norm: none, units: 400}
expl_intr_scale: 1.0
expl_model_loss: kl
expl_noise: 0.0
expl_opt: {clip: 100, eps: 1e-05, lr: 0.0003, opt: adam, wd: 1e-06}
expl_reward_norm: {eps: 1e-08, momentum: 1.0, scale: 1.0}
expl_until: 0
grad_heads: [decoder, reward, discount]
imag_horizon: 15
jit: true
kl: {balance: 0.8, forward: false, free: 0.0, free_avg: true}
log_every: 10000.0
log_keys_max: ^$
log_keys_mean: ^$
log_keys_sum: ^$
log_keys_video: [image]
logdir: ~/logdir/trader
loss_scales: {discount: 1.0, kl: 1.0, proprio: 1.0, reward: 1.0}
model_opt: {clip: 100, eps: 1e-05, lr: 0.0001, opt: adam, wd: 1e-06}
precision: 16
pred_discount: true
prefill: 10000
pretrain: 1
render_size: [64, 64]
replay: {capacity: 100000.0, maxlen: 50, minlen: 50, ongoing: false, prioritize_ends: true}
reward_head: {act: elu, dist: mse, layers: 4, norm: none, units: 400}
reward_norm: {eps: 1e-08, momentum: 1.0, scale: 1.0}
rssm: {act: elu, deter: 1024, discrete: 32, ensemble: 1, hidden: 1024, min_std: 0.1, norm: none, std_act: sigmoid2, stoch: 32}
seed: 0
slow_baseline: true
slow_target: true
slow_target_fraction: 1
slow_target_update: 100
steps: 100000000.0
task: dmc_walker_walk
time_limit: 0
train_every: 1000
train_steps: 1
```
Is there something I'm missing to get a better evaluation?
Best regards, Fernando Ribeiro
I found the problem: the policy method needs the state parameter (the recurrent state returned by the previous call) to predict properly.
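Roughly, the fix looks like this (a sketch, not the exact code; `_state` is an attribute I add myself and initialize to `None` in `__init__`, and the `is_first` key is assumed to come from the dreamerv2 `GymWrapper`):

```python
def get_action(self, observation: Sequence):
    obs = {k: np.expand_dims(v, 0) for k, v in observation.items()}
    if obs.get('is_first', [False])[0]:
        # New episode: start from a fresh recurrent state.
        self._state = None
    # Carry the recurrent state across steps instead of discarding it.
    output, self._state = self.agent.policy(obs, self._state, mode='eval')
    output['action'] = tf.squeeze(output['action'])
    return output
```

Without carrying the state, every step is predicted from a freshly initialized latent, which explains the much lower return at evaluation time.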