Looks like the actions from the trained policy are out of spec. That might be the reason.
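One way to sanity-check that hypothesis, as a rough sketch only (it reuses config and cql_algorithm from the reproduction script at the bottom of this issue, and the observation value is made up for illustration):

import numpy as np

# Ask the trained policy for an action on a single observation and test
# whether it falls inside the configured action space.
obs = np.array([100.0, 2000.0], dtype=np.float32)  # illustrative observation
action = cql_algorithm.compute_single_action(obs, explore=False)
print(action, config.action_space.contains(np.asarray(action, dtype=np.float32)))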
@DailiZhang2010,
Since your environment does not step through episodes, there is no episode_reward_mean that RLlib could calculate. However, there should be an off_policy_estimator key in your results that estimates the policy's performance. If you still hit an issue with that info, please let us know.
Thanks, Artur.
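For context, a minimal sketch (placeholder data path, not the issue author's exact setup) of how an off-policy estimator is registered on a CQLConfig in Ray 2.2; registering one is what should make that key appear under the evaluation results:

from ray.rllib.algorithms import cql
from ray.rllib.offline.estimators import ImportanceSampling

config = (
    cql.CQLConfig()
    .offline_data(
        input_="dataset",
        input_config={"paths": ["/path/to/offline/json"], "format": "json"},
    )
    .evaluation(
        evaluation_interval=1,
        evaluation_duration=10,
        evaluation_duration_unit="episodes",
        # Evaluation also reads from the offline data.
        evaluation_config={"input": "/path/to/offline/json"},
        # Registering an estimator here is what produces the
        # "off_policy_estimator" block in the evaluation results.
        off_policy_estimation_methods={"is": {"type": ImportanceSampling}},
    )
)

With that in place, each results dict should contain results["evaluation"]["off_policy_estimator"] with one entry per registered estimator (here "is").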
However, I couldn't find the key off_policy_estimator in the results.
Here are the results.
{'evaluation': {'episode_reward_max': nan,
'episode_reward_min': nan,
'episode_reward_mean': nan},
'custom_metrics': {},
'episode_media': {},
'info': {'learner': {'default_policy': {'learner_stats': {'actor_loss': -142.2236328125,
'critic_loss': 1110728832.0,
'alpha_loss': -0.027799051254987717,
'alpha_value': array([1.0038443], dtype=float32),
'log_alpha_value': array([0.00383694], dtype=float32),
'target_entropy': array([-2.], dtype=float32),
'policy_t': -0.16004908084869385,
'mean_q': 235.7708282470703,
'max_q': 477.7535095214844,
'min_q': 77.32129669189453,
'cql_loss': 165.9615478515625},
'model': {},
'custom_metrics': {},
'num_agent_steps_trained': 1024,
'num_grad_updates_lifetime': 14,
'diff_num_grad_updates_vs_sampler_policy': 13}},
'num_env_steps_sampled': 14336,
'num_env_steps_trained': 14336,
'num_agent_steps_sampled': 14336,
'num_agent_steps_trained': 14336,
'last_target_update_ts': 14336,
'num_target_updates': 14},
'sampler_results': {'episode_reward_max': nan,
'episode_reward_min': nan,
'episode_reward_mean': nan,
'episode_len_mean': nan,
'episode_media': {},
'episodes_this_iter': 0,
'policy_reward_min': {},
'policy_reward_max': {},
'policy_reward_mean': {},
'custom_metrics': {},
'hist_stats': {'episode_reward': [], 'episode_lengths': []},
'sampler_perf': {},
'num_faulty_episodes': 0},
'episode_reward_max': nan,
'episode_reward_min': nan,
'episode_reward_mean': nan,
'episode_len_mean': nan,
'episodes_this_iter': 0,
'policy_reward_min': {},
'policy_reward_max': {},
'policy_reward_mean': {},
'hist_stats': {'episode_reward': [], 'episode_lengths': []},
'sampler_perf': {},
'num_faulty_episodes': 0,
'num_healthy_workers': 0,
'num_in_flight_async_reqs': 0,
'num_remote_worker_restarts': 0,
'num_agent_steps_sampled': 14336,
'num_agent_steps_trained': 14336,
'num_env_steps_sampled': 14336,
'num_env_steps_trained': 14336,
'num_env_steps_sampled_this_iter': 6144,
'num_env_steps_trained_this_iter': 6144,
'timesteps_total': 14336,
'num_steps_trained_this_iter': 6144,
'agent_timesteps_total': 14336,
'timers': {'training_iteration_time_ms': 199.16,
'sample_time_ms': 154.09,
'learn_time_ms': 44.27,
'learn_throughput': 23131.017,
'target_net_update_time_ms': 0.718},
'counters': {'num_env_steps_sampled': 14336,
'num_env_steps_trained': 14336,
'num_agent_steps_sampled': 14336,
'num_agent_steps_trained': 14336,
'last_target_update_ts': 14336,
'num_target_updates': 14},
'done': False,
'episodes_total': 0,
'training_iteration': 3,
'trial_id': 'default',
'experiment_id': '9793b9baf00f4096b97a87913f3b11d0',
'date': '2023-02-13_10-47-41',
'timestamp': 1676306861,
'time_this_iter_s': 1.1912569999694824,
'time_total_s': 3.3772130012512207,
'pid': 36959,
'hostname': 'm-yxm544xqhk',
'node_ip': '127.0.0.1',
'config': {'extra_python_environs_for_driver': {},
'extra_python_environs_for_worker': {},
'num_gpus': 0,
'num_cpus_per_worker': 1,
'num_gpus_per_worker': 0,
'_fake_gpus': False,
'custom_resources_per_worker': {},
'placement_strategy': 'PACK',
'eager_tracing': False,
'eager_max_retraces': 20,
'tf_session_args': {'intra_op_parallelism_threads': 2,
'inter_op_parallelism_threads': 2,
'gpu_options': {'allow_growth': True},
'log_device_placement': False,
'device_count': {'CPU': 1},
'allow_soft_placement': True},
'local_tf_session_args': {'intra_op_parallelism_threads': 8,
'inter_op_parallelism_threads': 8},
'env': None,
'env_config': {},
'observation_space': Box(0.0, [ 5000. 100000.], (2,), float32),
'action_space': Box(0.0, 5000.0, (2,), float32),
'env_task_fn': None,
'render_env': False,
'clip_rewards': None,
'normalize_actions': True,
'clip_actions': True,
'disable_env_checking': False,
'num_envs_per_worker': 1,
'sample_collector': ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector,
'sample_async': False,
'enable_connectors': False,
'rollout_fragment_length': 'auto',
'batch_mode': 'truncate_episodes',
'remote_worker_envs': False,
'remote_env_batch_wait_ms': 0,
'validate_workers_after_construction': True,
'ignore_worker_failures': False,
'recreate_failed_workers': False,
'restart_failed_sub_environments': False,
'num_consecutive_worker_failures_tolerance': 100,
'horizon': None,
'soft_horizon': False,
'no_done_at_end': False,
'preprocessor_pref': 'deepmind',
'observation_filter': 'NoFilter',
'synchronize_filters': True,
'compress_observations': False,
'enable_tf1_exec_eagerly': False,
'sampler_perf_stats_ema_coef': None,
'gamma': 0.99,
'lr': 0.001,
'train_batch_size': 1024,
'model': {'_use_default_native_models': False,
'_disable_preprocessor_api': False,
'_disable_action_flattening': False,
'fcnet_hiddens': [256, 256],
'fcnet_activation': 'tanh',
'conv_filters': None,
'conv_activation': 'relu',
'post_fcnet_hiddens': [],
'post_fcnet_activation': 'relu',
'free_log_std': False,
'no_final_linear': False,
'vf_share_layers': True,
'use_lstm': False,
'max_seq_len': 20,
'lstm_cell_size': 256,
'lstm_use_prev_action': False,
'lstm_use_prev_reward': False,
'_time_major': False,
'use_attention': False,
'attention_num_transformer_units': 1,
'attention_dim': 64,
'attention_num_heads': 1,
'attention_head_dim': 32,
'attention_memory_inference': 50,
'attention_memory_training': 50,
'attention_position_wise_mlp_dim': 32,
'attention_init_gru_gate_bias': 2.0,
'attention_use_n_prev_actions': 0,
'attention_use_n_prev_rewards': 0,
'framestack': True,
'dim': 84,
'grayscale': False,
'zero_mean': True,
'custom_model': None,
'custom_model_config': {},
'custom_action_dist': None,
'custom_preprocessor': None,
'lstm_use_prev_action_reward': -1},
'optimizer': {},
'max_requests_in_flight_per_sampler_worker': 2,
'explore': True,
'exploration_config': {'type': 'StochasticSampling'},
'input_config': {'paths': ['/Users/d0z02a0/Desktop/causalAnalysis/learning-agent/ibg_agents/data_out/opd-out'],
'format': 'json',
'num_cpus_per_read_task': 1,
'parallelism': 1},
'actions_in_input_normalized': False,
'postprocess_inputs': False,
'shuffle_buffer_size': 0,
'output': None,
'output_config': {},
'output_compress_columns': ['obs', 'new_obs'],
'output_max_file_size': 67108864,
'offline_sampling': False,
'evaluation_interval': None,
'evaluation_duration': 10,
'evaluation_duration_unit': 'episodes',
'evaluation_sample_timeout_s': 180.0,
'evaluation_parallel_to_training': False,
'evaluation_config': <ray.rllib.algorithms.cql.cql.CQLConfig at 0x3df7f72b0>,
'off_policy_estimation_methods': {},
'ope_split_batch_by_episode': True,
'evaluation_num_workers': 0,
'always_attach_evaluation_results': True,
'enable_async_evaluation': False,
'in_evaluation': False,
'sync_filters_on_rollout_workers_timeout_s': 60.0,
'keep_per_episode_custom_metrics': False,
'metrics_episode_collection_timeout_s': 60.0,
'metrics_num_episodes_for_smoothing': 100,
'min_time_s_per_iteration': 1,
'min_train_timesteps_per_iteration': 1000,
'min_sample_timesteps_per_iteration': 0,
'export_native_model_files': False,
'logger_creator': None,
'logger_config': None,
'log_level': 'INFO',
'log_sys_usage': True,
'fake_sampler': False,
'seed': None,
'worker_cls': None,
'_tf_policy_handles_more_than_one_loss': False,
'_disable_preprocessor_api': False,
'_disable_action_flattening': False,
'_disable_execution_plan_api': True,
'simple_optimizer': True,
'replay_sequence_length': None,
'twin_q': True,
'q_model_config': {'fcnet_hiddens': [64, 64],
'fcnet_activation': 'relu',
'post_fcnet_hiddens': [],
'post_fcnet_activation': None,
'custom_model': None,
'custom_model_config': {}},
'policy_model_config': {'fcnet_hiddens': [64, 64],
'fcnet_activation': 'relu',
'post_fcnet_hiddens': [],
'post_fcnet_activation': None,
'custom_model': None,
'custom_model_config': {}},
'tau': 0.005,
'initial_alpha': 1.0,
'target_entropy': 'auto',
'n_step': 3,
'replay_buffer_config': {'_enable_replay_buffer_api': True,
'type': 'MultiAgentPrioritizedReplayBuffer',
'capacity': 1000000,
'prioritized_replay': False,
'prioritized_replay_alpha': 0.6,
'prioritized_replay_beta': 0.4,
'prioritized_replay_eps': 1e-06,
'worker_side_prioritization': False},
'store_buffer_in_checkpoints': False,
'training_intensity': None,
'optimization': {'actor_learning_rate': 0.0003,
'critic_learning_rate': 0.0003,
'entropy_learning_rate': 0.0003},
'grad_clip': None,
'target_network_update_freq': 1,
'num_steps_sampled_before_learning_starts': 256,
'_deterministic_loss': False,
'_use_beta_distribution': False,
'use_state_preprocessor': -1,
'worker_side_prioritization': -1,
'bc_iters': 0,
'temperature': 1.0,
'num_actions': 10,
'lagrangian': False,
'lagrangian_thresh': 5.0,
'min_q_weight': 5.0,
'input': 'dataset',
'multiagent': {'policies': {'default_policy': <ray.rllib.policy.policy.PolicySpec at 0x3df84ff10>},
'policy_mapping_fn': <function ray.rllib.algorithms.algorithm_config.AlgorithmConfig.init.
Would this also explain why "episodes_this_iter" is not populated, i.e., that it only gets populated when using a simulator and not an offline dataset from a file? I had a similar question to the OP. Thanks.
Yes, that is correct, @joshuaspear. Closing this for now as it isn't a bug.
What happened + What you expected to happen
I have the offline data in path_out, and I use the same offline data for both training and evaluation (I know that is not right, but it is just for the sake of making it work :-) ). The code is simple; the majority of the lines just set up the config and the evaluation_config.
The eval_results["episode_reward_mean"] is None. Please see the script below. I cannot share the data, but I wonder if anyone can spot what I set up wrong. Thanks.
Versions / Dependencies
Ray: 2.2.0
OS: macOS Monterey
Python: 3.9
Reproduction script
import gym
import numpy as np

from ray.rllib.algorithms import cql
from ray.rllib.offline.estimators import ImportanceSampling

# Note: "path_out" (the directory with the offline JSON data) and "args"
# (the script's CLI arguments) are defined elsewhere in the full script.

evaluation_config = (
    cql.CQLConfig()
    .offline_data(
        input_="dataset",
        input_config={
            "paths": [path_out],
            "format": "json",
        },
    )
    .evaluation(
        evaluation_interval=1,
        evaluation_duration=10,
        evaluation_num_workers=1,
        evaluation_duration_unit="episodes",
        evaluation_config={"input": path_out},
        off_policy_estimation_methods={
            "is": {"type": ImportanceSampling},
        },
    )
)
evaluation_config.action_space = gym.spaces.Box(
    low=np.array([0, 0]), high=np.array([5000, 5000]), dtype=np.float32)
evaluation_config.observation_space = gym.spaces.Box(
    low=np.array([0, 0]), high=np.array([5000, 50000]), dtype=np.float32)

config = (
    cql.CQLConfig()
    .framework(framework="torch")
    .rollouts(num_rollout_workers=0)
    .training(
        n_step=3,
        bc_iters=0,
        clip_actions=False,
        tau=0.005,
        target_entropy="auto",
        q_model_config={
            "fcnet_hiddens": [256, 256],
            "fcnet_activation": "relu",
        },
        policy_model_config={
            "fcnet_hiddens": [256, 256],
            "fcnet_activation": "relu",
        },
        optimization_config={
            "actor_learning_rate": 3e-4,
            "critic_learning_rate": 3e-4,
            "entropy_learning_rate": 3e-4,
        },
        train_batch_size=256,
        target_network_update_freq=1,
        num_steps_sampled_before_learning_starts=256,
    )
    .reporting(min_train_timesteps_per_iteration=1000)
    .debugging(log_level="INFO")
    .offline_data(
        input_="dataset",
        input_config={
            "paths": [path_out],
            "format": "json",
        },
    )
)
config.action_space = gym.spaces.Box(
    low=np.array([0, 0]), high=np.array([5000, 5000]), dtype=np.float32)
config.observation_space = gym.spaces.Box(
    low=np.array([0, 0]), high=np.array([5000, 100000]), dtype=np.float32)

config.evaluation_config = evaluation_config
config.always_attach_evaluation_results = True

num_iterations = 5
min_reward = -300

cql_algorithm = cql.CQL(config=config)
learnt = False
for i in range(num_iterations):
    print(f"Iter {i}")
    eval_results = cql_algorithm.train().get("evaluation")
    if eval_results:
        print("... R={}".format(eval_results["episode_reward_mean"]))
        if eval_results["episode_reward_mean"] >= min_reward:
            if args.as_test:
                print("Test passed after {} iterations.".format(i))
                quit(0)
            learnt = True
            break
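As a follow-up to the loop above, a hedged sketch of how the off-policy estimate could be read instead of episode_reward_mean, which stays NaN when no real episodes are run (the "is" key assumes the ImportanceSampling estimator registered in the evaluation config; the exact metric fields inside it depend on the Ray version):

# Sketch only: inspect the off-policy estimator output instead of the
# episode-based reward, which cannot be computed from offline batches alone.
results = cql_algorithm.train()
eval_results = results.get("evaluation", {})
ope_results = eval_results.get("off_policy_estimator", {})
if ope_results:
    # One entry per estimator registered in off_policy_estimation_methods.
    print("IS estimate:", ope_results.get("is"))
else:
    print("No off-policy estimator output in this iteration's results.")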
Issue Severity
None