ray-project / ray

Ray is an AI compute engine. Ray consists of a core distributed runtime and a set of AI Libraries for accelerating ML workloads.
https://ray.io
Apache License 2.0

How to specify seeds and pendulum-ddpg.yaml works poorly #3678

Closed: joneswong closed this issue 5 years ago

joneswong commented 5 years ago

System information

Describe the problem

Source code / logs

The following is the last line of result.json:

{"episode_reward_max": -1.8801368400780225, "episode_reward_min": -1518.8103287551426, "episode_reward_mean": -477.8532551843094, "episode_len_mean": 200.0, "episodes_this_iter": 3, "policy_reward_mean": {}, "custom_metrics": {}, "num_metric_batches_dropped": 0, "info": {"min_exploration": 0.0020000000000000018, "max_exploration": 0.0020000000000000018, "num_target_updates": 22800, "num_steps_trained": 1427200, "num_steps_sampled": 22800, "sample_time_ms": 1.715, "replay_time_ms": 13.885, "grad_time_ms": 9.886, "update_time_ms": 0.002, "opt_peak_throughput": 6473.8, "opt_samples": 64.0, "learner": {}}, "timesteps_this_iter": 600, "done": false, "timesteps_total": 22800, "episodes_total": 114, "experiment_id": "46fd7f20db4f49babb25d5480369fb91", "date": "2019-01-03_11-44-55", "timestamp": 1546487095, "training_iteration": 38, "time_this_iter_s": 15.73029088973999, "time_total_s": 576.9666073322296, "pid": 28297, "hostname": "gpu0222.et2", "node_ip": "10.183.24.88", "config": {"monitor": false, "log_level": "INFO", "callbacks": {"on_episode_start": null, "on_episode_step": null, "on_episode_end": null, "on_sample_end": null, "on_train_result": null}, "model": {"conv_filters": null, "conv_activation": "relu", "fcnet_activation": "tanh", "fcnet_hiddens": [256, 256], "free_log_std": false, "squash_to_range": false, "use_lstm": false, "max_seq_len": 20, "lstm_cell_size": 256, "lstm_use_prev_action_reward": false, "framestack": true, "dim": 84, "channel_major": false, "grayscale": false, "zero_mean": true, "custom_preprocessor": null, "custom_model": null, "custom_options": {}}, "optimizer": {"buffer_size": 10000, "prioritized_replay": true, "prioritized_replay_alpha": 0.6, "prioritized_replay_beta": 0.4, "prioritized_replay_eps": 1e-06, "sample_batch_size": 1, "train_batch_size": 64, "learning_starts": 500}, "gamma": 0.99, "horizon": null, "env_config": {}, "env": "Pendulum-v0", "clip_rewards": false, "clip_actions": true, "preprocessor_pref": "deepmind", "num_workers": 0, "num_gpus": 0, "num_cpus_per_worker": 1, "num_gpus_per_worker": 0, "custom_resources_per_worker": {}, "num_cpus_for_driver": 1, "num_envs_per_worker": 1, "sample_batch_size": 1, "train_batch_size": 64, "batch_mode": "truncate_episodes", "sample_async": false, "observation_filter": "NoFilter", "synchronize_filters": true, "tf_session_args": {"intra_op_parallelism_threads": 2, "inter_op_parallelism_threads": 2, "gpu_options": {"allow_growth": true}, "log_device_placement": false, "device_count": {"CPU": 1}, "allow_soft_placement": true}, "local_evaluator_tf_session_args": {"intra_op_parallelism_threads": 8, "inter_op_parallelism_threads": 8}, "compress_observations": false, "collect_metrics_timeout": 180, "input": "sampler", "input_evaluation": null, "output": null, "output_compress_columns": ["obs", "new_obs"], "output_max_file_size": 67108864, "multiagent": {"policy_graphs": {}, "policy_mapping_fn": null, "policies_to_train": null}, "twin_q": false, "policy_delay": 1, "smooth_target_policy": false, "act_noise": 0.1, "target_noise": 0.2, "noise_clip": 0.5, "actor_hiddens": [64, 64], "actor_hidden_activation": "relu", "critic_hiddens": [64, 64], "critic_hidden_activation": "relu", "n_step": 1, "schedule_max_timesteps": 100000, "timesteps_per_iteration": 600, "exploration_fraction": 0.1, "exploration_final_eps": 0.02, "noise_scale": 0.1, "exploration_theta": 0.15, "exploration_sigma": 0.2, "target_network_update_freq": 0, "tau": 0.001, "buffer_size": 10000, "prioritized_replay": true, "prioritized_replay_alpha": 0.6, 
"prioritized_replay_beta": 0.4, "prioritized_replay_eps": 1e-06, "lr": 0.001, "actor_loss_coeff": 0.1, "critic_loss_coeff": 1.0, "use_huber": true, "huber_threshold": 1.0, "l2_reg": 1e-06, "grad_norm_clipping": null, "learning_starts": 500, "optimizer_class": "SyncReplayOptimizer", "per_worker_exploration": false, "worker_side_prioritization": false, "min_iter_time_s": 1}, "time_since_restore": 576.9666073322296, "timesteps_since_restore": 22800, "iterations_since_restore": 38}

ericl commented 5 years ago

There isn't currently a way to specify seeds (see issue https://github.com/ray-project/ray/issues/2776).
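A possible stopgap until that issue is resolved (this is not an official RLlib option; the env name and seed value below are made up for illustration) is to seed the driver-side RNGs yourself and register a custom env creator that seeds the gym environment. Note this does not cover worker-side TensorFlow graphs, so runs will still not be fully reproducible:

```python
# Sketch of a partial seeding workaround, assuming the old gym API where
# env.seed() exists. Names like "seeded_pendulum" and SEED are illustrative.
import random

import gym
import numpy as np
from ray.tune.registry import register_env

SEED = 0  # hypothetical fixed seed


def make_seeded_pendulum(env_config):
    env = gym.make("Pendulum-v0")
    env.seed(env_config.get("seed", SEED))  # seeds the env's internal RNG
    return env


register_env("seeded_pendulum", make_seeded_pendulum)

# Seed driver-side RNGs as well; worker processes are not covered by this.
random.seed(SEED)
np.random.seed(SEED)
```

One could then set "env": "seeded_pendulum" in the DDPG config in place of Pendulum-v0.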

pendulum-ddpg works OK for me; it just seems to take ~30k timesteps (./train.py -f tuned_examples/pendulum-ddpg.yaml):

[screenshot: training reward curve]

joneswong commented 5 years ago

Oh, my machine is relatively slow; within 600 s it only samples around 23k timesteps. Thanks!

I think enabling seed specification is quite important for RL experiments. We should address that issue as soon as possible.