Closed Alex-Golod closed 11 months ago
Good morning!
I have also found something strange with the model weights after restoring the algorithm from a checkpoint.
With the updated Restore.py:
from tqdm import trange
import numpy as np
from ray.rllib.algorithms import Algorithm
env_name = "CartPole-v1" algo = Algorithm.from_checkpoint("-".join(("checkpoint", env_name))) rl_module = algo.workers.local_worker().module weights=weights2={} for layer in rl_module.layers: weights[layer.name] = layer.get_weights() # returns a list of numpy arrays print(f"Weights of layer {layer.name}: {weights[layer.name]}")
algo2 = Algorithm.from_checkpoint("-".join(("checkpoint", env_name)))
rl_module2 = algo2.workers.local_worker().module
for layer in rl_module2.layers:
    weights2[layer.name] = layer.get_weights()  # returns a list of numpy arrays
    print(f"Weights of layer {layer.name}: {weights2[layer.name]}")
for layer in rl_module2.layers:
    comparison_result = np.array_equal(weights[layer.name], weights2[layer.name])
    print(f"Weights of layer {layer.name} comparison = {comparison_result}")
for epoch in range(10):
    result = algo.train()
    print("epoch=%(epoch)d reward_max=%(reward_max)f reward_mean=%(reward_mean)f" % {
        "epoch": epoch,
        "reward_max": result["episode_reward_max"],
        "reward_mean": result["episode_reward_mean"],
    })
I get the following results:
Weights of layer vector_encoder comparison = False
Weights of layer vector_decoder comparison = False
Weights of layer world_model comparison = False
Weights of layer actor comparison = False
Weights of layer critic comparison = False
Weights of layer dreamer_model comparison = False
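Note that layer.get_weights() returns a list of arrays, so calling np.array_equal on the two whole lists relies on NumPy being able to stack them into a single array, and a single NaN entry already makes the result False (np.array_equal does not treat NaN as equal by default). A per-array comparison makes the result easier to interpret. This is only a sketch, reusing the rl_module2, weights, and weights2 objects from the snippet above:

# Sketch: compare the two restored modules array by array instead of list vs. list.
for layer in rl_module2.layers:
    arrays1, arrays2 = weights[layer.name], weights2[layer.name]
    same = (
        len(arrays1) == len(arrays2)
        and all(np.array_equal(a1, a2) for a1, a2 in zip(arrays1, arrays2))
    )
    print(f"Weights of layer {layer.name} identical per array = {same}")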
For the PPO algorithm the picture is a bit different.
TrainPPO.py
import gymnasium as gym
import numpy as np
from tqdm import trange
from ray.rllib.algorithms.ppo import PPOConfig
env_name = "CartPole-v1"
config = PPOConfig()
config = config.training(
    gamma=0.9,
    lr=0.01,
    kl_coeff=0.3,
    train_batch_size=128,
)
config = config.resources(num_gpus=0)
config = config.rollouts(num_rollout_workers=1)
algo = config.build(env="CartPole-v1")
iterator = trange(100)
for epoch in iterator:
    result = algo.train()
    iterator.set_postfix({
        "reward_max": result["episode_reward_max"],
        "reward_mean": result["episode_reward_mean"],
    })
print(result)
save_result = algo.save("-".join(("checkpointPPO", env_name)))
path_to_checkpoint = save_result.checkpoint.path
print(
    "An Algorithm checkpoint has been created inside directory: "
    f"'{path_to_checkpoint}'."
)
algo.stop()
RestorePPO.py
from tqdm import trange
import numpy as np
from ray.rllib.algorithms import Algorithm
env_name = "CartPole-v1" algo1 = Algorithm.from_checkpoint("-".join(("checkpointPPO", env_name))) ppo_policy1= algo1.workers.local_worker().get_policy() ppo_model_weights1 = ppo_policy1.get_weights() print(ppo_model_weights1) # Do something with
algo2 = Algorithm.from_checkpoint("-".join(("checkpointPPO", env_name)))
ppo_policy2 = algo2.workers.local_worker().get_policy()
ppo_model_weights2 = ppo_policy2.get_weights()
print(ppo_model_weights2)
for key in ppo_model_weights1.keys():
    comparison_result = np.array_equal(ppo_model_weights1[key], ppo_model_weights2[key])
    print(f"Weights of layer {key} comparison = {comparison_result}")
for epoch in range(10):
    result = algo1.train()
    print("epoch=%(epoch)d reward_max=%(reward_max)f reward_mean=%(reward_mean)f" % {
        "epoch": epoch,
        "reward_max": result["episode_reward_max"],
        "reward_mean": result["episode_reward_mean"],
    })
Results:
Weights of layer encoder.actor_encoder.net.mlp.0.weight comparison = True
Weights of layer encoder.actor_encoder.net.mlp.0.bias comparison = True
Weights of layer encoder.actor_encoder.net.mlp.2.weight comparison = True
Weights of layer encoder.actor_encoder.net.mlp.2.bias comparison = True
Weights of layer encoder.critic_encoder.net.mlp.0.weight comparison = True
Weights of layer encoder.critic_encoder.net.mlp.0.bias comparison = True
Weights of layer encoder.critic_encoder.net.mlp.2.weight comparison = True
Weights of layer encoder.critic_encoder.net.mlp.2.bias comparison = True
Weights of layer pi.net.mlp.0.weight comparison = True
Weights of layer pi.net.mlp.0.bias comparison = True
Weights of layer vf.net.mlp.0.weight comparison = True
Weights of layer vf.net.mlp.0.bias comparison = True
Thanks for the fix, @simonsays1980!
What happened + What you expected to happen
Good morning!
As in #40347, I have the same problem with the DreamerV3 algorithm.
Using the train.py script below, I ran 500 training iterations and got rewards such as: 500/500 [57:15<00:00, 6.87s/it, reward_max=238, reward_mean=238], 494/500 [56:35<00:39, 6.62s/it, reward_max=173, reward_mean=173], 490/500 [56:08<01:02, 6.23s/it, reward_max=229, reward_mean=229]. After that I saved the algorithm to a checkpoint: "An Algorithm checkpoint has been created inside directory: 'checkpoint-CartPole-v1'."
After that I restored the algorithm from that checkpoint and ran 10 more training iterations with the restore.py script. I got the following rewards:
epoch=0 reward_max=40.000000 reward_mean=20.058824
epoch=1 reward_max=19.000000 reward_mean=18.666667
epoch=2 reward_max=23.000000 reward_mean=19.000000
epoch=3 reward_max=22.000000 reward_mean=16.250000
epoch=4 reward_max=21.000000 reward_mean=14.600000
epoch=5 reward_max=15.000000 reward_mean=13.500000
epoch=6 reward_max=20.000000 reward_mean=13.800000
epoch=7 reward_max=37.000000 reward_mean=27.000000
epoch=8 reward_max=27.000000 reward_mean=14.400000
epoch=9 reward_max=17.000000 reward_mean=13.800000
Please help me solve this problem. I set Issue Severity to "High" because we cannot restore our algorithms from checkpoints to continue training and have to train them from scratch whenever training is interrupted for any reason. Thank you in advance.
Kind regards, Alexander.
Versions / Dependencies
dependencies = [
    "numpy>=1.26.1",
    "black>=23.10.1",
    "ruff>=0.1.4",
    "isort>=5.12.0",
    "pytest>=7.4.3",
    "pdm>=2.10.1",
    "ray[all,tune]==2.7.1",
    "torch>=2.1.0",
    "mlflow>=2.8.0",
    "ipython>=8.17.2",
    "grpcio==1.53.0",
    "tqdm>=4.66.1",
    "pygame>=2.5.2",
    "accelerate>=0.24.1",
    "pandas>=2.1.2",
    "scipy>=1.11.3",
    "statsmodels>=0.14.0",
    "matplotlib>=3.8.1",
    "seaborn>=0.13.0",
    "pytorch-lightning>=2.1.1",
    "optuna>=3.4.0",
    "PyArrow>=13.0.0",
    "requests>=2.31.0",
    "docker>=6.1.3",
    "gpustat>=1.1.1",
    "pytest-cov>=4.1.0",
    "py-spy>=0.3.14",
    "boto3>=1.28.82",
    "lz4>=4.3.2",
    "tensorflow>=2.14.0",
    "tensorflow-probability>=0.22.1",
    "dm-tree>=0.1.8",
    "shimmy[gym]>=1.3.0",
    "supersuit>=3.9.0",
    "pettingzoo>=1.24.1",
    "scikit-image>=0.22.0",
    "pre-commit>=3.5.0",
    "yq>=3.2.3",
    "gdown>=4.7.1",
    "gym>=0.26.2",
]
Reproduction script
train.py
import gymnasium as gym
import numpy as np
from tqdm import trange
from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config
env_name = "CartPole-v1"
config = (
    DreamerV3Config()
    .environment(env_name)
    .training(
        model_size="XS",
        training_ratio=64,
    )
    .rollouts(
        rollout_fragment_length=64,
    )
)
algo = config.build()
iterator = trange(500)
for epoch in iterator:
    result = algo.train()
    iterator.set_postfix({
        "reward_max": result["episode_reward_max"],
        "reward_mean": result["episode_reward_mean"],
    })
print(result)
save_result = algo.save("-".join(("checkpoint", env_name)))
path_to_checkpoint = save_result.checkpoint.path
print(
    "An Algorithm checkpoint has been created inside directory: "
    f"'{path_to_checkpoint}'."
)
algo.stop()
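For what it's worth, one way to make the post-restore comparison unambiguous would be to also dump the module weights to disk at checkpoint time, just before algo.stop(). This is only a sketch: it assumes the DreamerV3 RLModule exposes Keras-style layers with get_weights(), as in the comparison snippets earlier in this thread, and the file name weights_at_save.npz is made up for illustration.

# Sketch: record the exact weights present at checkpoint time so they can
# be compared against the restored module later (place before algo.stop()).
import numpy as np

rl_module = algo.workers.local_worker().module
flat = {}
for layer in rl_module.layers:
    for i, arr in enumerate(layer.get_weights()):
        flat[f"{layer.name}__{i}"] = arr
np.savez("weights_at_save.npz", **flat)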
restore.py
from tqdm import trange
from ray.rllib.algorithms import Algorithm
env_name = "CartPole-v1" algo = Algorithm.from_checkpoint("-".join(("checkpoint", env_name)))
for epoch in range(10):
    result = algo.train()
    print("epoch=%(epoch)d reward_max=%(reward_max)f reward_mean=%(reward_mean)f" % {
        "epoch": epoch,
        "reward_max": result["episode_reward_max"],
        "reward_mean": result["episode_reward_mean"],
    })
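Correspondingly, right after restoring (and before the training loop above), the freshly restored module could be checked against the arrays recorded at checkpoint time. Again just a sketch under the same assumptions, using the hypothetical weights_at_save.npz produced by the previous sketch:

# Sketch: compare the restored module's weights with the arrays recorded
# at checkpoint time by the sketch appended to train.py above.
import numpy as np

saved = np.load("weights_at_save.npz")
rl_module = algo.workers.local_worker().module
for layer in rl_module.layers:
    for i, arr in enumerate(layer.get_weights()):
        key = f"{layer.name}__{i}"
        print(f"{key} matches checkpoint-time value: {np.array_equal(saved[key], arr)}")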
Issue Severity
High: It blocks me from completing my task.