All final shapes right before training
4000 is the number of steps per epoch and 4 is the observation dimension.
observations.shape: torch.Size([4000, 4])
actions.shape: torch.Size([4000])
advantages.shape: torch.Size([4000])
discounted_returns.shape: torch.Size([4000])
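For reference, here is a minimal, self-contained sketch (not the library code) of how per-episode arrays flatten into these shapes; the episode split and zero-filled data are made up, only the shapes matter.

import numpy as np
import torch

steps_per_epoch = 4000
obs_dim = 4  # CartPole observation dimension
episode_lengths = [200] * 20  # hypothetical split of the 4000 steps into episodes

# Per-episode observation/action lists, as produced while collecting experience
observations_list = [np.zeros((length, obs_dim)) for length in episode_lengths]
actions_list = [np.zeros(length) for length in episode_lengths]

observations = torch.from_numpy(np.concatenate(observations_list)).float()
actions = torch.from_numpy(np.concatenate(actions_list)).float()

print(observations.shape)  # torch.Size([4000, 4])
print(actions.shape)       # torch.Size([4000])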
All experiences separated by each episode
Collecting experience with list and numpy
def collect_one_epoch_experience(self, steps_per_epoch: int) -> OneEpochExperience:
    one_epoch_experience: OneEpochExperience = {
        "observations": [],
        "actions": [],
        "rewards": [],
        "observations_with_last_observations": [],
        "episode_returns": [],
        "episode_lengths": [],
    }

    # Variables on an episode
    episode_observations: List[np.ndarray] = []
    episode_actions: List[np.ndarray] = []
    episode_rewards: List[float] = []
    episode_observations_with_last_observations: List[np.ndarray] = []
    episode_return: float = 0.0
    episode_length: int = 0

    observation: np.ndarray = self.env.reset()

    for current_step in range(steps_per_epoch):
        episode_observations.append(observation)
        episode_observations_with_last_observations.append(observation)

        observation_tensor: Tensor = torch.from_numpy(observation).float()
        with torch.no_grad():
            policy_dist: Distribution = self.policy(observation_tensor)
            action: Tensor = policy_dist.sample()
        action_ndarray = action.detach().numpy()
        episode_actions.append(action_ndarray)

        reward: float
        episode_done: bool
        observation, reward, episode_done, _ = self.env.step(action_ndarray)

        episode_return += reward
        episode_rewards.append(reward)
        episode_length += 1
        self.current_total_steps += 1

        epoch_ended: bool = current_step == steps_per_epoch - 1

        if episode_done or epoch_ended:
            if epoch_ended and not episode_done:
                logger.debug(
                    "The trajectory cut off at {} steps on the current episode".format(
                        episode_length
                    )
                )

            episode_observations_with_last_observations.append(observation)

            one_epoch_experience["observations"].append(episode_observations)
            one_epoch_experience["actions"].append(episode_actions)
            one_epoch_experience["rewards"].append(episode_rewards)
            one_epoch_experience["observations_with_last_observations"].append(
                episode_observations_with_last_observations
            )
            one_epoch_experience["episode_returns"].append(episode_return)
            one_epoch_experience["episode_lengths"].append(episode_length)

            if episode_done:
                self.current_total_episodes += 1

            observation = self.env.reset()
            episode_return, episode_length = 0.0, 0
            episode_observations, episode_actions, episode_rewards, episode_observations_with_last_observations = [], [], [], []

    return one_epoch_experience
Preprocessing before training
def train(self, one_epoch_experience: OneEpochExperience) -> None:
    observations_list: List[List[np.ndarray]] = one_epoch_experience["observations"]
    actions_list: List[List[np.ndarray]] = one_epoch_experience["actions"]
    rewards_list: List[List[float]] = one_epoch_experience["rewards"]
    observations_with_last_observations_list: List[List[np.ndarray]] = one_epoch_experience["observations_with_last_observations"]

    # Calculate rewards-to-go over each episode, to be targets for the value function
    discounted_returns: Tensor = torch.from_numpy(np.concatenate([
        discount_cumulative_sum(one_episode_rewards, self.gamma)
        for one_episode_rewards in rewards_list
    ])).float()

    # Calculate advantages
    observations = torch.from_numpy(np.concatenate(observations_list)).float()
    actions = torch.from_numpy(np.concatenate(actions_list)).float()

    values_tensor_list: List[Tensor] = []
    with torch.no_grad():
        for observations_with_last_observation in observations_with_last_observations_list:
            observations_with_last_observation = torch.from_numpy(np.stack(observations_with_last_observation)).float()
            values_tensor_list.append(self.value_function(observations_with_last_observation).flatten())

    advantages: Tensor = torch.from_numpy(np.concatenate([
        gae(one_episode_rewards, self.gamma, one_episode_values.numpy(), self.gae_lambda)
        for one_episode_rewards, one_episode_values in zip(rewards_list, values_tensor_list)
    ])).float()
All experiences not separated by each episode
Collecting experience with list and Tensor
def collect_one_epoch_experience(self, steps_per_epoch: int) -> OneEpochExperience:
    observations: List[Tensor] = []
    actions: List[Tensor] = []
    rewards: List[float] = []
    observations_with_last_observations: List[Tensor] = []
    episode_returns: List[float] = []
    episode_lengths: List[int] = []

    # Variables on an episode
    episode_return: float = 0.0
    episode_length: int = 0

    observation: np.ndarray = self.env.reset()

    for current_step in range(steps_per_epoch):
        observation_tensor: Tensor = torch.from_numpy(observation).float()
        observations.append(observation_tensor)
        observations_with_last_observations.append(observation_tensor)

        with torch.no_grad():
            policy_dist: Distribution = self.policy(observation_tensor)
            action: Tensor = policy_dist.sample()
        actions.append(action)

        reward: float
        episode_done: bool
        observation, reward, episode_done, _ = self.env.step(action.detach().numpy())

        episode_return += reward
        rewards.append(reward)
        episode_length += 1
        self.current_total_steps += 1

        epoch_ended: bool = current_step == steps_per_epoch - 1

        if episode_done or epoch_ended:
            if epoch_ended and not episode_done:
                logger.debug(
                    "The trajectory cut off at {} steps on the current episode".format(
                        episode_length
                    )
                )

            observations_with_last_observations.append(torch.from_numpy(observation).float())

            episode_returns.append(episode_return)
            episode_lengths.append(episode_length)

            if episode_done:
                self.current_total_episodes += 1

            observation = self.env.reset()
            episode_return, episode_length = 0.0, 0

    one_epoch_experience: OneEpochExperience = {
        "observations": torch.stack(observations),
        "actions": torch.stack(actions),
        "rewards": torch.tensor(rewards),
        "observations_with_last_observations": torch.stack(observations_with_last_observations),
        "episode_returns": episode_returns,
        "episode_lengths": episode_lengths,
    }

    return one_epoch_experience
Preprocessing before training
def train(self, one_epoch_experience: OneEpochExperience) -> None:
    observations: Tensor = one_epoch_experience["observations"]
    actions: Tensor = one_epoch_experience["actions"]
    rewards: Tensor = one_epoch_experience["rewards"]
    observations_with_last_observations: Tensor = one_epoch_experience["observations_with_last_observations"]
    episode_lengths: List[int] = one_epoch_experience["episode_lengths"]

    # Calculate rewards-to-go over each episode, to be targets for the value function
    current_index = 0
    discounted_returns = torch.empty(rewards.shape)
    for episode_length in episode_lengths:
        one_episode_rewards = rewards.narrow(0, current_index, episode_length)
        discounted_returns[current_index:current_index + episode_length] = discount_cumulative_sum(one_episode_rewards, self.gamma)
        current_index += episode_length

    # Calculate advantages
    with torch.no_grad():
        values = self.value_function(observations_with_last_observations).flatten()

    current_index = 0
    advantages: Tensor = torch.empty(rewards.shape)
    for n_episode, episode_length in enumerate(episode_lengths):
        one_episode_rewards = rewards.narrow(0, current_index, episode_length)
        # values holds one extra (last) value per episode, hence the n_episode offset
        one_episode_values = values.narrow(0, current_index + n_episode, episode_length + 1)
        advantages[current_index:current_index + episode_length] = gae(one_episode_rewards, self.gamma, one_episode_values, self.gae_lambda)
        current_index += episode_length
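To make the slicing above concrete, here is a small standalone sketch with made-up episode lengths (it is not the library code). The point is the n_episode offset: values holds one extra bootstrap value per episode, so the start index into values shifts by one for every completed episode.

import torch

episode_lengths = [3, 2]
rewards = torch.arange(5, dtype=torch.float32)  # 3 + 2 = 5 rewards
values = torch.arange(7, dtype=torch.float32)   # (3 + 1) + (2 + 1) = 7 values

current_index = 0
for n_episode, episode_length in enumerate(episode_lengths):
    one_episode_rewards = rewards.narrow(0, current_index, episode_length)
    one_episode_values = values.narrow(0, current_index + n_episode, episode_length + 1)
    print(one_episode_rewards.tolist(), one_episode_values.tolist())
    current_index += episode_length

# Episode 0: rewards [0.0, 1.0, 2.0], values [0.0, 1.0, 2.0, 3.0]
# Episode 1: rewards [3.0, 4.0],      values [4.0, 5.0, 6.0]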
Implemented the cumulative sum with a decay factor in PyTorch, based on: https://discuss.pytorch.org/t/cumulative-sum-with-decay-factor/69788
def discount_cumulative_sum(vector: Tensor, discount: float) -> Tensor:
    """
    Compute discounted cumulative sums of a vector.

    :param vector: (Tensor) A target vector
        e.g. [x0,
              x1,
              x2]
    :param discount: (float) The discount factor for the cumulative return
    :return: (Tensor) Discounted cumulative sums of the vector
        e.g. [x0 + discount * x1 + discount^2 * x2,
              x1 + discount * x2,
              x2]
    """
    discounted_sequence = discount ** torch.arange(vector.size(0))
    discounted_cumulative_sums = torch.cumsum((discounted_sequence * vector).flip(0), dim=0).flip(0) / discounted_sequence

    return discounted_cumulative_sums
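A quick sanity check of discount_cumulative_sum on a tiny input (illustration only, not part of the library's tests):

import torch

rewards = torch.tensor([1.0, 1.0, 1.0])
print(discount_cumulative_sum(rewards, 0.9))
# tensor([2.7100, 1.9000, 1.0000]), i.e. [1 + 0.9*1 + 0.81*1, 1 + 0.9*1, 1]

One caveat of the flip/cumsum trick is that discount ** torch.arange(n) shrinks geometrically, so the final division can lose precision for very long sequences; for CartPole-length episodes this is not a concern.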
def gae(
    rewards: Tensor, gamma: float, values: Tensor, gae_lambda: float
) -> Tensor:
    """
    Compute Generalized Advantage Estimation (GAE)

    :param rewards: (Tensor) Rewards for all states
    :param gamma: (float) The discount factor for the cumulative return
    :param values: (Tensor) Values for all states, including the value of the last observation
    :param gae_lambda: (float) A smoothing parameter for reducing the variance
    :return gaes: (Tensor) GAEs for all states
    """
    deltas: Tensor = rewards + gamma * values[1:] - values[:-1]
    gaes: Tensor = discount_cumulative_sum(deltas, gamma * gae_lambda)

    return gaes
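And a tiny worked example of gae, using the two functions above; the numbers are made up and only illustrate the expected shapes and the delta computation:

import torch

rewards = torch.tensor([1.0, 1.0, 1.0])
values = torch.tensor([0.5, 0.5, 0.5, 0.0])  # len(rewards) + 1 values (last one is the bootstrap)
advantages = gae(rewards, gamma=0.99, values=values, gae_lambda=0.95)
print(advantages)  # deltas are [0.995, 0.995, 0.5], then discounted with 0.99 * 0.95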
Implementations comparison
epochs: 200, steps per epoch: 4000, algorithm: VPG, env: CartPole, seed: 0, compute: Colab CPU
Implementation | 1 | 2 | 3
---|---|---|---
current | Epoch: 199 Total steps: 8e+05 Total episodes: 1.1e+04 Average Episode Return: 186 Episode Return STD: 25.4 Max Episode Return: 200 Min Episode Return: 107 Average Episode Length: 182 Time: 487 Policy Loss: -0.00292 Avarage Entropy: 0.555 Log Prob STD: 0.447 Value Function Loss: 245 | Epoch: 199 Total steps: 8e+05 Total episodes: 9.78e+03 Average Episode Return: 193 Episode Return STD: 20.5 Max Episode Return: 200 Min Episode Return: 130 Average Episode Length: 190 Time: 486 Policy Loss: -0.0115 Avarage Entropy: 0.55 Log Prob STD: 0.463 Value Function Loss: 144 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.13e+04 Average Episode Return: 194 Episode Return STD: 20.8 Max Episode Return: 200 Min Episode Return: 106 Average Episode Length: 190 Time: 479 Policy Loss: -0.00688 Avarage Entropy: 0.549 Log Prob STD: 0.45 Value Function Loss: 256
experience per episode with numpy | Epoch: 199 Total steps: 8e+05 Total episodes: 1.09e+04 Average Episode Return: 190 Episode Return STD: 24 Max Episode Return: 200 Min Episode Return: 121 Average Episode Length: 190 Time: 404 Policy Loss: -0.00548 Avarage Entropy: 0.554 Log Prob STD: 0.451 Value Function Loss: 286 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.39e+04 Average Episode Return: 190 Episode Return STD: 19.8 Max Episode Return: 200 Min Episode Return: 132 Average Episode Length: 190 Time: 401 Policy Loss: -0.0114 Avarage Entropy: 0.561 Log Prob STD: 0.444 Value Function Loss: 230 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.31e+04 Average Episode Return: 190 Episode Return STD: 23.9 Max Episode Return: 200 Min Episode Return: 111 Average Episode Length: 190 Time: 406 Policy Loss: -0.00812 Avarage Entropy: 0.555 Log Prob STD: 0.45 Value Function Loss: 251
experience per episode with Tensor | Epoch: 199 Total steps: 8e+05 Total episodes: 1.03e+04 Average Episode Return: 190 Episode Return STD: 19.5 Max Episode Return: 200 Min Episode Return: 132 Average Episode Length: 190 Time: 452 Policy Loss: -0.00352 Avarage Entropy: 0.549 Log Prob STD: 0.454 Value Function Loss: 225 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.06e+04 Average Episode Return: 182 Episode Return STD: 38 Max Episode Return: 200 Min Episode Return: 67 Average Episode Length: 182 Time: 450 Policy Loss: -0.0197 Avarage Entropy: 0.548 Log Prob STD: 0.462 Value Function Loss: 251 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.15e+04 Average Episode Return: 190 Episode Return STD: 26.8 Max Episode Return: 200 Min Episode Return: 82 Average Episode Length: 190 Time: 462 Policy Loss: -0.00983 Avarage Entropy: 0.551 Log Prob STD: 0.459 Value Function Loss: 260
experience with Tensor | Epoch: 199 Total steps: 8e+05 Total episodes: 1.29e+04 Average Episode Return: 182 Episode Return STD: 29.3 Max Episode Return: 200 Min Episode Return: 115 Average Episode Length: 182 Time: 402 Policy Loss: -0.0218 Avarage Entropy: 0.556 Log Prob STD: 0.445 Value Function Loss: 189 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.21e+04 Average Episode Return: 190 Episode Return STD: 23.4 Max Episode Return: 200 Min Episode Return: 120 Average Episode Length: 190 Time: 401 Policy Loss: -0.000805 Avarage Entropy: 0.549 Log Prob STD: 0.451 Value Function Loss: 197 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.25e+04 Average Episode Return: 182 Episode Return STD: 34.4 Max Episode Return: 200 Min Episode Return: 47 Average Episode Length: 182 Time: 405 Policy Loss: -0.0313 Avarage Entropy: 0.552 Log Prob STD: 0.455 Value Function Loss: 239
All experiences separated by each episode
Collecting experience with list and Tensor
def collect_one_epoch_experience(self, steps_per_epoch: int) -> OneEpochExperience:
    one_epoch_experience: OneEpochExperience = {
        "observations": [],
        "actions": [],
        "rewards": [],
        "observations_with_last_observations": [],
        "episode_returns": [],
        "episode_lengths": [],
    }

    # Variables on an episode
    episode_observations: List[Tensor] = []
    episode_actions: List[Tensor] = []
    episode_rewards: List[float] = []
    episode_observations_with_last_observations: List[Tensor] = []
    episode_return: float = 0.0
    episode_length: int = 0

    observation: np.ndarray = self.env.reset()

    for current_step in range(steps_per_epoch):
        observation_tensor: Tensor = torch.from_numpy(observation).float()
        episode_observations.append(observation_tensor)
        episode_observations_with_last_observations.append(observation_tensor)

        with torch.no_grad():
            policy_dist: Distribution = self.policy(observation_tensor)
            action: Tensor = policy_dist.sample()
        episode_actions.append(action)

        reward: float
        episode_done: bool
        observation, reward, episode_done, _ = self.env.step(action.detach().numpy())

        episode_return += reward
        episode_rewards.append(reward)
        episode_length += 1
        self.current_total_steps += 1

        epoch_ended: bool = current_step == steps_per_epoch - 1

        if episode_done or epoch_ended:
            if epoch_ended and not episode_done:
                logger.debug(
                    "The trajectory cut off at {} steps on the current episode".format(
                        episode_length
                    )
                )

            episode_observations_with_last_observations.append(torch.from_numpy(observation).float())

            one_epoch_experience["observations"].append(episode_observations)
            one_epoch_experience["actions"].append(episode_actions)
            one_epoch_experience["rewards"].append(episode_rewards)
            one_epoch_experience["observations_with_last_observations"].append(
                episode_observations_with_last_observations
            )
            one_epoch_experience["episode_returns"].append(episode_return)
            one_epoch_experience["episode_lengths"].append(episode_length)

            if episode_done:
                self.current_total_episodes += 1

            observation = self.env.reset()
            episode_return, episode_length = 0.0, 0
            episode_observations, episode_actions, episode_rewards, episode_observations_with_last_observations = [], [], [], []

    return one_epoch_experience
Preprocessing before training
def train(self, one_epoch_experience: OneEpochExperience) -> None:
    observations_list: List[List[Tensor]] = one_epoch_experience["observations"]
    actions_list: List[List[Tensor]] = one_epoch_experience["actions"]
    rewards_list: List[List[float]] = one_epoch_experience["rewards"]
    observations_with_last_observations_list: List[List[Tensor]] = one_epoch_experience["observations_with_last_observations"]

    # Calculate rewards-to-go over each episode, to be targets for the value function
    discounted_returns: Tensor = torch.cat([
        discount_cumulative_sum(torch.tensor(one_episode_rewards), self.gamma)
        for one_episode_rewards in rewards_list
    ]).float()

    # Calculate advantages
    observations = torch.cat([torch.stack(episode_observations) for episode_observations in observations_list]).float()
    actions = torch.cat([torch.stack(episode_actions) for episode_actions in actions_list]).float()

    values_tensor_list: List[Tensor] = []
    with torch.no_grad():
        for observations_with_last_observation in observations_with_last_observations_list:
            observations_with_last_observation = torch.stack(observations_with_last_observation).float()
            values_tensor_list.append(self.value_function(observations_with_last_observation).flatten())

    advantages: Tensor = torch.cat([
        gae(torch.tensor(one_episode_rewards), self.gamma, one_episode_values, self.gae_lambda)
        for one_episode_rewards, one_episode_values in zip(rewards_list, values_tensor_list)
    ]).float()
Todos
cProfile the current implementation
test inst-nodeps: /content/tox/.tmp/package/1/rl-replicas-0.0.3.tar.gz
test installed: absl-py==1.0.0,attrs==21.4.0,cachetools==4.2.4,certifi==2021.10.8,charset-normalizer==2.0.12,cloudpickle==2.0.0,dataclasses==0.8,google-auth==2.6.0,google-auth-oauthlib==0.4.6,grpcio==1.44.0,gym==0.21.0,idna==3.3,importlib-metadata==4.8.3,iniconfig==1.1.1,Markdown==3.3.6,numpy==1.19.5,oauthlib==3.2.0,packaging==21.3,pluggy==1.0.0,protobuf==3.19.4,py==1.11.0,pyasn1==0.4.8,pyasn1-modules==0.2.8,pyparsing==3.0.7,pytest==7.0.1,requests==2.27.1,requests-oauthlib==1.3.1,rl-replicas @ file:///content/tox/.tmp/package/1/rl-replicas-0.0.3.tar.gz,rsa==4.8,scipy==1.5.4,six==1.16.0,tensorboard==2.8.0,tensorboard-data-server==0.6.1,tensorboard-plugin-wit==1.8.1,tomli==1.2.3,torch==1.10.2,typing_extensions==4.1.1,urllib3==1.26.8,Werkzeug==2.0.3,zipp==3.6.0
test run-test-pre: PYTHONHASHSEED='2722520008'
test run-test: commands[0] | python -m cProfile -s cumtime tests/integration_tests/test_vpg.py
48351144 function calls (45091207 primitive calls) in 146.650 seconds
Ordered by: cumulative time
ncalls tottime percall cumtime percall filename:lineno(function)
1854/1 0.074 0.000 146.654 146.654 {built-in method builtins.exec}
1 0.000 0.000 146.654 146.654 test_vpg.py:1(<module>)
1 0.000 0.000 144.559 144.559 test_vpg.py:19(test_vpg_with_cartpole)
1 0.645 0.645 144.551 144.551 on_policy_algorithm.py:75(learn)
50 4.189 0.084 118.579 2.372 on_policy_algorithm.py:169(collect_one_epoch_experience)
3637350/404150 5.969 0.000 82.178 0.000 module.py:1096(_call_impl)
200050 0.856 0.000 50.446 0.000 categorical_policy.py:19(forward)
404150 0.706 0.000 50.138 0.000 mlp.py:33(forward)
404150 3.789 0.000 48.324 0.000 container.py:139(forward)
204100 0.498 0.000 30.005 0.000 value_function.py:19(forward)
1212450 2.677 0.000 28.252 0.000 linear.py:102(forward)
200050 3.029 0.000 27.628 0.000 categorical.py:49(__init__)
200000 0.596 0.000 26.580 0.000 time_limit.py:14(step)
200000 5.154 0.000 25.984 0.000 cartpole.py:103(step)
50 0.161 0.003 25.119 0.502 vpg.py:53(train)
1212450 1.321 0.000 24.676 0.000 functional.py:1832(linear)
1212450 22.969 0.000 22.969 0.000 {built-in method torch._C._nn.linear}
200000 0.979 0.000 18.281 0.000 arrayprint.py:1365(_array_repr_implementation)
200000 0.926 0.000 16.721 0.000 arrayprint.py:516(array2string)
200050 2.856 0.000 14.809 0.000 distribution.py:34(__init__)
200015 0.797 0.000 13.942 0.000 arrayprint.py:461(wrapper)
200000 1.547 0.000 12.873 0.000 arrayprint.py:478(_array2string)
4050 0.035 0.000 12.048 0.003 vpg.py:115(compute_value_loss)
808300 0.498 0.000 10.882 0.000 activation.py:348(forward)
808300 10.383 0.000 10.383 0.000 {built-in method tanh}
4050 0.030 0.000 9.943 0.002 _tensor.py:251(backward)
200000 1.117 0.000 9.914 0.000 arrayprint.py:409(_get_format_function)
4050 0.061 0.000 9.913 0.002 __init__.py:69(backward)
4050 9.731 0.002 9.731 0.002 {method 'run_backward' of 'torch._C._EngineBase' objects}
200050 9.351 0.000 9.351 0.000 {method 'logsumexp' of 'torch._C._TensorBase' objects}
400100 2.449 0.000 8.883 0.000 constraints.py:201(check)
200000 1.706 0.000 8.855 0.000 categorical.py:108(sample)
200000 0.242 0.000 7.987 0.000 arrayprint.py:365(<lambda>)
200000 1.753 0.000 7.745 0.000 arrayprint.py:1124(__init__)
419371 0.410 0.000 5.264 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}
400100 1.042 0.000 4.155 0.000 utils.py:106(__get__)
200050 0.298 0.000 4.006 0.000 <__array_function__ internals>:2(amax)
406436 1.572 0.000 3.966 0.000 fromnumeric.py:70(_wrapreduction)
800250 3.965 0.000 3.965 0.000 {method 'all' of 'torch._C._TensorBase' objects}
200050 0.449 0.000 3.406 0.000 fromnumeric.py:2589(amax)
800100 3.321 0.000 3.321 0.000 {method 'reshape' of 'torch._C._TensorBase' objects}
4050 0.093 0.000 2.226 0.001 optimizer.py:83(wrapper)
200050 0.149 0.000 2.135 0.000 categorical.py:92(probs)
1411/9 0.012 0.000 2.098 0.233 <frozen importlib._bootstrap>:966(_find_and_load)
1411/9 0.009 0.000 2.097 0.233 <frozen importlib._bootstrap>:936(_find_and_load_unlocked)
1360/8 0.009 0.000 2.092 0.262 <frozen importlib._bootstrap>:651(_load_unlocked)
1234/8 0.005 0.000 2.092 0.262 <frozen importlib._bootstrap_external>:672(exec_module)
1857/9 0.002 0.000 2.092 0.232 <frozen importlib._bootstrap>:211(_call_with_frames_removed)
406739 2.064 0.000 2.064 0.000 {method 'reduce' of 'numpy.ufunc' objects}
115 0.003 0.000 2.056 0.018 __init__.py:1(<module>)
200050 0.237 0.000 1.986 0.000 utils.py:65(logits_to_probs)
4050 0.043 0.000 1.903 0.000 grad_mode.py:25(decorate_context)
3233348 1.859 0.000 1.860 0.000 module.py:1164(__getattr__)
400100 1.852 0.000 1.852 0.000 constraints.py:300(check)
3637350 1.782 0.000 1.782 0.000 {built-in method torch._C._get_tracing_state}
4050 0.164 0.000 1.764 0.000 adam.py:81(step)
200050 0.293 0.000 1.749 0.000 functional.py:1650(softmax)
432276 1.674 0.000 1.674 0.000 {built-in method numpy.array}
200000 1.639 0.000 1.639 0.000 {built-in method multinomial}
200000 0.723 0.000 1.517 0.000 arrayprint.py:60(_make_options_dict)
4050 0.438 0.000 1.502 0.000 _functional.py:54(adam)
200050 0.173 0.000 1.475 0.000 <__array_function__ internals>:2(amin)
200050 1.417 0.000 1.417 0.000 {method 'softmax' of 'torch._C._TensorBase' objects}
687/94 0.002 0.000 1.406 0.015 {built-in method builtins.__import__}
9 0.000 0.000 1.369 0.152 __init__.py:3(<module>)
12672 0.089 0.000 1.296 0.000 utils.py:10(discount_cumulative_sum)
6336 0.110 0.000 1.264 0.000 utils.py:38(gae)
12672 0.067 0.000 1.207 0.000 signaltools.py:1719(lfilter)
5573/1912 0.007 0.000 1.159 0.001 <frozen importlib._bootstrap>:997(_handle_fromlist)
400750 1.124 0.000 1.124 0.000 {method 'detach' of 'torch._C._TensorBase' objects}
200050 0.298 0.000 1.121 0.000 fromnumeric.py:2714(amin)
658222 0.415 0.000 1.107 0.000 {built-in method builtins.getattr}
200000 0.343 0.000 1.065 0.000 arrayprint.py:709(_formatArray)
12672 0.983 0.000 0.983 0.000 {built-in method scipy.signal.sigtools._linear_filter}
200000 0.768 0.000 0.913 0.000 discrete.py:22(contains)
200150 0.889 0.000 0.889 0.000 {built-in method from_numpy}
3 0.002 0.001 0.887 0.296 __init__.py:10(<module>)
204162 0.340 0.000 0.768 0.000 grad_mode.py:128(__exit__)
204162 0.351 0.000 0.759 0.000 grad_mode.py:124(__enter__)
404150 0.461 0.000 0.758 0.000 container.py:131(__iter__)
408324 0.477 0.000 0.743 0.000 grad_mode.py:213(__init__)
204191 0.679 0.000 0.737 0.000 grad_mode.py:119(__init__)
200000 0.454 0.000 0.722 0.000 arrayprint.py:718(recurser)
200000 0.656 0.000 0.656 0.000 arrayprint.py:358(_get_formatdict)
1 0.000 0.000 0.647 0.647 ddpg.py:1(<module>)
1 0.000 0.000 0.644 0.644 off_policy_algorithm.py:1(<module>)
100 0.637 0.006 0.637 0.006 {built-in method stack}
1510786/1510580 0.584 0.000 0.587 0.000 {built-in method builtins.isinstance}
200000 0.581 0.000 0.581 0.000 arrayprint.py:1304(dtype_is_implied)
200032 0.501 0.000 0.501 0.000 {built-in method builtins.locals}
1360/1342 0.004 0.000 0.491 0.000 <frozen importlib._bootstrap>:564(module_from_spec)
cProfile the new implementation
Experiences separated by each episode
Collecting experience with list and numpy
test inst-nodeps: /content/tox/.tmp/package/1/rl-replicas-0.0.3.tar.gz
test installed: absl-py==1.0.0,attrs==21.4.0,cachetools==4.2.4,certifi==2021.10.8,charset-normalizer==2.0.12,cloudpickle==2.0.0,dataclasses==0.8,google-auth==2.6.0,google-auth-oauthlib==0.4.6,grpcio==1.44.0,gym==0.21.0,idna==3.3,importlib-metadata==4.8.3,iniconfig==1.1.1,Markdown==3.3.6,numpy==1.19.5,oauthlib==3.2.0,packaging==21.3,pluggy==1.0.0,protobuf==3.19.4,py==1.11.0,pyasn1==0.4.8,pyasn1-modules==0.2.8,pyparsing==3.0.7,pytest==7.0.1,requests==2.27.1,requests-oauthlib==1.3.1,rl-replicas @ file:///content/tox/.tmp/package/1/rl-replicas-0.0.3.tar.gz,rsa==4.8,scipy==1.5.4,six==1.16.0,tensorboard==2.8.0,tensorboard-data-server==0.6.1,tensorboard-plugin-wit==1.8.1,tomli==1.2.3,torch==1.10.2,typing_extensions==4.1.1,urllib3==1.26.8,Werkzeug==2.0.3,zipp==3.6.0
test run-test-pre: PYTHONHASHSEED='2987822608'
test run-test: commands[0] | python -m cProfile -s cumtime tests/integration_tests/test_vpg.py
38901310 function calls (37186072 primitive calls) in 125.180 seconds
Ordered by: cumulative time
ncalls tottime percall cumtime percall filename:lineno(function)
1854/1 0.071 0.000 125.184 125.184 {built-in method builtins.exec}
1 0.001 0.001 125.184 125.184 test_vpg.py:1(<module>)
1 0.008 0.008 122.998 122.998 test_vpg.py:19(test_vpg_with_cartpole)
1 0.463 0.463 122.982 122.982 on_policy_algorithm.py:75(learn)
50 3.676 0.074 96.028 1.921 on_policy_algorithm.py:169(collect_one_epoch_experience)
1892601/210289 3.317 0.000 64.557 0.000 module.py:1096(_call_impl)
200050 0.878 0.000 51.720 0.000 categorical_policy.py:19(forward)
210289 0.404 0.000 34.361 0.000 mlp.py:33(forward)
210289 2.514 0.000 33.300 0.000 container.py:139(forward)
200050 2.908 0.000 27.452 0.000 categorical.py:49(__init__)
50 0.214 0.004 26.157 0.523 vpg.py:58(train)
200000 0.602 0.000 25.967 0.000 time_limit.py:14(step)
200000 4.956 0.000 25.365 0.000 cartpole.py:103(step)
630867 1.522 0.000 18.773 0.000 linear.py:102(forward)
200000 0.946 0.000 17.855 0.000 arrayprint.py:1365(_array_repr_implementation)
630867 0.736 0.000 16.720 0.000 functional.py:1832(linear)
200000 0.944 0.000 16.376 0.000 arrayprint.py:516(array2string)
630867 15.751 0.000 15.751 0.000 {built-in method torch._C._nn.linear}
200050 2.794 0.000 14.458 0.000 distribution.py:34(__init__)
200015 0.788 0.000 13.585 0.000 arrayprint.py:461(wrapper)
200000 1.526 0.000 12.521 0.000 arrayprint.py:478(_array2string)
10239 0.036 0.000 11.820 0.001 value_function.py:19(forward)
4050 0.035 0.000 11.761 0.003 vpg.py:140(compute_value_loss)
200050 9.654 0.000 9.654 0.000 {method 'logsumexp' of 'torch._C._TensorBase' objects}
200000 1.109 0.000 9.581 0.000 arrayprint.py:409(_get_format_function)
4050 0.030 0.000 9.405 0.002 _tensor.py:251(backward)
4050 0.059 0.000 9.374 0.002 __init__.py:69(backward)
4050 9.198 0.002 9.198 0.002 {method 'run_backward' of 'torch._C._EngineBase' objects}
420578 0.297 0.000 9.014 0.000 activation.py:348(forward)
420578 8.717 0.000 8.717 0.000 {built-in method tanh}
400100 2.238 0.000 8.685 0.000 constraints.py:201(check)
200000 1.637 0.000 8.313 0.000 categorical.py:108(sample)
200000 0.246 0.000 7.645 0.000 arrayprint.py:365(<lambda>)
200000 1.711 0.000 7.399 0.000 arrayprint.py:1124(__init__)
425319/419130 0.758 0.000 5.359 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}
400100 0.940 0.000 3.960 0.000 utils.py:106(__get__)
800250 3.944 0.000 3.944 0.000 {method 'all' of 'torch._C._TensorBase' objects}
200050 0.296 0.000 3.704 0.000 <__array_function__ internals>:2(amax)
400100 1.179 0.000 3.520 0.000 fromnumeric.py:70(_wrapreduction)
800100 3.126 0.000 3.126 0.000 {method 'reshape' of 'torch._C._TensorBase' objects}
200050 0.443 0.000 3.107 0.000 fromnumeric.py:2589(amax)
4050 0.095 0.000 2.317 0.001 optimizer.py:83(wrapper)
1411/9 0.011 0.000 2.187 0.243 <frozen importlib._bootstrap>:966(_find_and_load)
1411/9 0.013 0.000 2.187 0.243 <frozen importlib._bootstrap>:936(_find_and_load_unlocked)
1360/8 0.013 0.000 2.182 0.273 <frozen importlib._bootstrap>:651(_load_unlocked)
1234/8 0.004 0.000 2.182 0.273 <frozen importlib._bootstrap_external>:672(exec_module)
1857/9 0.002 0.000 2.181 0.242 <frozen importlib._bootstrap>:211(_call_with_frames_removed)
200050 0.140 0.000 2.074 0.000 categorical.py:92(probs)
400403 2.016 0.000 2.016 0.000 {method 'reduce' of 'numpy.ufunc' objects}
4050 0.043 0.000 1.985 0.000 grad_mode.py:25(decorate_context)
200050 0.259 0.000 1.935 0.000 utils.py:65(logits_to_probs)
400100 1.891 0.000 1.891 0.000 constraints.py:300(check)
4050 0.169 0.000 1.838 0.000 adam.py:81(step)
115 0.003 0.000 1.817 0.016 __init__.py:1(<module>)
625352 1.677 0.000 1.677 0.000 {built-in method numpy.array}
200050 0.274 0.000 1.676 0.000 functional.py:1650(softmax)
200000 1.626 0.000 1.626 0.000 {built-in method multinomial}
4050 0.463 0.000 1.574 0.000 _functional.py:54(adam)
200000 0.703 0.000 1.515 0.000 arrayprint.py:60(_make_options_dict)
200050 0.180 0.000 1.483 0.000 <__array_function__ internals>:2(amin)
9 0.000 0.000 1.431 0.159 __init__.py:3(<module>)
687/94 0.002 0.000 1.389 0.015 {built-in method builtins.__import__}
200050 1.358 0.000 1.358 0.000 {method 'softmax' of 'torch._C._TensorBase' objects}
5574/1913 0.006 0.000 1.163 0.001 <frozen importlib._bootstrap>:997(_handle_fromlist)
1682460 1.126 0.000 1.126 0.000 module.py:1164(__getattr__)
200050 0.262 0.000 1.117 0.000 fromnumeric.py:2714(amin)
1892601 1.073 0.000 1.073 0.000 {built-in method torch._C._get_tracing_state}
200000 0.350 0.000 1.066 0.000 arrayprint.py:709(_formatArray)
651886 0.391 0.000 1.046 0.000 {built-in method builtins.getattr}
3 0.002 0.001 1.011 0.337 __init__.py:10(<module>)
200000 0.771 0.000 0.907 0.000 discrete.py:22(contains)
206389 0.865 0.000 0.865 0.000 {built-in method from_numpy}
204162 0.418 0.000 0.826 0.000 grad_mode.py:124(__enter__)
204191 0.691 0.000 0.757 0.000 grad_mode.py:119(__init__)
408324 0.493 0.000 0.734 0.000 grad_mode.py:213(__init__)
204162 0.322 0.000 0.733 0.000 grad_mode.py:128(__exit__)
200700 0.726 0.000 0.726 0.000 {method 'detach' of 'torch._C._TensorBase' objects}
200000 0.441 0.000 0.716 0.000 arrayprint.py:718(recurser)
1083524 0.698 0.000 0.698 0.000 {method 'append' of 'list' objects}
200000 0.681 0.000 0.681 0.000 arrayprint.py:358(_get_formatdict)
1 0.000 0.000 0.678 0.678 ddpg.py:1(<module>)
1 0.000 0.000 0.676 0.676 off_policy_algorithm.py:1(<module>)
1360/1342 0.004 0.000 0.569 0.000 <frozen importlib._bootstrap>:564(module_from_spec)
200000 0.534 0.000 0.534 0.000 arrayprint.py:1304(dtype_is_implied)
200032 0.525 0.000 0.525 0.000 {built-in method builtins.locals}
1504450/1504244 0.494 0.000 0.497 0.000 {built-in method builtins.isinstance}
11 0.000 0.000 0.492 0.045 utils.py:1(<module>)
4050 0.056 0.000 0.464 0.000 functional.py:3084(mse_loss)
6189 0.007 0.000 0.463 0.000 <__array_function__ internals>:2(stack)
1 0.000 0.000 0.458 0.458 __init__.py:288(<module>)
206389 0.458 0.000 0.458 0.000 {method 'float' of 'torch._C._TensorBase' objects}
Overall, comparing the two profiles above with the same parameters, the total execution time drops by about 20 seconds (from about 146 s to 125 s) with the new implementation.
# current
1 0.000 0.000 146.654 146.654 test_vpg.py:1(<module>)
---
# new
1 0.001 0.001 125.184 125.184 test_vpg.py:1(<module>)
While the number of calls to the policy's forward function and its execution time stay roughly the same, both figures for the value function drop sharply: the call count falls from 204100 to 10239 and the cumulative time from about 30 s to about 12 s.
# current
200050 0.856 0.000 50.446 0.000 categorical_policy.py:19(forward)
204100 0.498 0.000 30.005 0.000 value_function.py:19(forward)
---
# new
200050 0.878 0.000 51.720 0.000 categorical_policy.py:19(forward)
10239 0.036 0.000 11.820 0.001 value_function.py:19(forward)
So the new implementation is significantly faster mainly because it calls the value function far less often.
Parameters: epochs: 50, steps per epoch: 4000, algorithm: VPG, env: CartPole, seed: 0, compute: Colab CPU
In the performance comparison above, the current implementation is about 50 seconds slower than the others. Based on this analysis, I presume the bottleneck is the large number of value function calls.
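To illustrate the presumed bottleneck, here is a schematic comparison (not the library code; the MLP and episode length are made up) of calling a value function once per step versus once per episode on a batched tensor:

import torch
import torch.nn as nn

value_function = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 1))
episode_observations = torch.randn(200, 4)  # one hypothetical 200-step episode

with torch.no_grad():
    # Per-step: 200 separate forward passes (roughly what the slow variants do)
    per_step_values = torch.cat([value_function(obs) for obs in episode_observations])

    # Per-episode: a single batched forward pass over the whole episode
    batched_values = value_function(episode_observations).flatten()

print(per_step_values.shape, batched_values.shape)  # both torch.Size([200])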
I will choose the implementation that collects experiences separated by each episode with numpy, for the following reasons:
Todos
I found that my new implementation doesn't correctly handle the last reward at the end of the epoch.
Because the number of value function calls increased, the total execution time increased as well.
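For context, the usual way to handle this (as in Spinning Up-style VPG buffers) is to bootstrap a cut-off trajectory with the value of the last observation; the sketch below shows the general idea and is not necessarily the exact change made in this issue.

import torch
from torch import Tensor


def bootstrap_value(value_function, last_observation: Tensor, episode_done: bool) -> float:
    # On a true terminal state there is no future return to account for.
    if episode_done:
        return 0.0
    # Otherwise the epoch cut the episode off: estimate the missing tail with V(s_last).
    with torch.no_grad():
        return value_function(last_observation).item()

# Usage idea (hypothetical): append the bootstrap value to the episode rewards before
# computing rewards-to-go, then drop the extra element:
#   rewards_plus = torch.cat([episode_rewards, torch.tensor([last_value])])
#   discounted_returns = discount_cumulative_sum(rewards_plus, gamma)[:-1]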
Implementations comparison
epochs: 200, steps per epoch: 4000, algorithm: VPG, env: CartPole, seed: 0, compute: Colab CPU
Implementation | 1 | 2 | 3
---|---|---|---
current | Epoch: 199 Total steps: 8e+05 Total episodes: 1.1e+04 Average Episode Return: 186 Episode Return STD: 25.4 Max Episode Return: 200 Min Episode Return: 107 Average Episode Length: 182 Time: 487 Policy Loss: -0.00292 Avarage Entropy: 0.555 Log Prob STD: 0.447 Value Function Loss: 245 | Epoch: 199 Total steps: 8e+05 Total episodes: 9.78e+03 Average Episode Return: 193 Episode Return STD: 20.5 Max Episode Return: 200 Min Episode Return: 130 Average Episode Length: 190 Time: 486 Policy Loss: -0.0115 Avarage Entropy: 0.55 Log Prob STD: 0.463 Value Function Loss: 144 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.13e+04 Average Episode Return: 194 Episode Return STD: 20.8 Max Episode Return: 200 Min Episode Return: 106 Average Episode Length: 190 Time: 479 Policy Loss: -0.00688 Avarage Entropy: 0.549 Log Prob STD: 0.45 Value Function Loss: 256
experience per episode with numpy (last value) | Epoch: 199 Total steps: 8e+05 Total episodes: 1.39e+04 Average Episode Return: 182 Episode Return STD: 31.1 Max Episode Return: 200 Min Episode Return: 103 Average Episode Length: 182 Time: 467 Policy Loss: -0.0209 Avarage Entropy: 0.564 Log Prob STD: 0.433 Value Function Loss: 286 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.28e+04 Average Episode Return: 190 Episode Return STD: 27.1 Max Episode Return: 200 Min Episode Return: 103 Average Episode Length: 190 Time: 477 Policy Loss: -0.0178 Avarage Entropy: 0.556 Log Prob STD: 0.443 Value Function Loss: 222 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.23e+04 Average Episode Return: 182 Episode Return STD: 38.7 Max Episode Return: 200 Min Episode Return: 56 Average Episode Length: 182 Time: 476 Policy Loss: -0.0177 Avarage Entropy: 0.558 Log Prob STD: 0.442 Value Function Loss: 166
Todos
Performance comparison with the current version
epochs: 200, steps per epoch: 4000, algorithm: VPG, env: CartPole, seed: 0, compute: Colab CPU
Total execution time improves because the number of value function calls made after each trajectory ends is reduced.
Implementation | 1 | 2 | 3
---|---|---|---
current | Epoch: 199 Total steps: 8e+05 Total episodes: 1.39e+04 Average Episode Return: 182 Episode Return STD: 31.1 Max Episode Return: 200 Min Episode Return: 103 Average Episode Length: 182 Time: 467 Policy Loss: -0.0209 Avarage Entropy: 0.564 Log Prob STD: 0.433 Value Function Loss: 286 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.28e+04 Average Episode Return: 190 Episode Return STD: 27.1 Max Episode Return: 200 Min Episode Return: 103 Average Episode Length: 190 Time: 477 Policy Loss: -0.0178 Avarage Entropy: 0.556 Log Prob STD: 0.443 Value Function Loss: 222 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.23e+04 Average Episode Return: 182 Episode Return STD: 38.7 Max Episode Return: 200 Min Episode Return: 56 Average Episode Length: 182 Time: 476 Policy Loss: -0.0177 Avarage Entropy: 0.558 Log Prob STD: 0.442 Value Function Loss: 166
last reward bootstrapping on each model | Epoch: 199 Total steps: 8e+05 Total episodes: 1.13e+04 Average Episode Return: 190 Episode Return STD: 34.7 Max Episode Return: 200 Min Episode Return: 37 Average Episode Length: 190 Time: 416 Policy Loss: -0.0228 Avarage Entropy: 0.554 Log Prob STD: 0.46 Value Function Loss: 360 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.08e+04 Average Episode Return: 182 Episode Return STD: 32.6 Max Episode Return: 200 Min Episode Return: 92 Average Episode Length: 182 Time: 416 Policy Loss: -0.0115 Avarage Entropy: 0.542 Log Prob STD: 0.466 Value Function Loss: 219 | Epoch: 199 Total steps: 8e+05 Total episodes: 1.12e+04 Average Episode Return: 182 Episode Return STD: 37.2 Max Episode Return: 200 Min Episode Return: 58 Average Episode Length: 182 Time: 418 Policy Loss: -0.00876 Avarage Entropy: 0.552 Log Prob STD: 0.451 Value Function Loss: 277
Todos
Refer to https://github.com/yamatokataoka/reinforcement-learning-replications/issues/59