Shmuma / ptan

PyTorch Agent Net: reinforcement learning toolkit for pytorch
MIT License

ExperienceReplayBuffer stores second-last transition twice #25

Closed: Phlogiston90 closed this issue 5 years ago

Phlogiston90 commented 5 years ago

Hi Maxim, first of all, thank you so much for the book! It's helping me a lot with my thesis!

Second, I think the ExperienceReplayBuffer stores the second-last transition twice, which could bias training if an environment only has a few steps per episode (like mine). Maybe I have overlooked something, but here is a minimal example showing the behaviour:

import ptan
import torch
import torch.nn as nn
import gym

EPSILON_START = 1.0
GAMMA = 1.0
REWARD_STEPS = 1
LEARNING_RATE = 0.001
MAX_STEPS = 20
REPLAY_SIZE = 10
MAX_STEPS_PER_EPISODE = 3
device = torch.device("cpu")

class Environment(gym.Env):
    """Toy environment: the state counts up from 0, the reward equals the new
    state, and the episode ends after four steps (when the state reaches 4)."""
    def __init__(self):
        self.state = 0
        self.observation_space = gym.spaces.Discrete(5)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self):
        self.state = 0
        return self.state

    def step(self, action):
        self.state += 1
        done = self.state == 4
        reward = self.state
        # gym expects an info dict as the fourth return value
        return self.state, reward, done, {}

class DQN(nn.Module):
    """Dummy network: ignores the observation and returns random Q-values."""
    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()
        self.n_actions = n_actions

    def forward(self, x):
        return torch.rand(1, self.n_actions)

env = Environment()
net = DQN(env.observation_space.shape, env.action_space.n).to(device)
selector = ptan.actions.EpsilonGreedyActionSelector(EPSILON_START)
agent = ptan.agent.DQNAgent(net, selector, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, GAMMA, steps_count=REWARD_STEPS)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, REPLAY_SIZE)

step_idx = 0

while step_idx < MAX_STEPS:
    step_idx += 1
    buffer.populate(1)
    new_rewards = exp_source.pop_rewards_steps()

    if new_rewards:
        print("episode over: step {}: (total_reward, steps) = {}".format(step_idx, new_rewards[0]))

print()
print(*buffer.buffer, sep='\n')

The output is:

episode over: step 6: (total_reward, steps) = (10.0, 4)
episode over: step 11: (total_reward, steps) = (10.0, 4)
episode over: step 16: (total_reward, steps) = (10.0, 4)

ExperienceFirstLast(state=0, action=0, reward=1.0, last_state=1)
ExperienceFirstLast(state=1, action=0, reward=2.0, last_state=2)
ExperienceFirstLast(state=2, action=0, reward=3.0, last_state=3)
ExperienceFirstLast(state=2, action=0, reward=3.0, last_state=3)
ExperienceFirstLast(state=3, action=0, reward=4.0, last_state=None)
ExperienceFirstLast(state=0, action=0, reward=1.0, last_state=1)
ExperienceFirstLast(state=1, action=0, reward=2.0, last_state=2)
ExperienceFirstLast(state=2, action=0, reward=3.0, last_state=3)
ExperienceFirstLast(state=2, action=0, reward=3.0, last_state=3)
ExperienceFirstLast(state=3, action=1, reward=4.0, last_state=None)

Notice how the transition from state 2 to state 3 is stored twice in each episode. I used the ptan version that is available via pip today. Could you look into this? Best regards!
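
For what it's worth, here is a quick way to confirm the duplication programmatically. This is a minimal sketch that assumes the reproduction script above has already run and that the integer-valued ExperienceFirstLast entries compare with ==; in this toy setup, no two consecutive buffer entries should ever be equal.

entries = list(buffer.buffer)
# adjacent identical entries reveal the within-episode duplicate;
# entries from different episodes never repeat back to back in this toy setup
for prev, cur in zip(entries, entries[1:]):
    if prev == cur:
        print("stored twice in a row:", cur)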

Shmuma commented 5 years ago

Hi!

Thanks for reporting! I'll take a look at this.

Shmuma commented 5 years ago

That's indeed a bug; it's fixed in my dev branch: https://github.com/Shmuma/ptan/tree/torch-1.0.1
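
For context, a minimal sketch of how this kind of duplication can arise in a sliding-window experience source. This is an illustration only, not ptan's actual implementation; Exp, sliding_windows and buggy_flush are made-up names. If the episode-end flush starts yielding from the still-full history before trimming it, the last complete window, which was already yielded in the regular loop, is emitted a second time, followed by the shorter tail.

from collections import deque, namedtuple

Exp = namedtuple('Exp', ['state', 'done'])

def sliding_windows(experiences, steps_count=2, buggy_flush=True):
    # yields tuples of up to steps_count consecutive experiences
    history = deque(maxlen=steps_count)
    for exp in experiences:
        history.append(exp)
        if len(history) == steps_count:
            yield tuple(history)  # regular yield of a full window
        if exp.done:
            if buggy_flush:
                # flush starting from the full history: the window just
                # yielded above is emitted again before the tail
                while len(history) >= 1:
                    yield tuple(history)
                    history.popleft()
            else:
                # correct flush: drop the oldest entry first, then yield tails
                while len(history) > 1:
                    history.popleft()
                    yield tuple(history)
            history.clear()

# a 4-step episode like the one in the report: the buggy flush yields the
# (state 2, state 3) window twice, matching the duplicated buffer entry
episode = [Exp(0, False), Exp(1, False), Exp(2, False), Exp(3, True)]
for window in sliding_windows(episode):
    print(window)

With buggy_flush=False the same episode produces four distinct windows, one per step, which is the expected output.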