PacktPublishing / Deep-Reinforcement-Learning-Hands-On

Hands-on Deep Reinforcement Learning, published by Packt
MIT License

Converting to support GPU #35

Closed icompute386 closed 5 years ago

icompute386 commented 5 years ago

Hi, I've got a question regarding the code in Chapter 4 of Deep Reinforcement Learning Hands-On. Can you explain how to make this run on the GPU? I've tried to implement it myself, but the code crashes.

Crashed with the error:

(python36) c:\Anaconda\Deep-Reinforcement-Learning-Hands-On-master\Chapter04>python 03_frozenlake_tweaked.py --cuda
Traceback (most recent call last):
  File "03_frozenlake_tweaked.py", line 109, in <module>
    for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
  File "03_frozenlake_tweaked.py", line 58, in iterate_batches
    act_probs_v = sm(net(obs_v))
  File "C:\Anaconda\envs\python36\lib\site-packages\torch\nn\modules\module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "03_frozenlake_tweaked.py", line 43, in forward
    return self.net(x)
  File "C:\Anaconda\envs\python36\lib\site-packages\torch\nn\modules\module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "C:\Anaconda\envs\python36\lib\site-packages\torch\nn\modules\container.py", line 92, in forward
    input = module(input)
  File "C:\Anaconda\envs\python36\lib\site-packages\torch\nn\modules\module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "C:\Anaconda\envs\python36\lib\site-packages\torch\nn\modules\linear.py", line 67, in forward
    return F.linear(input, self.weight, self.bias)
  File "C:\Anaconda\envs\python36\lib\site-packages\torch\nn\functional.py", line 1352, in linear
    ret = torch.addmm(torch.jit._unwrap_optional(bias), input, weight.t())
RuntimeError: Expected object of backend CUDA but got backend CPU for argument #4 'mat1'
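The error itself is a device mismatch: the network's weights were moved to CUDA, but the input tensor stayed on the CPU. A minimal sketch that reproduces the same failure in isolation (assuming a CUDA-capable PyTorch build; the exact message wording varies by version):

import torch
import torch.nn as nn

# The layer's weights live on the GPU, but the input is created on the CPU,
# so the matrix multiply inside F.linear fails with a backend/device mismatch.
layer = nn.Linear(16, 4).to(torch.device("cuda"))
cpu_input = torch.zeros(1, 16)   # never moved to the GPU
layer(cpu_input)                 # raises RuntimeError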

I made the following changes to 03_frozenlake_tweaked.py:

#!/usr/bin/env python3

import random
import gym
import gym.spaces
import argparse
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

HIDDEN_SIZE = 128
BATCH_SIZE = 100
PERCENTILE = 30
GAMMA = 0.9

class DiscreteOneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super(DiscreteOneHotWrapper, self).__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), dtype=np.float32)

    def observation(self, observation):
        res = np.copy(self.observation_space.low)
        res[observation] = 1.0
        return res

class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

def iterate_batches(env, net, batch_size):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)
    while True:
        obs_v = torch.FloatTensor([obs])
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

def filter_batch(batch, percentile):
    disc_rewards = list(map(lambda s: s.reward * (GAMMA ** len(s.steps)), batch))
    reward_bound = np.percentile(disc_rewards, percentile)

    train_obs = []
    train_act = []
    elite_batch = []
    for example, discounted_reward in zip(batch, disc_rewards):
        if discounted_reward > reward_bound:
            train_obs.extend(map(lambda step: step.observation, example.steps))
            train_act.extend(map(lambda step: step.action, example.steps))
            elite_batch.append(example)

    return elite_batch, train_obs, train_act, reward_bound

if name == "main": parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=False, action='store_true', help="Enable cuda computation") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu")

random.seed(12345)
env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))
# env = gym.wrappers.Monitor(env, directory="mon", force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(obs_size, HIDDEN_SIZE, n_actions).to(device)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)
writer = SummaryWriter(comment="-frozenlake-tweaked")

full_batch = []
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    reward_mean = float(np.mean(list(map(lambda s: s.reward, batch))))
    full_batch, obs, acts, reward_bound = filter_batch(full_batch + batch, PERCENTILE)
    if not full_batch:
        continue
    #obs_v = torch.FloatTensor(obs)#, device=device)
    #acts_v = torch.LongTensor(acts)#, device=device)
    obs_v = torch.tensor(obs).to(device)
    acts_v = torch.tensor(acts).to(device)

    full_batch = full_batch[-500:]

    optimizer.zero_grad()
    action_scores_v = net(obs_v)
    loss_v = objective(action_scores_v, acts_v)
    loss_v.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f, batch=%d" % (
        iter_no, loss_v.item(), reward_mean, reward_bound, len(full_batch)))
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_mean", reward_mean, iter_no)
    writer.add_scalar("reward_bound", reward_bound, iter_no)
    if reward_mean > 0.8:
        print("Solved!")
        break
writer.close()
Shmuma commented 5 years ago

Hi!

In the function iterate_batches, the tensor obs_v is still on the CPU; you need to copy it to the GPU backend with:

obs_v = torch.FloatTensor([obs]).to(device)
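One detail worth keeping in mind once obs_v lives on the GPU: .numpy() only works on CPU tensors, so the action probabilities have to come back to the CPU before sampling. A minimal sketch of the pattern (mirroring the loop body in iterate_batches):

obs_v = torch.FloatTensor([obs]).to(device)   # input on the same device as the net
act_probs_v = sm(net(obs_v)).cpu()            # bring the output back to the CPU
act_probs = act_probs_v.data.numpy()[0]       # .numpy() requires a CPU tensor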
icompute386 commented 5 years ago

Hi Shmuma, that did the trick. I was anticipating a performance boost, but didn't see an improvement. Should that be expected here?

Chris

#!/usr/bin/env python3

import random
import gym
import gym.spaces
import argparse
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

HIDDEN_SIZE = 128
BATCH_SIZE = 100
PERCENTILE = 30
GAMMA = 0.9

class DiscreteOneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super(DiscreteOneHotWrapper, self).__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), dtype=np.float32)

    def observation(self, observation):
        res = np.copy(self.observation_space.low)
        res[observation] = 1.0
        return res

class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

def iterate_batches(env, net, batch_size):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)
    while True:
        obs_v = torch.FloatTensor([obs]).to(device)
        act_probs_v = sm(net(obs_v)).cpu()
        act_probs = act_probs_v.data.numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

def filter_batch(batch, percentile):
    disc_rewards = list(map(lambda s: s.reward * (GAMMA ** len(s.steps)), batch))
    reward_bound = np.percentile(disc_rewards, percentile)

    train_obs = []
    train_act = []
    elite_batch = []
    for example, discounted_reward in zip(batch, disc_rewards):
        if discounted_reward > reward_bound:
            train_obs.extend(map(lambda step: step.observation, example.steps))
            train_act.extend(map(lambda step: step.action, example.steps))
            elite_batch.append(example)

    return elite_batch, train_obs, train_act, reward_bound

if name == "main": parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=False, action='store_true', help="Enable cuda computation") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu")

random.seed(12345)
env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))
# env = gym.wrappers.Monitor(env, directory="mon", force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(obs_size, HIDDEN_SIZE, n_actions).to(device)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)
writer = SummaryWriter(comment="-frozenlake-tweaked")

full_batch = []
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    reward_mean = float(np.mean(list(map(lambda s: s.reward, batch))))
    full_batch, obs, acts, reward_bound = filter_batch(full_batch + batch, PERCENTILE)
    if not full_batch:
        continue
    #obs_v = torch.FloatTensor(obs)#, device=device)
    #acts_v = torch.LongTensor(acts)#, device=device)
    obs_v = torch.tensor(obs).to(device)
    acts_v = torch.tensor(acts).to(device)

    full_batch = full_batch[-500:]

    optimizer.zero_grad()
    action_scores_v = net(obs_v)
    loss_v = objective(action_scores_v, acts_v)
    loss_v.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f, batch=%d" % (
        iter_no, loss_v.item(), reward_mean, reward_bound, len(full_batch)))
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_mean", reward_mean, iter_no)
    writer.add_scalar("reward_bound", reward_bound, iter_no)
    if reward_mean > 0.8:
        print("Solved!")
        break
writer.close()
Shmuma commented 5 years ago

No, the net is too small to benefit from GPU parallelization.
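A rough way to verify this is to time forward passes of the same two-layer, 128-unit network on both devices; for a model this small, kernel-launch and host-to-device copy overhead usually swallows any gain. A hedged sketch (illustrative only; the helper below and the batch size are my own choices, and the numbers depend entirely on hardware):

import time
import torch
import torch.nn as nn

def time_forward(device, obs_size=16, hidden=128, n_actions=4, iters=1000):
    # Same shape as the example net for FrozenLake: Linear -> ReLU -> Linear.
    net = nn.Sequential(nn.Linear(obs_size, hidden), nn.ReLU(),
                        nn.Linear(hidden, n_actions)).to(device)
    x = torch.randn(100, obs_size).to(device)   # one batch of observations
    if device.type == "cuda":
        torch.cuda.synchronize()                # make the timing fair
    start = time.time()
    with torch.no_grad():
        for _ in range(iters):
            net(x)
    if device.type == "cuda":
        torch.cuda.synchronize()
    return time.time() - start

print("cpu :", time_forward(torch.device("cpu")))
if torch.cuda.is_available():
    print("cuda:", time_forward(torch.device("cuda")))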
