PacktPublishing / Deep-Reinforcement-Learning-Hands-On-Second-Edition

Deep-Reinforcement-Learning-Hands-On-Second-Edition, published by Packt
MIT License
1.12k stars 529 forks source link

For CartPole in Chapter 4, this is the code that worked for me. #71

Open jupitermarketingagency opened 9 months ago

jupitermarketingagency commented 9 months ago

#!/usr/bin/env python3

from collections import namedtuple

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter


class Net(nn.Module):
    """Two-layer fully connected network mapping observations to action scores.

    The output is raw (unnormalized) scores, one per action; no softmax is
    applied here, so callers that need probabilities must apply it themselves.

    Args:
        obs_size: Number of input features per observation.
        hidden_size: Width of the single hidden layer.
        n_actions: Number of output scores (one per action).
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )

    def forward(self, x):
        """Return the action scores produced by the network for input ``x``."""
        return self.net(x)

# One finished episode: its accumulated reward and the list of steps taken.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One step inside an episode: the observation seen and the action chosen for it.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

def iterate_batches(env, net, batch_size):
    """Play episodes with ``net`` as a stochastic policy and yield them in batches.

    This is an infinite generator: it keeps stepping ``env`` and yields a list
    of ``batch_size`` finished ``Episode`` tuples each time enough episodes
    have been collected.

    Args:
        env: Environment whose ``reset()`` returns ``(observation, info)`` and
            whose ``step(action)`` returns a 5-tuple
            ``(observation, reward, terminated, truncated, info)``.
        net: Callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of action scores.
        batch_size: Number of finished episodes per yielded batch.

    Yields:
        list[Episode]: ``batch_size`` episodes, each holding its total reward
        and its list of ``EpisodeStep`` (observation, action) records.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs, _ = env.reset()
    env.render()
    sm = nn.Softmax(dim=1)
    while True:
        # Add a leading batch dimension: the network expects a 2-D input.
        obs_v = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)
        # Sampling only; no gradients are needed while collecting episodes.
        with torch.no_grad():
            act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        # An episode ends either by termination or by truncation (time limit).
        is_done = terminated or truncated
        episode_reward += reward
        # Record the observation the action was chosen for, not the result.
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs, _ = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

def filter_batch(batch, percentile):
    """Keep the best episodes of a batch and flatten them into training tensors.

    Episodes whose total reward is strictly below the ``percentile``-th
    percentile of the batch's rewards are dropped; the observations and actions
    of the remaining episodes are concatenated in batch order.

    Args:
        batch: Sequence of ``(reward, steps)`` episode tuples, where each step
            exposes ``observation`` and ``action`` attributes.
        percentile: Percentile (0-100) of episode rewards used as the cut-off.

    Returns:
        tuple: ``(train_obs_v, train_act_v, reward_bound, reward_mean)`` where
        ``train_obs_v`` is a float32 tensor of observations, ``train_act_v`` is
        an int64 tensor of the matching actions, ``reward_bound`` is the reward
        cut-off and ``reward_mean`` is the mean reward over the whole batch.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for reward, steps in batch:
        if reward < reward_bound:
            continue
        train_obs.extend(step.observation for step in steps)
        train_act.extend(step.action for step in steps)

    # Stack into one ndarray first: building a tensor straight from a list of
    # ndarrays is slow and triggers a PyTorch warning.
    train_obs_v = torch.tensor(np.asarray(train_obs), dtype=torch.float32)
    train_act_v = torch.tensor(train_act, dtype=torch.long)
    return train_obs_v, train_act_v, reward_bound, reward_mean

# Hyperparameters used by the training script below.
# NOTE(review): these constants are referenced but not defined in the visible
# code; the values here are assumed defaults -- confirm before relying on them.
HIDDEN_SIZE = 128   # width of the network's hidden layer
BATCH_SIZE = 16     # episodes collected per training iteration
PERCENTILE = 70     # reward percentile below which episodes are discarded

if __name__ == "__main__":
    env = gym.make("CartPole-v1", render_mode='human')
    # NOTE: the legacy `gym.wrappers.Monitor(env, directory="mon", force=True)`
    # wrapper is not available in gymnasium, so it is intentionally not applied.

    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)
    writer = SummaryWriter(comment="-cartpole")

    for iter_no, batch in enumerate(iterate_batches(
            env, net, BATCH_SIZE)):
        obs_v, acts_v, reward_b, reward_m = \
            filter_batch(batch, PERCENTILE)
        # One supervised step: push the network towards the actions taken in
        # the retained (best) episodes.
        optimizer.zero_grad()
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, acts_v)
        loss_v.backward()
        optimizer.step()
        print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
            iter_no, loss_v.item(), reward_m, reward_b))
        writer.add_scalar("loss", loss_v.item(), iter_no)
        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)
        # Stop once the batch's mean episode reward exceeds the target.
        if reward_m > 199:
            print("Solved!")
            break
    writer.close()
    env.close()
dkinneyBU commented 8 months ago

@jupitermarketingagency OH MY GOOOOOOODDDD! Thank you for this, I've been fighting this stupid program for THREE DAYS! This guy really needs to revisit this code, I've had to debug basically all of it with a few rare exceptions. And this is only Chapter 4!!!!

You are a life saver; if you conjure up any more fixes, please post them -- I will be eternally grateful. :-)

jupitermarketingagency commented 8 months ago

@dkinneyBU Glad to hear that was of help to you. Yes, I agree with you about him revisiting this code. In all the RL courses we've seen so far, this happens over and over again because the books are more than two years old, so we've been trying to focus only on books that have been published recently.

MFKruger commented 7 months ago

Thank you very much! Being a newbie to PyTorch and DRL, you saved me a lot of time!