openai / gym

A toolkit for developing and comparing reinforcement learning algorithms.
https://www.gymlibrary.dev
Other
34.8k stars 8.61k forks source link

Confused how the Monitor works #503

Closed vishiswoz closed 7 years ago

vishiswoz commented 7 years ago

This is my current code with monitoring; however, the while loop never terminates — avg_reward stays stuck between 60 and 70.

import gym
from pybrain.structure import FeedForwardNetwork
from pybrain.structure import LinearLayer, SigmoidLayer
from pybrain.structure import FullConnection
from pybrain.datasets import SupervisedDataSet
from pybrain.structure import BiasUnit
from pybrain.supervised.trainers import BackpropTrainer
import time
import numpy as np
import copy
import random
import operator
from gym import wrappers

# Genetic-algorithm hyperparameters.
poolSize = 14;  # number of genomes (networks) per generation
pool = [];  # current population; filled by initList() and rebuilt each generation
elitismSize = 4;  # top-fitness genomes copied unchanged into the next generation
crossRate = 0.9;  # probability that crossOver() actually mixes two parents
mutRate = 0.2;  # probability that mutate() perturbs a genome's weights

def newNet(inLen, outLen):
    """Build a PyBrain feed-forward net: inLen linear inputs -> 10 sigmoid
    hidden units -> outLen linear outputs, with a bias unit feeding the
    hidden layer. Returns the network ready to activate (sortModules called).
    """
    net = FeedForwardNetwork()

    bias_unit = BiasUnit(name='bias')
    net.addModule(bias_unit)

    layer_in = LinearLayer(inLen, name='in')
    layer_hidden = SigmoidLayer(10, name='hidden')
    layer_out = LinearLayer(outLen, name='out')

    net.addInputModule(layer_in)
    net.addModule(layer_hidden)
    net.addOutputModule(layer_out)

    # Wire up: input->hidden, bias->hidden, hidden->output (same order as before).
    for connection in (FullConnection(layer_in, layer_hidden),
                       FullConnection(bias_unit, layer_hidden),
                       FullConnection(layer_hidden, layer_out)):
        net.addConnection(connection)

    net.sortModules()
    return net

def crossOver(parent1, parent2):
    """One-point crossover on the parents' bias->hidden weight vectors.

    With probability crossRate, returns two deep-copied children whose
    bias-connection parameters are exchanged past a random cut point;
    otherwise returns the parents unchanged.
    """
    global crossRate
    if random.random() > crossRate:
        return parent1, parent2

    child1 = copy.deepcopy(parent1)
    child2 = copy.deepcopy(parent2)

    weights1 = parent1.connections[parent1['bias']][0].params[:]
    weights2 = parent2.connections[parent2['bias']][0].params[:]

    # randint is inclusive, so the cut may fall at either end (no-op swap).
    cut = random.randint(0, len(weights1))

    mixed1 = np.concatenate((weights1[:cut], weights2[cut:]))
    mixed2 = np.concatenate((weights2[:cut], weights1[cut:]))

    conn1 = child1.connections[child1['bias']][0]
    conn2 = child2.connections[child2['bias']][0]
    conn1._setParameters(mixed1, conn1.owner)
    conn2._setParameters(mixed2, conn2.owner)

    return child1, child2

def mutate(parent):
    """With probability mutRate, jitter every weight of the genome's bias,
    input and hidden connections in place (multiplicative plus additive noise).
    """
    global mutRate
    if random.random() > mutRate:
        return
    # Same module order as before: bias, then in, then hidden; two random
    # draws per weight, multiplicative term drawn first.
    for layer_key in ('bias', 'in', 'hidden'):
        weights = parent.connections[parent[layer_key]][0].params
        for i in range(len(weights)):
            weights[i] += weights[i] * (random.random() - 0.5) * 3 + (random.random() - 0.5)

def selection(pool):
    """Roulette-wheel selection: remove and return one genome from pool,
    with probability proportional to fitness. Scans from the back of the
    list; falls back to popping the last element if the wheel never trips.
    """
    total = 0.0
    for genome in pool:
        total += genome.fitness
    threshold = float(total) * random.random()
    running = 0.0
    for idx in reversed(range(len(pool))):
        running += pool[idx].fitness
        if running >= threshold:
            # Remove the selected genome so it can't be picked twice.
            return pool.pop(idx)
    return pool.pop()

# Dead code: this module-level triple-quoted string is a disabled scratch
# snippet for inspecting and overwriting a network's bias-connection params.
# It is never executed (evaluated as a bare string literal and discarded).
'''
nn =  newNet(4, 2);
print nn['in'];
print nn['hidden'];
print nn['out'];
print nn.connections[nn['bias']]
print type(nn.connections[nn['bias']][0].params);
xxx = np.concatenate((nn.connections[nn['bias']][0].params[0:3], nn.connections[nn['bias']][0].params[3:]));
print xxx;
vvv = np.array([0]*len(nn.connections[nn['bias']][0].params));
for _ in range(len(nn.connections[nn['bias']][0].params)):
    nn.connections[nn['bias']][0].params[_] = 1;
ooo = nn.connections[nn['bias']][0].owner;
nn.connections[nn['bias']][0]._setParameters(vvv, ooo);
print nn.connections[nn['bias']][0].params;
'''

def initList():
    """Populate the global pool with poolSize fresh random networks sized to
    the global env's observation and action spaces.
    """
    global pool
    for _ in range(poolSize):
        pool.append(newNet(len(env.observation_space.high), env.action_space.n))

# Main GA loop: evaluate each genome for one monitored CartPole episode,
# then breed a new generation, until the 100-episode average reward >= 195.
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, '/tmp/cartpole-experiment-1', force=True)
# Per-episode step cap read from the env spec's tags; presumably 200 for
# CartPole-v0 — TODO confirm against the installed gym version.
maxxy = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
episode_num = 0;
first = True;
cum_reward = [];  # rolling window of the last 100 episode rewards
avg_reward = float(0);
gen_num = 0;

if first:
    initList();
    first = False;

while True:
    newpool = [];
    print "Testing Generation #" + str(gen_num);
    # Evaluate every genome for one episode; fitness = total episode reward.
    for _ in xrange(poolSize):
        genome = pool[_];
        ep_reward = 0.0;
        observation = env.reset()
        for __ in xrange(maxxy):
            #env.render()
            # Greedy action: index of the largest network output.
            result = genome.activate(observation)
            action = np.argmax(result)
            #print(observation)
            #action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            ep_reward += reward;
            if done:
                #print("Episode finished after {} timesteps".format(t+1))
                episode_num += 1;
                break
        cum_reward.append(ep_reward);
        genome.fitness = ep_reward;

    # Average over the most recent 100 episodes only.
    if len(cum_reward) >= 100:
        cum_reward = cum_reward[-100:];
        avg_reward = np.mean(cum_reward[:]);
        print "Average Award:", avg_reward;

    # CartPole-v0's "solved" threshold.
    if avg_reward >= 195.0:
        #we did it;
        print("Success achieved after {} episodes".format(episode_num)); 
        break;

    #all gens done create new gen
    # Elitism: carry the top elitismSize genomes over unchanged; breed the rest.
    sortedpop = sorted(pool, key=lambda x: x.fitness, reverse=True);
    elite = sortedpop[0:elitismSize];
    rest = sortedpop[elitismSize:];

    # Each iteration pops two parents from rest via selection(), so this
    # loop consumes rest pairwise. NOTE(review): assumes len(rest) is even
    # (it is here: poolSize 14 - elitismSize 4 = 10) — odd sizes would pop
    # from an empty list on the last pair.
    for m in xrange(len(rest)-1,-1,-2):
        p1 = selection(rest);
        p2 = selection(rest);
        ch1, ch2 = crossOver(p1, p2);
        mutate(ch1);
        mutate(ch2);
        newpool.append(ch1);
        newpool.append(ch2);

    pool = [];
    # Next generation = elites + bred children (list concatenation).
    pool = reduce(operator.add, [elite, newpool]);
    gen_num += 1;

gym.upload('/tmp/cartpole-experiment-1', api_key='XXX')

However, if I comment out the lines

env = wrappers.Monitor(env, '/tmp/cartpole-experiment-1', force=True) gym.upload('/tmp/cartpole-experiment-1', api_key='sk_8lZfIkSRLSgKfmIAUomg')

which basically remove the monitor, my while loop terminates in about 4500-5000 episodes, yet with the monitor my while loop will not terminate at those episodes, what exactly is the issue?

tlbtlbtlb commented 7 years ago

Can you figure out how, for the same genome, the monitored version behaves differently? Does it return done=True earlier, or different observations, or different rewards?

vishiswoz commented 7 years ago

I think I figured it out: I was making the timestep equal to 1000 earlier, that's why it behaved differently: it took a lot longer for the desired average fitness of 195 to come when I changed the timestep to 200.

vishiswoz commented 7 years ago

Any ideas how to make the GA faster if I use a smaller timestep? Cause it's taking a very long time.

tlbtlbtlb commented 7 years ago

GAs parallelize very well. You can run each member of the population on a separate core for a 14x speedup. Python's multiprocessing module can do this on a single machine.

vishiswoz commented 7 years ago

Thanks for the tip, I actually figured out why it was taking so long, I was implementing the selection criterion wrong. Once I fixed it, my code went from running for 3 hours and getting an average fitness of 70 to getting the desired average fitness of 195.0 after 30-60 seconds. You can view it here: https://gym.openai.com/evaluations/eval_vNeUzkELQWyqq1zBaL9KAg. I'll try and upload the code as a gist in the next couple of hours.