spragunr / deep_q_rl

Theano-based implementation of Deep Q-learning
BSD 3-Clause "New" or "Revised" License

UnusedInputError #58

Open uniwf2016 opened 8 years ago

uniwf2016 commented 8 years ago

Hi,

Thanks for reading this post.

Currently, I am trying to create my own network for reinforcement learning. To this end, I have adapted the Q network from Mnih et al., "Playing Atari with Deep Reinforcement Learning" (2013), and Mnih, Volodymyr, et al., "Human-level control through deep reinforcement learning," Nature 518.7540 (2015): 529-533.

When Theano tries to compile the functions for loss and q_vals,

    self._train = theano.function([], [loss, q_vals], updates=updates,
                                  givens=givens_train)
    self._q_vals = theano.function([], q_vals, givens=givens_q_val)

it keeps raising UnusedInputError: theano.function was asked to create a function computing outputs given certain inputs, but the provided input variable at index 0 is not part of the computational graph needed to compute the outputs: <CudaNdarrayType(float32, 4D)>. To make this error into a warning, you can pass the parameter on_unused_input='warn' to theano.function. To disable it completely, use on_unused_input='ignore'.

I have debugged the code many times, but I cannot understand why the inputs (supplied via givens) are not treated as part of the function's computation.
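
For reference, here is a minimal standalone sketch (a toy example of mine, not code from this project) that should trigger the same error: y is supplied through givens, but the output z does not depend on it, so theano.function complains unless on_unused_input is relaxed.

    import numpy as np
    import theano
    import theano.tensor as T

    x = T.vector('x')
    y = T.vector('y')  # declared but never used in the output expression
    z = (x * 2).sum()

    x_shared = theano.shared(np.ones(4, dtype=theano.config.floatX))
    y_shared = theano.shared(np.ones(4, dtype=theano.config.floatX))

    # This raises UnusedInputError, because y is not part of the graph of z:
    # f = theano.function([], z, givens={x: x_shared, y: y_shared})

    # It compiles once the unused input is downgraded to a warning:
    f = theano.function([], z, givens={x: x_shared, y: y_shared},
                        on_unused_input='warn')
    print(f())  # 8.0

In my network, of course, all of the givens are supposed to be used, so something must be disconnecting them from the graph built by lasagne.layers.get_output.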

Many thanks in advance for your explanation.

Here is my full source code for the network:

""" import lasagne import numpy as np import theano import theano.tensor as T from updates import deepmind_rmsprop import logging

class DeepQLearner:
    """ Deep Q-learning network using Lasagne. """

def __init__(self, width_img, height_img, width_loc, height_loc,
             width_his, height_his, target_dis_size, num_actions,
             num_frames, discount, learning_rate, rho, rms_epsilon,
             momentum, clip_delta, freeze_interval, batch_size,
             network_type, update_rule, batch_accumulator, rng,
             input_scale=8.0):

    self.width_img = width_img
    self.height_img = height_img
    self.width_loc = width_loc
    self.height_loc = height_loc
    self.width_his = width_his
    self.height_his = height_his
    self.target_dis_size = target_dis_size

    self.num_actions = num_actions
    self.num_frames = num_frames
    self.batch_size = batch_size
    self.discount = discount
    self.rho = rho
    self.lr = learning_rate
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self.rng = rng

    self.logger = logging.getLogger(__name__)
    if not getattr(self.logger, 'handler_set', None):

        self.logger.setLevel(logging.DEBUG)
        # create a file handler

        handler = logging.FileHandler('toy.log', mode='a')
        handler.setLevel(logging.DEBUG)

        # create a logging format

        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)

        # add the handlers to the logger

        self.logger.addHandler(handler)
        self.logger.handler_set = True
    self.logger.info('initialise a Q network.')

    lasagne.random.set_rng(self.rng)

    self.update_counter = 0

    self.l_out = self.build_network(network_type, num_actions, num_frames, batch_size)
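    # A second, periodically-synced copy of the network serves as the
    # frozen target network from the Nature paper.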
    if self.freeze_interval > 0:
        self.next_l_out = self.build_network(network_type, num_actions,
                                             num_frames, batch_size)
        self.reset_q_hat()

    #states = T.tensor4('states')
    #next_states = T.tensor4('next_states')
    imgs = T.tensor4('imgs')
    next_imgs = T.tensor4('next_imgs')
    locs = T.tensor4('locs')
    next_locs = T.tensor4('next_locs')
    hiss = T.tensor4('hiss')
    next_hiss = T.tensor4('next_hiss')

    # target_distribution = T.tensor('target_distribution')
    # next_target_distribution = T.tensor('next_target_distribution')

    sds = T.icol('sds')
    next_sds = T.icol('next_sds')

    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')

    # self.states_shared = theano.shared(
    #     np.zeros((batch_size, num_frames, input_height, input_width),
    #              dtype=theano.config.floatX))
    # self.next_states_shared = theano.shared(
    #     np.zeros((batch_size, num_frames, input_height, input_width),
    #              dtype=theano.config.floatX))

    self.imgs_shared = theano.shared(
            np.zeros((batch_size, num_frames, width_img, height_img),
                     dtype=theano.config.floatX))
    self.next_imgs_shared = theano.shared(
            np.zeros((batch_size, num_frames, width_img, height_img),
                     dtype=theano.config.floatX))

    self.locs_shared = theano.shared(
            np.zeros((batch_size, num_frames, width_loc, height_loc),
                     dtype=theano.config.floatX))
    self.next_locs_shared = theano.shared(
            np.zeros((batch_size, num_frames, width_loc, height_loc),
                     dtype=theano.config.floatX))
    self.hiss_shared = theano.shared(
            np.zeros((batch_size, num_frames, width_his, height_his),
                     dtype=theano.config.floatX))
    self.next_hiss_shared = theano.shared(
            np.zeros((batch_size, num_frames, width_his, height_his),
                     dtype=theano.config.floatX))
    self.sds_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))
    self.next_sds_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))

    self.rewards_shared = theano.shared(
        np.zeros((batch_size, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))

    self.actions_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))

    self.terminals_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))
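
    # These shared buffers live on the GPU; theano.function substitutes
    # them for the symbolic inputs via `givens`, so the compiled functions
    # take no explicit arguments.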

    # q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)

    # massage/unpack the states into the right form for the multi-input network
    q_vals = lasagne.layers.get_output(self.l_out, {'l_in': imgs,
                                                    'l_loc1': locs,
                                                    'l_his': hiss,
                                                    'l_dis': sds})

    if self.freeze_interval > 0:
        # next_q_vals = lasagne.layers.get_output(self.next_l_out,
        #                                         next_states / input_scale)
        next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                {'l_in': next_imgs,
                                                 'l_loc1': next_locs,
                                                 'l_his': next_hiss,
                                                 'l_dis': next_sds})
    else:
        next_q_vals = lasagne.layers.get_output(self.l_out,
                                                {'l_in': next_imgs,
                                                 'l_loc1': next_locs,
                                                 'l_his': next_hiss,
                                                 'l_dis': next_sds})

        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

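    # Bellman target r + discount * max_a' Q(s', a') (zeroed after terminal
    # transitions), then pick out Q(s, a) for the actions actually taken.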
    target = (rewards +
              (T.ones_like(terminals) - terminals) *
              self.discount * T.max(next_q_vals, axis=1, keepdims=True))
    diff = target - q_vals[T.arange(batch_size),
                           actions.reshape((-1,))].reshape((-1, 1))

    if self.clip_delta > 0:
        # If we simply take the squared clipped diff as our loss,
        # then the gradient will be zero whenever the diff exceeds
        # the clip bounds. To avoid this, we extend the loss
        # linearly past the clip point to keep the gradient constant
        # in that regime.
        # 
        # This is equivalent to declaring d loss/d q_vals to be
        # equal to the clipped diff, then backpropagating from
        # there, which is what the DeepMind implementation does.
        quadratic_part = T.minimum(abs(diff), self.clip_delta)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
    else:
        loss = 0.5 * diff ** 2

    if batch_accumulator == 'sum':
        loss = T.sum(loss)
    elif batch_accumulator == 'mean':
        loss = T.mean(loss)
    else:
        raise ValueError("Bad accumulator: {}".format(batch_accumulator))

    params = lasagne.layers.helper.get_all_params(self.l_out)  
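    # Every variable used as a key in `givens` must occur in the graph of
    # the requested outputs/updates; otherwise theano.function raises the
    # UnusedInputError quoted above.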
    givens_train = {
        # states: self.states_shared,
        # next_states: self.next_states_shared,
        imgs: self.imgs_shared,
        next_imgs: self.next_imgs_shared,
        locs: self.locs_shared,
        next_locs: self.next_locs_shared,
        hiss: self.hiss_shared,
        next_hiss: self.next_hiss_shared,
        sds: self.sds_shared,
        next_sds: self.next_sds_shared,
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }
    givens_q_val = {
        # states: self.states_shared,
        # next_states: self.next_states_shared,
        imgs: self.imgs_shared,
        locs: self.locs_shared,
        hiss: self.hiss_shared,
        sds: self.sds_shared
        # rewards: self.rewards_shared,
        # actions: self.actions_shared,
        # terminals: self.terminals_shared
    }
    if update_rule == 'deepmind_rmsprop':
        updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                   self.rms_epsilon)
    elif update_rule == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                          self.rms_epsilon)
    elif update_rule == 'sgd':
        updates = lasagne.updates.sgd(loss, params, self.lr)
    else:
        raise ValueError("Unrecognized update: {}".format(update_rule))

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None,
                                                 self.momentum)

    self._train = theano.function([], [loss, q_vals], updates=updates,
                                  givens=givens_train)
    self._q_vals = theano.function([], q_vals,
                                   givens=givens_q_val)

def build_network(self, network_type, output_dim, num_frames, batch_size):
    if network_type == "myOwn":
        return self.build_myNetwork(output_dim, num_frames, batch_size)
    else:
        raise ValueError("Unrecognized network: {}".format(network_type))

def build_myNetwork(self, output_dim, num_frames, batch_size):

    from lasagne.layers import dnn
    l_in = lasagne.layers.InputLayer(
        shape=(batch_size, num_frames, self.width_img, self.height_img)
    )

    l_conv1 = dnn.Conv2DDNNLayer(
        l_in,
        num_filters=32,
        filter_size=(8, 8),
        stride=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeUniform(),
        b=lasagne.init.Constant(.1)
    )

    l_conv2 = dnn.Conv2DDNNLayer(
        l_conv1,
        num_filters=64,
        filter_size=(4, 4),
        stride=(1, 1),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeUniform(),
        b=lasagne.init.Constant(.1)
    )

    l_conv3 = dnn.Conv2DDNNLayer(
        l_conv2,
        num_filters=64,
        filter_size=(3, 3),
        stride=(1, 1),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeUniform(),
        b=lasagne.init.Constant(.1)
    )

    l_loc1 = lasagne.layers.InputLayer(
        shape=(batch_size, num_frames, self.width_loc, self.height_loc)
    )

    n = 64
    l_loc2 = lasagne.layers.DenseLayer(l_loc1, num_units=n)

    #history = np.zeros((batch_size, num_frames, 4, 24*24), dtype=int)

    l_his = lasagne.layers.InputLayer(
        shape=(batch_size, num_frames, self.width_his, self.height_his)
    )

    l_his2 = lasagne.layers.DenseLayer(l_his, num_units=n)

    l_dis = lasagne.layers.InputLayer(
        shape=(batch_size, num_frames, self.target_dis_size)
    )

    l_dis2 = lasagne.layers.DenseLayer(l_dis, num_units=n)

    l_conv4 = lasagne.layers.ReshapeLayer(l_conv3, (batch_size, 1, -1))
    l_loc2 = lasagne.layers.ReshapeLayer(l_loc2, (batch_size, 1, -1))
    l_his2 = lasagne.layers.ReshapeLayer(l_his2, (batch_size, 1, -1))
    l_dis2 = lasagne.layers.ReshapeLayer(l_dis2, (batch_size, 1, -1))
    l_merge = lasagne.layers.ElemwiseSumLayer((l_conv4, l_loc2, l_his2, l_dis2))
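    # Note: ElemwiseSumLayer requires all incoming layers to have identical
    # output shapes, so the four reshaped branches above must flatten to
    # the same length.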

    print(l_conv4.output_shape)
    print(l_loc2.output_shape)
    print(l_his2.output_shape)
    print(l_dis2.output_shape)
    print(l_merge.output_shape)

    l_hidden1 = lasagne.layers.DenseLayer(
        l_merge,
        num_units=320,
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeUniform(),
        b=lasagne.init.Constant(.1)
    )
    #
    l_out = lasagne.layers.DenseLayer(
        l_hidden1,
        num_units=output_dim,
        nonlinearity=None,
        W=lasagne.init.HeUniform(),
        b=lasagne.init.Constant(.1)
    )
    return l_out
def train(self, imgs, next_imgs, locs, next_locs, hiss, next_hiss,
          sds, next_sds, actions, rewards, terminals):
    """
    Train one batch.

    Arguments:

    imgs, next_imgs - b x f x h x w numpy arrays, where b is batch size,
             f is num frames, h is height and w is width
    locs, next_locs - b x f x h x w numpy arrays of location inputs
    hiss, next_hiss - b x f x h x w numpy arrays of history inputs
    sds, next_sds - b x 1 numpy arrays of integers
    actions - b x 1 numpy array of integers
    rewards - b x 1 numpy array
    terminals - b x 1 numpy boolean array (currently ignored)

    Returns: average loss
    """
    self.imgs_shared.set_value(imgs)
    self.next_imgs_shared.set_value(next_imgs)
    self.locs_shared.set_value(locs)
    self.next_locs_shared.set_value(next_locs)
    self.hiss_shared.set_value(hiss)
    self.next_hiss_shared.set_value(next_hiss)
    self.sds_shared.set_value(sds)
    self.next_sds_shared.set_value(next_sds)

    # self.states_shared.set_value(states)
    # self.next_states_shared.set_value(next_states)

    self.actions_shared.set_value(actions)
    self.rewards_shared.set_value(rewards)
    self.terminals_shared.set_value(terminals)
    if (self.freeze_interval > 0 and
        self.update_counter % self.freeze_interval == 0):
        self.reset_q_hat()
    loss, _ = self._train()
    self.update_counter += 1
    return np.sqrt(loss)

def q_vals(self, img, loc, his, sd):
    # states = np.zeros((self.batch_size, self.num_frames,
    #                    self.input_height, self.input_width),
    #                   dtype=theano.config.floatX)
    # states[0, ...] = state
    # self.states_shared.set_value(states)

    # Buffers follow the (width, height) ordering used for the shared
    # variables in __init__.
    imgs = np.zeros((self.batch_size, self.num_frames, self.width_img,
                     self.height_img), dtype=theano.config.floatX)
    imgs[0, ...] = img
    locs = np.zeros((self.batch_size, self.num_frames, self.width_loc,
                     self.height_loc), dtype=theano.config.floatX)
    locs[0, ...] = loc

    hiss = np.zeros((self.batch_size, self.num_frames, self.width_his,
                     self.height_his), dtype=theano.config.floatX)
    hiss[0, ...] = his

    sds = np.zeros((self.batch_size, self.num_frames, self.target_dis_size),
                   dtype='int32')
    sds[0, ...] = sd

    self.imgs_shared.set_value(imgs)
    self.locs_shared.set_value(locs)
    self.hiss_shared.set_value(hiss)
    self.sds_shared.set_value(sds)

    return self._q_vals()[0]

def choose_action(self, img, loc, his, sd, epsilon):
    if self.rng.rand() < epsilon:
        return self.rng.randint(0, self.num_actions)
    q_vals = self.q_vals(img, loc, his, sd)
    return np.argmax(q_vals)

def reset_q_hat(self):
    all_params = lasagne.layers.helper.get_all_param_values(self.l_out)
    lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params)

def main():
    # net = DeepQLearner(84, 84, 16, 4, .99, .00025, .95, .95, 10000,
    #                    32, 'nature_cuda')

    width_img = 24
    height_img = 24
    width_loc = 1
    height_loc = 3
    width_his = width_img * height_img
    height_his = 4
    target_dis_size = 1
    num_actions = 9
    num_frames = 1
    discount = 0.99
    learning_rate = .00025
    rho = 0.95
    rms_epsilon = 0.95
    momentum = 0.95
    clip_delta = 1
    freeze_interval = 100
    batch_size = 100
    network_type = 'myOwn'
    update_rule = 'deepmind_rmsprop'
    batch_accumulator = 'sum'
    rng = np.random.RandomState(123456)

    net = DeepQLearner(width_img, height_img, width_loc, height_loc,
                       width_his, height_his, target_dis_size,
                       num_actions, num_frames, discount, learning_rate,
                       rho, rms_epsilon, momentum, clip_delta,
                       freeze_interval, batch_size, network_type,
                       update_rule, batch_accumulator, rng)

if __name__ == '__main__':
    main()