DQN not learning - Githubissues

Hi everyone,

I am using the following DQN agent and it does not learning. Could you please let me know if i missed something.

Here is the DQN code:

`import numpy as np import tensorflow as tf import json import os import math

class DQN: def init(self, n_actions, n_features, lr= 0.001, reward_decay=0.9, e_greedy=0.9, epsilon_min=0.01, replace_target_iter=300, memory_size=10000, batch_size=32, e_greedy_decay=1e-5): self.n_actions = n_actions self.n_features = n_features self.lr = lr epsilon_max=0.9 self.gamma = reward_decay self.epsilon_decay = e_greedy_decay self.epsilon_min = epsilon_min self.replace_target_iter = replace_target_iter self.memory_size = memory_size self.batch_size = batch_size self.learn_step_counter = 0 self.memory = np.zeros((self.memory_size, n_features * 2 + 2)) self.loss_history = [] self.reward_history = [] self.epsilon = 0 if self.epsilon_decay is not None else self.epsilon_max

    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf.compat.v1.Session()
        self._build_net()
        self.sess.run(tf.compat.v1.global_variables_initializer())

def _build_net(self):
    self.s = tf.compat.v1.placeholder(tf.float32, [None, self.n_features], name='s')
    self.s_ = tf.compat.v1.placeholder(tf.float32, [None, self.n_features], name='s_')
    self.r = tf.compat.v1.placeholder(tf.float32, [None,], name='r')
    self.a = tf.compat.v1.placeholder(tf.int32, [None,], name='a')

    w_initializer = tf.random_normal_initializer(0., 0.3)
    b_initializer = tf.constant_initializer(0.1)

    with tf.compat.v1.variable_scope('eval_net'):
        e1 = tf.compat.v1.layers.dense(self.s, 20, tf.nn.relu, kernel_initializer=w_initializer,
                             bias_initializer=b_initializer, name='e1')
        self.q_eval = tf.compat.v1.layers.dense(e1, self.n_actions, kernel_initializer=w_initializer,
                                      bias_initializer=b_initializer, name='q')

    with tf.compat.v1.variable_scope('target_net'):
        t1 = tf.compat.v1.layers.dense(self.s_, 20, tf.nn.relu, kernel_initializer=w_initializer,
                             bias_initializer=b_initializer, name='t1')
        self.q_next = tf.compat.v1.layers.dense(t1, self.n_actions, kernel_initializer=w_initializer,
                                      bias_initializer=b_initializer, name='t2')

    with tf.compat.v1.variable_scope('q_target'):
        q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_')
        self.q_target = tf.stop_gradient(q_target)

    with tf.compat.v1.variable_scope('q_eval'):
        a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
        self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)

    with tf.compat.v1.variable_scope('loss'):
        self.loss = tf.reduce_mean(tf.square(self.q_target - self.q_eval_wrt_a), name='TD_error')

    with tf.compat.v1.variable_scope('train'):
        self._train_op = tf.compat.v1.train.RMSPropOptimizer(self.lr).minimize(self.loss)

    self.target_replace_op = [tf.compat.v1.assign(t, e) for t, e in zip(
        tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='target_net'),
        tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='eval_net'))]

def store_transition(self, s, a, r, s_):
    transition = np.hstack((s, [a, r], s_))
    index = self.learn_step_counter % self.memory_size
    self.memory[index, :] = transition
    self.learn_step_counter += 1

def choose_action(self, observation):
    observation = observation[np.newaxis, :]
    if np.random.uniform() < self.epsilon:
        actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
        action = np.argmax(actions_value)
    else:
        action = np.random.randint(0, self.n_actions)
    return action

def learn(self):
    if self.learn_step_counter % self.replace_target_iter == 0:
        self.sess.run(self.target_replace_op)

    sample_index = np.random.choice(min(self.memory_size, self.learn_step_counter), size=self.batch_size)
    batch_memory = self.memory[sample_index, :]

    _, cost = self.sess.run(
        [self._train_op, self.loss],
        feed_dict={
            self.s: batch_memory[:, :self.n_features],
            self.a: batch_memory[:, self.n_features].astype(int),
            self.r: batch_memory[:, self.n_features + 1],
            self.s_: batch_memory[:, -self.n_features:],
        })

    self.epsilon = max(self.epsilon / (1 + self.epsilon_decay), self.epsilon_min)
    # self.epsilon = max(0, self.epsilon_max - math.exp(-1 * self.learn_step_counter / self.epsilon_decay))

    return cost`

Here is the averge reward result: final DQN results

tensorflow / tfjs

DQN not learning #8317