tensorflow / tfjs

A WebGL accelerated JavaScript library for training and deploying ML models.
https://js.tensorflow.org
Apache License 2.0

DQN not learning #8317

Open AhdHazim opened 6 days ago

AhdHazim commented 6 days ago

Hi everyone,

I am using the following DQN agent and it is not learning. Could you please let me know if I missed something?

Here is the DQN code:

```python
import numpy as np
import tensorflow as tf
import json
import os
import math


class DQN:
    def __init__(self, n_actions, n_features, lr=0.001, reward_decay=0.9,
                 e_greedy=0.9, epsilon_min=0.01, replace_target_iter=300,
                 memory_size=10000, batch_size=32, e_greedy_decay=1e-5):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = lr
        epsilon_max = 0.9
        self.gamma = reward_decay
        self.epsilon_decay = e_greedy_decay
        self.epsilon_min = epsilon_min
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.learn_step_counter = 0
        # replay memory: each row stores [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
        self.loss_history = []
        self.reward_history = []
        self.epsilon = 0 if self.epsilon_decay is not None else self.epsilon_max

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.compat.v1.Session()
            self._build_net()
            self.sess.run(tf.compat.v1.global_variables_initializer())

    def _build_net(self):
        self.s = tf.compat.v1.placeholder(tf.float32, [None, self.n_features], name='s')
        self.s_ = tf.compat.v1.placeholder(tf.float32, [None, self.n_features], name='s_')
        self.r = tf.compat.v1.placeholder(tf.float32, [None,], name='r')
        self.a = tf.compat.v1.placeholder(tf.int32, [None,], name='a')

        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)

        with tf.compat.v1.variable_scope('eval_net'):
            e1 = tf.compat.v1.layers.dense(self.s, 20, tf.nn.relu, kernel_initializer=w_initializer,
                                           bias_initializer=b_initializer, name='e1')
            self.q_eval = tf.compat.v1.layers.dense(e1, self.n_actions, kernel_initializer=w_initializer,
                                                    bias_initializer=b_initializer, name='q')

        with tf.compat.v1.variable_scope('target_net'):
            t1 = tf.compat.v1.layers.dense(self.s_, 20, tf.nn.relu, kernel_initializer=w_initializer,
                                           bias_initializer=b_initializer, name='t1')
            self.q_next = tf.compat.v1.layers.dense(t1, self.n_actions, kernel_initializer=w_initializer,
                                                    bias_initializer=b_initializer, name='t2')

        with tf.compat.v1.variable_scope('q_target'):
            q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_')
            self.q_target = tf.stop_gradient(q_target)

        with tf.compat.v1.variable_scope('q_eval'):
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)

        with tf.compat.v1.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.square(self.q_target - self.q_eval_wrt_a), name='TD_error')

        with tf.compat.v1.variable_scope('train'):
            self._train_op = tf.compat.v1.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        self.target_replace_op = [tf.compat.v1.assign(t, e) for t, e in zip(
            tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='target_net'),
            tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='eval_net'))]

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))
        # learn_step_counter also serves as the write index into the replay memory
        index = self.learn_step_counter % self.memory_size
        self.memory[index, :] = transition
        self.learn_step_counter += 1

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]
        # epsilon is the probability of taking the greedy (learned) action
        if np.random.uniform() < self.epsilon:
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        # periodically copy the eval-net weights into the target net
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.target_replace_op)

        sample_index = np.random.choice(min(self.memory_size, self.learn_step_counter), size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        _, cost = self.sess.run(
            [self._train_op, self.loss],
            feed_dict={
                self.s: batch_memory[:, :self.n_features],
                self.a: batch_memory[:, self.n_features].astype(int),
                self.r: batch_memory[:, self.n_features + 1],
                self.s_: batch_memory[:, -self.n_features:],
            })

        self.epsilon = max(self.epsilon / (1 + self.epsilon_decay), self.epsilon_min)
        # self.epsilon = max(0, self.epsilon_max - math.exp(-1 * self.learn_step_counter / self.epsilon_decay))

        return cost
```
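
For reference, this is roughly how I drive the agent; the environment in the sketch below is just a stand-in with a Gym-style `reset()`/`step()` interface, not my actual one:

```python
# Rough outline of my training loop. DummyEnv, the feature/action sizes and the
# episode count are placeholders standing in for my real environment.
import numpy as np

class DummyEnv:
    def __init__(self, n_features=8, n_actions=4, horizon=50):
        self.n_features, self.n_actions, self.horizon = n_features, n_actions, horizon
    def reset(self):
        self.t = 0
        return np.random.randn(self.n_features)
    def step(self, action):
        self.t += 1
        # random next state and reward; the episode ends after `horizon` steps
        return np.random.randn(self.n_features), np.random.rand(), self.t >= self.horizon

env = DummyEnv()
agent = DQN(n_actions=env.n_actions, n_features=env.n_features)

for episode in range(100):
    s = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        a = agent.choose_action(s)
        s_, r, done = env.step(a)
        agent.store_transition(s, a, r, s_)
        # update only once at least one batch of transitions has been stored
        if agent.learn_step_counter > agent.batch_size:
            agent.learn()
        s = s_
        episode_reward += r
    agent.reward_history.append(episode_reward)
```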

Here is the average reward result:

(attached plot: final DQN results)

gaikwadrahul8 commented 4 days ago

Hi, @AhdHazim

I apologize for the delayed response. If I'm not wrong, it seems like you're not using the TensorFlow.js library but rather core TensorFlow, so I would suggest you follow this official example, which shows how to train a DQN (Deep Q-Network) agent on the Cartpole environment using the TF-Agents library.
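
For orientation, the agent setup in that tutorial looks roughly like the sketch below (module paths and arguments may differ slightly between TF-Agents versions):

```python
# Sketch of the TF-Agents DQN setup from the Cartpole tutorial; exact APIs can
# vary with the installed TF-Agents version.
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import q_network
from tf_agents.utils import common

# Load Cartpole from Gym and wrap it as a TensorFlow environment.
train_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))

# Small fully connected Q-network mapping observations to per-action Q-values.
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=(100,))

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=tf.Variable(0))
agent.initialize()
```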

You can also check the usual points for DQN learning, such as the exploration (epsilon) schedule, replay-buffer warm-up before training starts, the target-network update frequency, and the learning rate; a generic sketch of the exploration point follows below.

By checking these points and potentially modifying the code, you can improve the learning behavior of your DQN agent. Remember that training DQN agents is an iterative process and may require adjustments based on your specific environment and task.
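
As a generic illustration of the exploration point only (the values and convention here are hypothetical, not taken from your code), a common pattern is to treat epsilon as the probability of acting randomly and anneal it from 1.0 down to a small floor:

```python
import numpy as np

# Hypothetical epsilon-greedy annealing: epsilon is the probability of acting
# randomly and decays linearly from `start` to `end` over `decay_steps` steps.
def epsilon_at(step, start=1.0, end=0.05, decay_steps=10000):
    frac = min(step / decay_steps, 1.0)
    return start + frac * (end - start)

def select_action(q_values, step, n_actions):
    if np.random.uniform() < epsilon_at(step):
        return np.random.randint(n_actions)   # explore
    return int(np.argmax(q_values))           # exploit
```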

If this issue is not specific to TensorFlow.js and it is still not resolved after trying the points mentioned above, then I would request you to post it in the core TensorFlow repo here.

Thank you for your cooperation and patience.