tensorflow / tfjs

A WebGL accelerated JavaScript library for training and deploying ML models.
https://js.tensorflow.org
Apache License 2.0

Convergence of Actor-Critic algorithm #8357

Closed AhdHazim closed 2 weeks ago

AhdHazim commented 1 month ago

Hello,

I am using the following actor-critic agent, but it does not converge at all. Could you please help me with this? The code and the results follow.

```python
import numpy as np
import json

import tensorflow.compat.v1 as tf  # TF1-style graph-mode API

tf.disable_v2_behavior()
tf.reset_default_graph()


class A2CLSTM(object):
    def __init__(
        self,
        sess,
        n_actions,
        n_features,
        lr_a,
        lr_c,
        entropy_beta,
        batch_size=32  # default batch size
    ):
        self.sess = sess
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.entropy_beta = entropy_beta
        self.batch_size = batch_size

        self.lstm_cell_size = 64

        OPT_A = tf.train.AdamOptimizer(self.lr_a)
        OPT_C = tf.train.AdamOptimizer(self.lr_c)

        with tf.name_scope('inputs'):
            self.s = tf.placeholder(tf.float32, [None, self.n_features], "state")
            self.a = tf.placeholder(tf.int32, [None, 1], "action")
            self.td_target = tf.placeholder(tf.float32, [None, 1], "td_target")

        self.acts_prob, self.v, self.a_params, self.c_params = self._build_net()

        with tf.name_scope('TD_error'):
            self.td_error = tf.subtract(self.td_target, self.v, name='TD_error')

        with tf.name_scope('c_loss'):
            self.c_loss = tf.reduce_mean(tf.square(self.td_error))

        with tf.name_scope('a_loss'):
            log_prob = tf.reduce_sum(
                tf.log(self.acts_prob + 1e-5) * tf.one_hot(self.a, self.n_actions, dtype=tf.float32),
                axis=1, keepdims=True)
            exp_v = log_prob * tf.stop_gradient(self.td_error)
            entropy = -tf.reduce_sum(self.acts_prob * tf.log(self.acts_prob + 1e-5), axis=1,
                                     keepdims=True)  # encourage exploration
            self.exp_v = self.entropy_beta * entropy + exp_v
            self.a_loss = tf.reduce_mean(-self.exp_v)

        with tf.name_scope('compute_grads'):
            self.a_grads = tf.gradients(self.a_loss, self.a_params)
            self.c_grads = tf.gradients(self.c_loss, self.c_params)

        with tf.name_scope('c_train'):
            self.c_train_op = OPT_C.apply_gradients(zip(self.c_grads, self.c_params))

        with tf.name_scope('a_train'):
            self.a_train_op = OPT_A.apply_gradients(zip(self.a_grads, self.a_params))

        self.sess.run(tf.global_variables_initializer())

        # Initialize lists to store losses
        self.actor_loss_history = []
        self.critic_loss_history = []

    def _build_net(self):
        w_init = tf.random_normal_initializer(0., .1)
        b_init = tf.constant_initializer(0.1)

        with tf.variable_scope('Critic'):
            # [time_step, feature] => [time_step, batch, feature]
            s = tf.expand_dims(self.s, axis=1, name='timely_input')

            lstm_cell = tf.nn.rnn_cell.LSTMCell(self.lstm_cell_size)
            self.lstm_state_init = lstm_cell.zero_state(batch_size=1, dtype=tf.float32)

            outputs, _ = tf.nn.dynamic_rnn(
                cell=lstm_cell,
                inputs=s,
                initial_state=self.lstm_state_init,
                time_major=True
            )
            cell_out = tf.reshape(outputs[-1, :, :], [-1, self.lstm_cell_size],
                                  name='flatten_lstm_outputs')  # joined state representation

            l_c1 = tf.layers.dense(
                inputs=cell_out,
                units=32,
                activation=tf.nn.tanh,
                kernel_initializer=w_init,
                bias_initializer=b_init,
                name='l_c1'
            )

            v = tf.layers.dense(
                inputs=l_c1,
                units=1,
                kernel_initializer=w_init,
                bias_initializer=b_init,
                name='V'
            )  # state value

        with tf.variable_scope('Actor'):
            l_a1 = tf.layers.dense(
                inputs=cell_out,
                units=32,  # number of hidden units
                activation=tf.nn.tanh,  # the activation function
                kernel_initializer=w_init,  # weights
                bias_initializer=b_init,  # biases
                name='l_a1'
            )

            acts_prob = tf.layers.dense(
                inputs=l_a1,
                units=self.n_actions,  # output units
                activation=tf.nn.softmax,  # get action probabilities
                kernel_initializer=w_init,  # weights
                name='acts_prob'
            )
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')

        return acts_prob, v, a_params, c_params

    def choose_action(self, s):
        probs = self.sess.run(self.acts_prob, feed_dict={self.s: s})  # get probabilities for all actions
        a = np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())
        return a
    def learn(self, feed_dict):
        # Ensure that feed_dict contains batches of data
        _, _, actor_loss, critic_loss = self.sess.run(
            [self.a_train_op, self.c_train_op, self.a_loss, self.c_loss],
            feed_dict=feed_dict
        )

        # Append current losses to history
        self.actor_loss_history.append(float(actor_loss))
        self.critic_loss_history.append(float(critic_loss))

        # Save losses to JSON files every 100 iterations
        if len(self.actor_loss_history) % 100 == 0:
            actor_loss_filename = 'logs/actor_loss.json'
            critic_loss_filename = 'logs/critic_loss.json'

            with open(actor_loss_filename, 'w') as f:
                json.dump({'actor_loss': self.actor_loss_history}, f)

            with open(critic_loss_filename, 'w') as f:
                json.dump({'critic_loss': self.critic_loss_history}, f)


    def target_v(self, s):
        v = self.sess.run(self.v, {self.s: s})
        return v

```

Attached figures: Fig_1_Actor_loss, Fig_1_Critic_Loss, and the reward function plot.

For the learning rates, I have tried different values; the most recent settings are: Learning Rate Actor (LRA) = 0.005, Learning Rate Critic = 0.008, GAMMA = 0.9, ENTROPY_BETA = 0.01.
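
The training loop that builds the feed_dict for learn() is not included in the snippet above. For reference, below is a minimal sketch of how such an agent is typically driven with a one-step TD target; the DummyEnv, the feature/action sizes, and the episode limits are placeholder assumptions for illustration only, not part of the original code.

```python
# Minimal driver sketch for the A2CLSTM agent above, using a one-step TD target.
# DummyEnv, the feature/action sizes, and the episode limits are placeholders;
# substitute the real environment and dimensions.
import os
import numpy as np
import tensorflow.compat.v1 as tf

GAMMA = 0.9
os.makedirs('logs', exist_ok=True)  # learn() writes loss histories under logs/


class DummyEnv(object):
    """Stand-in environment so the sketch runs end to end."""
    def __init__(self, n_features=8, n_actions=4):
        self.n_features, self.n_actions, self.t = n_features, n_actions, 0

    def reset(self):
        self.t = 0
        return np.random.randn(self.n_features).astype(np.float32)

    def step(self, action):
        self.t += 1
        s_next = np.random.randn(self.n_features).astype(np.float32)
        reward = float(np.random.rand())
        done = self.t >= 50
        return s_next, reward, done, {}


env = DummyEnv()
sess = tf.Session()
agent = A2CLSTM(sess, n_actions=env.n_actions, n_features=env.n_features,
                lr_a=0.005, lr_c=0.008, entropy_beta=0.01)

for episode in range(50):
    s = env.reset()
    while True:
        a = agent.choose_action(s[np.newaxis, :])
        s_next, r, done, _ = env.step(a)

        # One-step TD target: r + GAMMA * V(s'), with V(s') = 0 at terminal states.
        v_next = 0.0 if done else agent.target_v(s_next[np.newaxis, :])[0, 0]
        td_target = np.array([[r + GAMMA * v_next]], dtype=np.float32)

        agent.learn({
            agent.s: s[np.newaxis, :],
            agent.a: np.array([[a]], dtype=np.int32),
            agent.td_target: td_target,
        })

        s = s_next
        if done:
            break
```

This only pins down the shapes that choose_action(), target_v(), and learn() expect; the real environment, reward scale, and batching are whatever the original experiment uses.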

shmishra99 commented 1 month ago

Hi @AhdHazim ,

Based on the code snippet you provided, the issue you're experiencing appears to be related to TensorFlow rather than TFJS specifically. Please open a new issue under the TensorFlow repository.

Thank You!!

AhdHazim commented 1 month ago

Hi,

Oh, thank you for letting me know!



shmishra99 commented 1 month ago

Sure, kindly close this issue. For additional support related to TensorFlow.js, please feel free to open a new issue.

Thank You!!

github-actions[bot] commented 3 weeks ago

This issue has been marked stale because it has no recent activity since 7 days. It will be closed if no further activity occurs. Thank you.

github-actions[bot] commented 2 weeks ago

This issue was closed due to lack of activity after being marked stale for past 7 days.

google-ml-butler[bot] commented 2 weeks ago

Are you satisfied with the resolution of your issue?