adolfocorreia / DGM

Deep Galerkin Method

Deep Galerkin Network #3

Open · ghost opened this issue 3 years ago

ghost commented 3 years ago

I am trying to use the Deep Galerkin Method (DGM) to solve high-dimensional PDEs and I have run into a problem. For illustrative purposes, I am posting a simple optimization problem below. The feed-forward network successfully recovers the optimal function, but the DGM network fails to do so. Any help is highly appreciated.

import logging, os 
os.system('clear')
logging.disable(logging.WARNING) 
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import numpy as np 
import pandas as pd 
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# CLASS DEFINITIONS FOR NEURAL NETWORKS USED IN DEEP GALERKIN METHOD

#%% import needed packages
import tensorflow as tf

#%% LSTM-like layer used in DGM (see Figure 5.3 and set of equations on p. 45) - modification of Keras layer class
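# As implemented in call() below, the layer computes (⊙ denotes the elementwise product):
#   Z = trans1(X·Uz + S·Wz + bz)
#   G = trans1(X·Ug + S·Wg + bg)
#   R = trans1(X·Ur + S·Wr + br)
#   H = trans2(X·Uh + (S ⊙ R)·Wh + bh)
#   S_new = (1 - G) ⊙ H + Z ⊙ S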

class LSTMLayer(tf.keras.layers.Layer):

    # constructor/initializer function (automatically called when new instance of class is created)
    def __init__(self, output_dim, input_dim, trans1 = "tanh", trans2 = "tanh"):
        '''
        Args:
            input_dim (int):       dimensionality of input data
            output_dim (int):      number of outputs for LSTM layers
            trans1, trans2 (str):  activation functions used inside the layer; 
                                   one of: "tanh" (default), "relu" or "sigmoid"

        Returns: customized Keras layer object used as intermediate layers in DGM
        '''        

        # create an instance of a Layer object (call initialize function of superclass of LSTMLayer)
        super(LSTMLayer, self).__init__()

        # add properties for layer including activation functions used inside the layer  
        self.output_dim = output_dim
        self.input_dim = input_dim

        if trans1 == "tanh":
            self.trans1 = tf.nn.tanh
        elif trans1 == "relu":
            self.trans1 = tf.nn.relu
        elif trans1 == "sigmoid":
            self.trans1 = tf.nn.sigmoid

        if trans2 == "tanh":
            self.trans2 = tf.nn.tanh
        elif trans2 == "relu":
            self.trans2 = tf.nn.relu
        elif trans2 == "sigmoid":
            self.trans2 = tf.nn.sigmoid

        ### define LSTM layer parameters (use Xavier initialization)
        # u vectors (weighting vectors for inputs original inputs x)
        self.Uz = self.add_variable("Uz", shape=[self.input_dim, self.output_dim],
                                    initializer = tf.contrib.layers.xavier_initializer())
        self.Ug = self.add_variable("Ug", shape=[self.input_dim ,self.output_dim],
                                    initializer = tf.contrib.layers.xavier_initializer())
        self.Ur = self.add_variable("Ur", shape=[self.input_dim, self.output_dim],
                                    initializer = tf.contrib.layers.xavier_initializer())
        self.Uh = self.add_variable("Uh", shape=[self.input_dim, self.output_dim],
                                    initializer = tf.contrib.layers.xavier_initializer())

        # w vectors (weighting vectors for output of previous layer)        
        self.Wz = self.add_variable("Wz", shape=[self.output_dim, self.output_dim],
                                    initializer = tf.contrib.layers.xavier_initializer())
        self.Wg = self.add_variable("Wg", shape=[self.output_dim, self.output_dim],
                                    initializer = tf.contrib.layers.xavier_initializer())
        self.Wr = self.add_variable("Wr", shape=[self.output_dim, self.output_dim],
                                    initializer = tf.contrib.layers.xavier_initializer())
        self.Wh = self.add_variable("Wh", shape=[self.output_dim, self.output_dim],
                                    initializer = tf.contrib.layers.xavier_initializer())

        # bias vectors
        self.bz = self.add_variable("bz", shape=[1, self.output_dim])
        self.bg = self.add_variable("bg", shape=[1, self.output_dim])
        self.br = self.add_variable("br", shape=[1, self.output_dim])
        self.bh = self.add_variable("bh", shape=[1, self.output_dim])

    # main function to be called 
    def call(self, S, X):
        '''Compute the output of an LSTMLayer for given inputs S and X.

        Args:            
            S: output of previous layer
            X: data input

        Returns: S_new, the updated layer output (tensor)
        '''   

        # compute components of LSTM layer output (note H uses a separate activation function)
        Z = self.trans1(tf.add(tf.add(tf.matmul(X,self.Uz), tf.matmul(S,self.Wz)), self.bz))
        G = self.trans1(tf.add(tf.add(tf.matmul(X,self.Ug), tf.matmul(S, self.Wg)), self.bg))
        R = self.trans1(tf.add(tf.add(tf.matmul(X,self.Ur), tf.matmul(S, self.Wr)), self.br))

        H = self.trans2(tf.add(tf.add(tf.matmul(X,self.Uh), tf.matmul(tf.multiply(S, R), self.Wh)), self.bh))

        # compute LSTM layer output
        S_new = tf.add(tf.multiply(tf.subtract(tf.ones_like(G), G), H), tf.multiply(Z,S))

        return S_new

#%% Fully connected (dense) layer - modification of Keras layer class

class DenseLayer(tf.keras.layers.Layer):

    # constructor/initializer function (automatically called when new instance of class is created)
    def __init__(self, output_dim, input_dim, transformation=None):
        '''
        Args:
            input_dim:       dimensionality of input data
            output_dim:      number of outputs for dense layer
            transformation:  activation function used inside the layer; using
                             None is equivalent to the identity map 

        Returns: customized Keras (fully connected) layer object 
        '''        

        # create an instance of a Layer object (call initialize function of superclass of DenseLayer)
        super(DenseLayer,self).__init__()
        self.output_dim = output_dim
        self.input_dim = input_dim

        ### define dense layer parameters (use Xavier initialization)
        # w vectors (weighting vectors for output of previous layer)
        self.W = self.add_variable("W", shape=[self.input_dim, self.output_dim],
                                   initializer = tf.contrib.layers.xavier_initializer())

        # bias vectors
        self.b = self.add_variable("b", shape=[1, self.output_dim])

        if transformation:
            if transformation == "tanh":
                self.transformation = tf.tanh
            elif transformation == "relu":
                self.transformation = tf.nn.relu
        else:
            self.transformation = transformation

    # main function to be called 
    def call(self,X):
        '''Compute output of a dense layer for a given input X 

        Args:                        
            X: input to layer            
        '''

        # compute dense layer output
        S = tf.add(tf.matmul(X, self.W), self.b)

        if self.transformation:
            S = self.transformation(S)

        return S

#%% Neural network architecture used in DGM - modification of Keras Model class

class DGMNet(tf.keras.Model):

    # constructor/initializer function (automatically called when new instance of class is created)
    def __init__(self, layer_width, n_layers, input_dim, final_trans=None):
        '''
        Args:
            layer_width: number of neurons in each layer
            n_layers:    number of intermediate LSTM layers
            input_dim:   spatial dimension of the input data
            final_trans: transformation used in final layer

        Returns: customized Keras model object representing DGM neural network
        '''  

        # create an instance of a Model object (call initialize function of superclass of DGMNet)
        super(DGMNet,self).__init__()

        # define initial layer as fully connected 
        # NOTE: this version takes only the space input x (no separate time input), so the input dimensionality is just input_dim
        self.initial_layer = DenseLayer(layer_width, input_dim, transformation = "tanh")

        # define intermediate LSTM layers
        self.n_layers = n_layers
        self.LSTMLayerList = []

        for _ in range(self.n_layers):
            self.LSTMLayerList.append(LSTMLayer(layer_width, input_dim))

        # define final layer as fully connected with a single output (function value)
        self.final_layer = DenseLayer(1, layer_width, transformation = final_trans)

    # main function to be called  
    def call(self,x):
        '''Run the DGM model and obtain the fitted function value at the inputs x.

        Args:
            x: sampled space inputs
        '''

        # define the input vector (space inputs only in this version)
        X = tf.concat([x],1)

        # call initial layer
        S = self.initial_layer.call(X)

        # call intermediate LSTM layers
        for i in range(self.n_layers):
            S = self.LSTMLayerList[i].call(S,X)

        # call the final dense layer
        result = self.final_layer.call(S)

        return result

#%% main class

class check():
        def __init__(self,v,x,layers,learning_rate,adam_iter,params):
            self.params=params
            self.v=v
            self.x=x
            self.learning_rate = learning_rate
            self.adam_iter = adam_iter
            self.lb = np.array([self.x[0][0]])
            self.ub = np.array([self.x[-1][0]])

            self.sess = tf.Session(config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = True))
            self.x_tf = tf.placeholder(tf.float32, shape=[None,self.x.shape[1]])
            self.v_tf = tf.placeholder(tf.float32, shape=[None,self.v.shape[1]])
            self.x_u_tf = tf.placeholder(tf.float32, shape=[None,self.x.shape[1]])
            self.v_u_tf = tf.placeholder(tf.float32, shape=[None,self.v.shape[1]])

            self.weights_v,self.biases_v = self.initialize_nn(layers)
            self.weights_i,self.biases_i = self.initialize_nn(layers)

            with tf.variable_scope("control",reuse=True):
                self.i_pred = self.net_i(self.x_tf)

            with tf.variable_scope("value",reuse=True):
                self.v_pred = self.net_v(self.x_tf)

            self.error_i = self.policy_error(self.x_tf)

            self.loss_v = tf.math.reduce_max(tf.abs(self.v_pred-self.v_tf))

            self.loss =  tf.math.reduce_max(tf.abs(self.v_pred-self.v_tf)) + tf.reduce_mean(tf.square(self.error_i))

            self.optimizer_Adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_op_Adam = self.optimizer_Adam.minimize(self.loss)
            self.optimizer_Adam_v = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_op_Adam_v = self.optimizer_Adam_v.minimize(self.loss_v)

            init = tf.global_variables_initializer()
            self.sess.run(init)

        def policy_error(self,x):
            # optimality condition for the control: the objective l(i) = v(x+i) - i*x^2
            # is maximized over i, so the residual is dl/di, which should vanish at the optimum
            i = self.net_i(x)
            v_ = self.net_v(x+i)
            l = v_ - i*x**2
            error_i = tf.gradients(l, i)[0]
            return error_i

        def initialize_nn(self,layers):
                weights = []
                biases = []
                num_layers = len(layers)
                for l in range(num_layers-1):
                    W = self.xavier_init(size = [layers[l],layers[l+1]])
                    b = tf.Variable(tf.zeros([1,layers[l+1]], dtype=tf.float32), dtype = tf.float32)
                    weights.append(W)
                    biases.append(b)

                return weights,biases            
        def xavier_init(self,size):
                in_dim = size[0]
                out_dim = size[1]
                xavier_stddev = np.sqrt(2/(in_dim + out_dim))
                try:
                    val = tf.Variable(tf.random.truncated_normal([in_dim,out_dim], stddev = xavier_stddev), dtype = tf.float32)
                except AttributeError:  # older TF versions expose truncated_normal at the top level
                    val = tf.Variable(tf.truncated_normal([in_dim,out_dim], stddev = xavier_stddev), dtype = tf.float32)
                return val

        def neural_net(self,X,weights,biases):
                num_layers = len(weights) +1
                H = 2.0*(X - self.lb)/(self.ub - self.lb) -1
                #H=X
                for l in range(num_layers-2):
                    W = weights[l]
                    b = biases[l]
                    H = tf.tanh(tf.add(tf.matmul(H,W),b))
                W = weights[-1]
                b = biases[-1]
                Y = tf.add(tf.matmul(H,W),b)

                return Y
        def net_v(self,eta):
                if self.params['DGM']==True:
                    model_v = DGMNet(self.params['neurons_per_layer'],self.params['num_layers'],1)
                    v_u = model_v(eta)

                else:
                    X = tf.concat([eta],1)
                    v_u = self.neural_net(X,self.weights_v,self.biases_v)
                return v_u
        def net_i(self,eta):
                if self.params['DGM']==True:
                    model_i = DGMNet(self.params['neurons_per_layer'],self.params['num_layers'],1)
                    i_u = model_i(eta)

                else:
                    X = tf.concat([eta],1)
                    i_u = self.neural_net(X,self.weights_i,self.biases_i)
                return i_u 
        def callback(self,loss):    
                print('Loss: ',loss)

        def train(self):
            #K.clear_session()

                start_time = time.time()
                if True: #set this to true if you want adam to run 
                    tf_dict = {self.v_tf:self.v, self.x_tf:self.x}
                    for it in range(self.adam_iter):
                        self.sess.run(self.train_op_Adam_v, tf_dict)
                        # Print
                        if it % 1000 == 0:
                            elapsed = time.time() - start_time
                            loss_value = self.sess.run(self.loss_v, tf_dict)
                            print('It: %d, Loss: %.3e, Time: %.2f' % 
                                  (it, loss_value, elapsed))
                            start_time = time.time()

                    start_time = time.time()

                if True: #set this to true if you want adam to run 
                    tf_dict = {self.v_tf:self.v, self.x_tf:self.x}
                    for it in range(self.adam_iter):
                        self.sess.run(self.train_op_Adam, tf_dict)
                        # Print
                        if it % 1000 == 0:
                            elapsed = time.time() - start_time
                            loss_value = self.sess.run(self.loss, tf_dict)
                            print('It: %d, Loss: %.3e, Time: %.2f' % 
                                  (it, loss_value, elapsed))
                            start_time = time.time()

                    start_time = time.time()

        def predict(self,X_star):
            i_star = self.sess.run(self.i_pred,{self.x_tf: X_star[:,0:1]})            
            v_star = self.sess.run(self.v_pred,{self.x_tf: X_star[:,0:1]}) 
            error = self.sess.run(self.error_i,{self.x_tf: X_star[:,0:1]})
            tf.reset_default_graph()
            return i_star,v_star,error

#%%
if __name__=="__main__":
        params={'DGM':True,'neurons_per_layer':50,'num_layers':4}
        x=np.linspace(-1,1,100).reshape(-1,1).astype(np.float32)
        v=(10 - x**2).reshape(-1,1).astype(np.float32)
        # architecture for the feed-forward network
        layers = [1, 10, 1]
        learning_rate = 0.001
        adam_iter = 5000

        run = check(v,x,layers,learning_rate,adam_iter,params)
        run.train()

        i_star,v_star,error=run.predict(x)

The problem is to find the optimal function i that maximizes v = 10 - (x+i)^2 - i*x^2, where x is the state variable; that is, the optimal function i depends on x. If I set 'DGM' to False in the parameter dictionary and run the code (in this case the functions are coded as plain feed-forward networks), I get the right solution, the correct analytical solution being i_star = 0.5*(-2x - x^2). If I set 'DGM' to True, the solution is incorrect. I tried different numbers of layers and numbers of neurons per layer, but DGM always gives an incorrect solution.
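
For completeness, the analytical optimum follows from the first-order condition d/di [10 - (x+i)^2 - i*x^2] = -2(x+i) - x^2 = 0, i.e. i_star = -x - x^2/2 = 0.5*(-2x - x^2). A quick brute-force check of this (a minimal sketch, independent of the networks above; the grid names are just illustrative):

import numpy as np

# for each x, maximize l(i) = 10 - (x + i)**2 - i*x**2 over a fine grid of i
x_grid = np.linspace(-1, 1, 5)
i_grid = np.linspace(-3, 3, 200001)
for x0 in x_grid:
    l = 10 - (x0 + i_grid)**2 - i_grid * x0**2
    i_best = i_grid[np.argmax(l)]
    i_analytic = 0.5 * (-2*x0 - x0**2)
    print(x0, i_best, i_analytic)  # i_best should agree with i_analytic to grid precision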

Am I doing something wrong? Many thanks.

adolfocorreia commented 3 years ago

As I said on the other issue, it's been a while since I last worked on this code and I've moved on to other projects. If you have any specific questions on this DGM implementation I might try to answer them, but I won't try to debug your code. I simply don't have the spare time to do so. As a suggestion for your problem, begin with some code base that does work (e.g. merton.py) and go from there adjusting to your case. Also, feel free to rewrite the DGM net yourself since you might learn a lot doing so.