germain-hug / Deep-RL-Keras

Keras Implementation of popular Deep RL Algorithms (A3C, DDQN, DDPG, Dueling DDQN)

DQN: batch_shape = (None,) + tuple(shape) TypeError: 'int' object is not iterable #34

Open Nazanin-87 opened 2 years ago

Nazanin-87 commented 2 years ago

Hi everyone, I modified the DQN algorithm in this repository into a multi-agent DQN approach for a wireless network environment (the code is adapted from a GitHub repository). The original code works well, but when I change the environment, the following error occurs:

```
Traceback (most recent call last):
  File "D:/main -DQN.py", line 452, in <module>
    main()
  File "D:/main -DQN.py", line 432, in main
    algo = DQN( args)  # n_clusters is the action dimension in DQN
  File "D:/main -DQN.py", line 158, in __init__
    self.agent = Agent( args, self.tau)
  File "D:/main -DQN.py", line 246, in __init__
    self.model = self.network()
  File "D:/main -DQN.py", line 254, in network
    inp = Input((self.state_dim))
  File "C:\Users\AppData\Roaming\Python\Python37\site-packages\keras\engine\topology.py", line 1451, in Input
    batch_shape = (None,) + tuple(shape)
TypeError: 'int' object is not iterable
```

The complete code is as follows:

```python
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
import pandas as pd
import numpy as np
import sys
import os
import copy, json, argparse
from numpy import pi
from random import random, uniform, choices, randint, sample, randrange
import random
import math
from tqdm import tqdm
import keras.backend as K
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Dense, Flatten, Input
from collections import deque

class Environ:

    def __init__(self, args):
        self.args=args
        self.state_dim= (self.args.A, )
        self.action_dim=args.C
        self.bs = complex((500 / 2), (500/ 2))
        self.S=(np.zeros(self.args.A)).reshape(-1)

    def Location(self):
        rx = uniform(0, 500)
        ry = uniform(0, 500)
        Loc = complex(rx, ry)
        return Loc

    def PathGain(self,Loc):
        d = abs(Loc- self.bs)
        d=d  **(-3)
        u = np.random.rand(1, 1)
        sigma = 1
        x = sigma * np.sqrt(-2 * np.log(u))
        h=  d* x
        return h

    def reset(self):  # Reset the states
        s=np.zeros(self.args.A)
        return s.reshape(-1)

    def RecievePower(self,UsersLoc):
        H=self.PathGain(UsersLoc)
        UsersRecievePower=self.args.P*H
        return UsersRecievePower

    def TotalRate(self, actionRB_i,actionRB):
        interference = self.args.Noise
        Loc_i=self.Location()
        for j in range(self.args.A):
            if actionRB_i ==actionRB[j] :
                Loc_j = self.Location()
                RecievePower_j = self.RecievePower(Loc_j)
                interference = interference + RecievePower_j
            else:
                interference= interference
        RecievePower_i = self.RecievePower(Loc_i)
        SINR = interference / (interference-RecievePower_i)
        Rate =self.args.BW*( np.log2( SINR))
        return Rate

    def computeQoS(self,actionRB,actionRB_i):
        TotalRate=self.TotalRate(actionRB,actionRB_i)
        if TotalRate >=self.args.Rmin:
            QoS=1.0
        else:
            QoS=0.0
        return QoS

    def ComputeState(self,actionRB):
        QoS=np.zeros(self.args.A)
        for i in range(self.args.A):
            actionRB_i=actionRB[i]
            QoS[i] = self.computeQoS(actionRB,actionRB_i)
        S = np.zeros( self.args.A)
        for i in range(self.args.A):
            S[i]=QoS[i]
        self.S=S
        return self.S.reshape(-1)

    def Reward(self,actionRB,actionRB_i):
        Rate = np.zeros(self.args.A)
        Satisfied_Users = 0
        for i in range(self.args.A):
            Rate[i] = self.TotalRate(actionRB, actionRB_i)
            Satisfied_Users = Satisfied_Users + self.computeQoS(actionRB)
        TotalRate = 0.0
        TotalPower = self.args.circuitPower
        for i in range(self.args.A):
            TotalRate = TotalRate + Rate[i]
            TotalPower = TotalPower + self.args.P
        if Satisfied_Users == self.args.A:
            reward = TotalRate / TotalPower
        else:
            reward = self.args.negative_cost
        return reward

    def step(self,actionRB):
        next_s = self.ComputeState(actionRB)
        r = self.Reward(actionRB)
        done = False
        info = None
        return next_s, r, done, info

class Environment(object):

    def __init__(self, gym_env, action_repeat):
        self.env = gym_env
        self.timespan = action_repeat
        self.gym_actions = 2  # range(gym_env.action_space.n)
        self.state_buffer = deque()

    def get_action_size(self):
        return self.env.action_dim

    def get_state_size(self):
        return self.env.state_dim

    def reset(self):
        # Clear the state buffer
        self.state_buffer = deque()
        x_t = self.env.reset()
        s_t = np.stack([x_t for i in range(self.timespan)], axis=0)
        for i in range(self.timespan - 1):
            self.state_buffer.append(x_t)
        return s_t

    def step(self, action):
        x_t1, r_t, terminal, info = self.env.step(action)
        previous_states = np.array(self.state_buffer)
        s_t1 = np.empty((self.timespan, *self.env.state_dim))
        s_t1[:self.timespan - 1, :] = previous_states
        s_t1[self.timespan - 1] = x_t1
        # Pop the oldest frame, add the current frame to the queue
        self.state_buffer.popleft()
        self.state_buffer.append(x_t1)
        return s_t1, r_t, terminal, info

    def render(self):
        return self.env.render()

class DQN:

    def __init__(self, args):
        # Environment and DQN parameters
        self.args=args
        self.action_dim = self.args.C
        self.state_dim = self.args.A
        self.buffer_size = self.args.capacity
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size)
        self.epsilon=self.args.eps
        self.tau = 1.0
        self.agent = Agent( args, self.tau)

    def policy_action(self, s):
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self):
        # Sample experience from memory buffer
        s, a, r, d, new_s, idx = self.buffer.sample_batch(self.batch_size)
        # Apply Bellman Equation on batch samples to train our DQN
        q  = self.agent.predict(s)
        next_q  = self.agent.predict(new_s)
        q_targ  = self.agent.target_predict(new_s)
        for i in range(s.shape[0]):
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.args.gamma * q_targ[i, next_best_action]
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.args.eps_decay

    def train(self, env, args, summary_writer):
        results = []
        tqdm_e = tqdm(range(self.args.nepisodes), desc='Score', leave=True, unit=" episodes")
        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()

            while not done:
                # if args.render:
                #     env.render()
                # Actor picks an action (following the policy)
                a=[]
                for i in range(self.args.A):
                    a[i]= self.policy_action(old_state)
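                    # NOTE: a is an empty list here, so indexed assignment a[i] = ...
                    # would raise an IndexError; a.append(...) is presumably what was
                    # intended once the Input() error below is resolved.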

                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if(self.buffer.size() > args.batch_size):
                    self.train_agent(self.args.batch_size)
                    self.agent.transfer_weights()
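                    # NOTE: train_agent() is defined above without a batch_size parameter
                    # and reads self.batch_size, which is never set in DQN.__init__, so
                    # this call would also need adjusting.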
            # Gather stats every episode for plotting
            if(args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def memorize(self, state, action, reward, done, new_state):
        self.buffer.memorize(state, action, reward, done, new_state)

    def save_weights(self, path):
        path += '_LR_{}'.format(self.args.learningrate)
        self.agent.save(path)

    def load_weights(self, path):
        self.agent.load_weights(path)

class Agent:

    def __init__(self, args, tau):
        self.args=args
        self.state_dim = self.args.A
        self.action_dim = self.args.C
        self.tau = tau
        self.lr=self.args.learningrate
        # Initialize Deep Q-Network
        self.model = self.network()
        self.model.compile(Adam(self.lr), 'mse')
        # Build target Q-Network
        self.target_model = self.network()
        self.target_model.compile(Adam(self.lr), 'mse')
        self.target_model.set_weights(self.model.get_weights())

    def network(self):
        inp = Input((self.state_dim))
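        # NOTE: this is the line from the traceback. Keras' Input() expects a shape
        # tuple (it builds batch_shape = (None,) + tuple(shape)), but self.state_dim
        # is set to the plain int args.A in Agent.__init__, so tuple(shape) raises
        # "TypeError: 'int' object is not iterable". A minimal sketch of a fix,
        # assuming a flat state vector of length A, would be to pass a tuple instead:
        #     inp = Input(shape=(self.state_dim,))
        # (the len(self.state_dim) checks below also assume a tuple-valued state_dim).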

        if(len(self.state_dim) > 2):
            inp = Input((self.state_dim[1:]))
            x = conv_block(inp, 32, (2, 2), 8)
            x = conv_block(x, 64, (2, 2), 4)
            x = conv_block(x, 64, (2, 2), 3)
            x = Flatten()(x)
            x = Dense(256, activation='relu')(x)
        else:
            x = Flatten()(inp)
            x = Dense(64, activation='relu')(x)
            x = Dense(64, activation='relu')(x)

        x = Dense(self.action_dim, activation='linear')(x)
        return Model(inp, x)

    def transfer_weights(self):
        W = self.model.get_weights()
        tgt_W = self.target_model.get_weights()
        for i in range(len(W)):
            # updated based on Polyak averaging method
            tgt_W[i] = self.tau * W[i] + (1 - self.tau) * tgt_W[i]
        self.target_model.set_weights(tgt_W)

    def fit(self, inp, targ):
        self.model.fit(self.reshape(inp), targ, epochs=1, verbose=0)

    def predict(self, inp):
        return self.model.predict(self.reshape(inp))

    def target_predict(self, inp):
        return self.target_model.predict(self.reshape(inp))

    def reshape(self, x):
        if len(x.shape) < 4 and len(self.state_dim) > 2:
            return np.expand_dims(x, axis=-1)
        elif len(x.shape) < 3:
            return np.expand_dims(x, axis=-1)
        else:
            return x

    def save(self, path):
        self.model.save_weights(path + '.h5')

    def load_weights(self, path):
        self.model.load_weights(path)

class MemoryBuffer(object):

    def __init__(self, buffer_size):
        # Standard Buffer
        self.buffer = deque()
        self.count = 0
        self.buffer_size = buffer_size

    def memorize(self, state, action, reward, done, new_state):
        experience = (state, action, reward, done, new_state)
        # Check if buffer is already full
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        batch = []
        if self.count < batch_size:
            idx = None
            batch = random.sample(self.buffer, self.count)
        else:
            idx = None
            batch = random.sample(self.buffer, batch_size)

        # Return a batch of experience
        s_batch = np.array([i[0] for i in batch])
        a_batch = np.array([i[1] for i in batch])
        r_batch = np.array([i[2] for i in batch])
        d_batch = np.array([i[3] for i in batch])
        new_s_batch = np.array([i[4] for i in batch])
        return s_batch, a_batch, r_batch, d_batch, new_s_batch, idx

    def update(self, idx):
        self.buffer.update(idx)

    def clear(self):
        self.buffer = deque()
        self.count = 0

def get_session():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    return tf.Session(config=config)

def tfSummary(tag, val):
    return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)])

def gather_stats(agent, env):
    score = []
    for k in range(10):
        old_state = env.reset()
        cumul_r, done = 0, False
        while not done:
            a = agent.policy_action(old_state)
            old_state, r, done, _ = env.step(a)
            cumul_r += r
        score.append(cumul_r)
    return np.mean(np.array(score)), np.std(np.array(score))

def conv_block(inp, d=3, pool_size=(2, 2), k=3):
    conv = conv_layer(d, k)(inp)
    return MaxPooling2D(pool_size=pool_size)(conv)
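# NOTE: conv_layer is not defined in this script and MaxPooling2D is not imported
# (it lives in keras.layers); both presumably come from the original repository's
# network utilities and are only reached for image-like (len(state_dim) > 2) states.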

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

def parse_args(args):
    parser = argparse.ArgumentParser(description='Training parameters')
    # parser.add_argument('--out_dir', type=str, default='experiments', help="Name of the output directory")
    parser.add_argument('--consecutive_frames', type=int, default=2, help="Number of consecutive frames (action repeat)")
    parser.add_argument('--gather_stats', dest='gather_stats', action='store_true', help="Compute Average reward per episode (slower)")
    parser.add_argument('--A', type=int, default='10', help="The number of agents")
    parser.add_argument('--C', type=int, default='30', help="The number of Resources")
    parser.add_argument('--Noise', type=float, default='0.00000000000001', help="The background noise")
    parser.add_argument('--BW', type=int, default='180000', help="The bandwidth")
    parser.add_argument('--Rmin', type=int, default='1000000', help="Agents' QoS")
    parser.add_argument('--P', type=float, default='0.01', help="Agents' transmit power")
    parser.add_argument('--circuitPower', type=float, default='0.05', help="The circuit Power")
    parser.add_argument('--negative_cost', type=float, default='-1.0', help="The negative cost")
    parser.add_argument('--capacity', type=int, default='500', help="Capacity of Replay Buffer")
    parser.add_argument('--learningrate', type=float, default='0.01', help="The learning rate")
    parser.add_argument('--eps', type=float, default='0.8', help="The epsilon")
    parser.add_argument('--eps_decay', type=float, default='0.99', help="The epsilon decay")
    parser.add_argument('--eps_increment', type=float, default='0.003', help="The epsilon increment")
    parser.add_argument('--batch_size', type=int, default='8', help="The batch size")
    parser.add_argument('--gamma', type=float, default='0.99', help="The discount factor")
    parser.add_argument('--nepisodes', type=int, default='500', help="The number of episodes")
    parser.add_argument('--nsteps', type=int, default='500', help="The number of steps")
    parser.add_argument('--env', type=str, default='Environ', help="Wireless environment")
    parser.add_argument('--gpu', type=str, default="", help='GPU ID')

    args = parser.parse_args(args)

    parser.set_defaults(render=False)
    return args

def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)
    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    set_session(get_session())

    summary_writer = tf.summary.FileWriter("/tensorboard_" + args.env)
    # Initialize the wireless environment
    users_env = Environ(args)
    # print(users_env)

    # Wrap the environment to use consecutive frames
    env = Environment(users_env, args.consecutive_frames)
    env.reset()

    # Define parameters for the DDQN and DDPG algorithms
    state_dim = env.get_state_size()
    action_dim = users_env.action_dim
    # The maximum and minimum values for precoding vectors
    # act_range = 1
    # act_min = 0

    # Initialize the DQN algorithm for the clustering optimization
    algo = DQN( args)  # n_clusters is the action dimension in DQN
    # if args.step == "train":
    #     Train
    stats = algo.train(env, args, summary_writer)
    # Export results to CSV
    if(args.gather_stats):
        df = pd.DataFrame(np.array(stats))
        df.to_csv(args.out_dir + "/logs.csv", header=['Episode', 'Mean', 'Stddev'], float_format='%10.5f')
        # df.to_csv(args.type + "/logs.csv", header=['Episode', 'Mean', 'Stddev'], float_format='%10.5f')

    # Save weights and close environments
    exp_dir = '{}/models_A_{}_C_{}_Rmin_{}/'.format(args.out_dir, args.A, args.C, args.Rmin)
    # exp_dir = '{}/models/'.format(args.type)
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    # Save DDQN
    export_path = '{}_{}_NB_EP_{}_BS_{}'.format(exp_dir, "DQN", args.nepisodes, args.batch_size)
    algo.save_weights(export_path)

if __name__ == "__main__":
    main()
```

Thanks in advance for your help.