Open zdarktknight opened 6 years ago
Hi @zdarktknight
Which Numpy method are you using? It would be useful if you post your code here.
Thank you very much for your reply. I am using python 3.6 with numpy version 1.15.1. Here is my code:
import numpy as np
def return_expected_action(u, T, v):
actions_array = np.zeros(4)
for action in range(4):
#Expected utility of doing a in state s, according to T and u.
actions_array[action] = np.sum(np.multiply(u, np.dot(v, T[:,:,action])))
return np.argmax(actions_array)
def return_policy_evaluation_linalg(p, r, T, gamma):
u = np.zeros(12)
for s in range(12):
if not np.isnan(p[s]):
action = int(p[s])
u[s] = np.linalg.solve(np.identity(12) - gamma*T[:,:,action], r)[s]
return u
def linalg():
gamma = 0.999
epsilon = 0.0001
iteration = 0
T = np.load("T.npy")
p = np.random.randint(0, 4, size=(12)).astype(np.float32)
p[5] = np.NaN
p[3] = p[7] = -1
#Utility vectors
u = np.array([0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0])
#Reward vector
r = np.array([-0.04, -0.04, -0.04, +1.0,
-0.04, 0.0, -0.04, -1.0,
-0.04, -0.04, -0.04, -0.04])
while True:
iteration = iteration + 1
# 1. Policy evaluation
u_old = u.copy()
u = return_policy_evaluation_linalg(p, r, T, gamma) # old Utility, old actions
# Stoping acriteria
unchanged = True
for s in range(12):
if not np.isnan(p[s]) and not p[s] == -1:
v = np.zeros((1,12))
v[0,s] = 1.0
# Policy improvement
a = return_expected_action(u, T, v)
if a != p[s]: # update action
p[s] = a
unchanged = False
if unchanged:
break
# Numerical problem ??? cannot stop with this criteria
if iteration == 1000:
print('stop by iteration limit')
break
print("=================== FINAL RESULT ==================")
print("Iterations: " + str(iteration))
print("Gamma: " + str(gamma))
print("Epsilon: " + str(epsilon))
print("===================================================")
print(u[0:4])
print(u[4:8])
print(u[8:12])
print("===================================================")
print_policy(p, shape=(3,4))
print("===================================================")
def print_policy(p, shape):
counter = 0
policy_string = ""
for row in range(shape[0]):
for col in range(shape[1]):
if(p[counter] == -1): policy_string += " * "
elif(p[counter] == 0): policy_string += " ^ "
elif(p[counter] == 1): policy_string += " < "
elif(p[counter] == 2): policy_string += " v "
elif(p[counter] == 3): policy_string += " > "
elif(np.isnan(p[counter])): policy_string += " # "
counter += 1
policy_string += '\n'
print(policy_string)
if __name__ == "__main__":
linalg()
Hello @mpatacchiola This repo is excellent work! I learned a lot from this repo. I was facing the same issue when I ran the algebraic approach. I was hoping if there's any valid justification? I believe there's an issue with the stopping criteria as is.
To recall: I believe the code is implemented well! However, for the algebraic approach, when I looked at the pseudocode mentioned in the book ( ch.17 - Figure 17.7), it's not the same as you have it in your code.
At line 186 in your code it shows if a != p[s]:
but in the book it's if a > p[s]:
Best,
This is an excellent example. However, when I tried the linear algebra approach in the mdp post, the while loop cannot stop.