[[0, 0.25, 0.75],
[0.25, 0, 0.75],
[0.25, 0.25, 0.5]]]) # A * S * S
# Reward table; per the trailing comment it is laid out as states x actions (S * A),
# i.e. 3 rows (states) and 2 columns (actions).
R = np.array([[0.55, 0.75], [1, 0.8], [1.2, 1]]) # S * A
# Build the policy-iteration solver from the transition array P and rewards R.
# NOTE(review): in pymdptoolbox the third positional argument is presumably the
# discount factor and the fourth the initial policy (one action index per state)
# -- confirm against the installed version's signature.
# NOTE(review): if that is right, a discount this close to 0 gives future rewards
# almost no weight, which is the opposite of "no discounting" -- verify intent.
pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.000000001, [0, 0, 0])
# Run the solver, then show the policy it stored on the object.
pi.run()
print(pi.policy)
I actually want to solve this problem without discounting or, if that is not possible, with a very small discount. However, whether I use 1, 0.000000001 or 0.999999 as the discount value, I get the same result, and it does not match the (correct) results we obtained when we solved this exercise in my class. What am I doing wrong?
I tried to use this library for policy iteration, with the following code:
# Transition probabilities, laid out as actions x states x next-states (A * S * S):
# one 3x3 row-stochastic matrix per action.
P = np.array([
    # action 0
    [[0.25, 0.25, 0.5], [0.75, 0, 0.25], [0.5, 0.5, 0]],
    # action 1
    [[0, 0.25, 0.75], [0.25, 0, 0.75], [0.25, 0.25, 0.5]],
]) # A * S * S
# Reward table, laid out as states x actions (S * A).
R = np.array([[0.55, 0.75], [1, 0.8], [1.2, 1]]) # S * A
# Build the policy-iteration solver from P and R.
# NOTE(review): the third positional argument is presumably the discount factor
# and the fourth the initial policy -- confirm against the pymdptoolbox signature.
pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.000000001, [0, 0, 0])
# Run the solver, then show the resulting policy.
pi.run()
print(pi.policy)
I actually want to solve this problem without discounting or, if that is not possible, with a very small discount. However, whether I use 1, 0.000000001 or 0.999999 as the discount value, I get the same result, and it does not match the (correct) results we obtained when we solved this exercise in my class. What am I doing wrong?