ShangtongZhang / reinforcement-learning-an-introduction

Python Implementation of Reinforcement Learning: An Introduction

Unable to get the same results while formulating differently #134

Closed — rohitdavas closed this issue 3 years ago

rohitdavas commented 3 years ago

I have been trying to write my own version of the code for Figure 2.2, but I cannot reproduce the expected graph.

Your code works fine, of course. Using it as a reference, I tried to formulate the problem as an explicit environment–agent interaction, rather than framing both in a single class as your code does.

Can you see what I should improve? I am not able to find the mistake.

# plotting support
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

class Agent:
    '''Agent for the bandit environment.

    The agent has two responsibilities:
        - choosing the next action
        - updating the action-value estimates
    '''

    def __init__(self, epsilon=0, n_actions=10):
        self.epsilon = epsilon
        self.q = np.zeros(n_actions)          # action-value estimates
        self.q_steps = np.zeros_like(self.q)  # per-action selection counts
        self.n_actions = n_actions

    def action(self):
        # draw from a standard normal and compare to epsilon
        if np.random.randn() < self.epsilon:
            return np.random.choice(self.n_actions)
        else:
            # exploit: break ties among greedy actions at random
            q_best = np.max(self.q)
            return np.random.choice(np.where(self.q == q_best)[0])

    def _sampleAverage(self, action, reward):
        # incremental sample-average update; q_steps[action] was already
        # incremented in update(), so step_size >= 1
        step_size = self.q_steps[action]
        self.q[action] += (reward - self.q[action]) / step_size

    def reset(self):
        self.q = np.zeros(self.n_actions)
        self.q_steps = np.zeros_like(self.q)

    def update(self, action, reward):
        self.q_steps[action] += 1
        self._sampleAverage(action, reward)

class Env:
    '''A k-armed bandit environment.

    The environment has the following behaviour:
        - given an action, it returns a reward
        - there is no observation in the bandit setting
    '''

    def __init__(self, n_arm=10):
        self.n = n_arm

    def _init_reward(self):
        # the true action values q_star are sampled from a standard normal;
        # each reward is then drawn from N(q_star[action], 1)
        self.q_star = np.random.randn(self.n)

    def reward(self, action, var=1):
        # reward ~ N(q_star[action], 1) for the default var = 1
        return var * np.random.randn() + self.q_star[action]

    def reset(self):
        self._init_reward()
        self.best_action = np.argmax(self.q_star)

    def step(self, action):
        return self.reward(action)

# ------------------------------------------
# main function to simulate the behavior
# ------------------------------------------

def single_run(bandit_env, my_agent, steps=1000):
    my_agent.reset()
    bandit_env.reset()
    reward_series = np.zeros(steps)

    for i in range(steps):
        action = my_agent.action()
        reward = bandit_env.step(action)
        reward_series[i] = reward
        my_agent.update(action, reward)

    return reward_series

def q2(runs=2000):
    epsilons = [0, 0.1, 0.01]
    plt.figure(figsize=(10, 20))
    plt.xlabel("Steps")
    plt.ylabel("Average reward")

    for e in epsilons:
        reward_series = []
        my_agent = Agent(epsilon=e, n_actions=10)
        bandit_env = Env(n_arm=10)

        for _ in tqdm(range(runs)):
            reward_series.append(single_run(bandit_env, my_agent, steps=1000))
        plt.plot(np.mean(reward_series, axis=0), label="epsilon " + str(e))

    plt.legend()
    plt.savefig("./images/q2.png")
    plt.close()

if __name__ == "__main__":
    q2()

Here is the graph I am getting; the maximum average reward never crosses 0.8:

[figure: q2.png — average reward vs. steps for epsilon = 0, 0.1, 0.01]

rohitdavas commented 3 years ago

I figured it out. Thanks, closing it.
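
For anyone who lands here with the same symptom: the likely culprit is the exploration test in `Agent.action`. `np.random.randn()` draws from a standard normal, which is negative about half the time, so `randn() < epsilon` triggers exploration roughly 50% of the time even when `epsilon = 0`. Standard ε-greedy instead compares a uniform draw on [0, 1) against ε. A minimal sketch of that one-line fix, assuming the rest of the posted code stays unchanged:

def action(self):
    # explore with probability epsilon: rand() is uniform on [0, 1),
    # so this branch is taken with probability exactly epsilon
    if np.random.rand() < self.epsilon:
        return np.random.choice(self.n_actions)
    # exploit: break ties among the greedy actions at random
    q_best = np.max(self.q)
    return np.random.choice(np.where(self.q == q_best)[0])

With uniform exploration draws, the averaged curves should recover the familiar shape of Figure 2.2, with the ε = 0.1 run climbing well above 1 rather than plateauing below 0.8.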