**Closed** · RobertTLange closed this issue 2 years ago.
Implement the classic four rooms environment. Start from the old NumPy implementation from the HRL MSc thesis:
```python
import copy

import numpy as np

# Action definitions
RIGHT = 0
UP = 1
LEFT = 2
DOWN = 3


class RoomWorld:
    """The environment for Sutton's semi-MDP HRL (four rooms)."""

    def __init__(self, goal_position=[7, 9], env_noise=0.1):
        """Map of the rooms: -1 indicates a wall, 0 indicates a hallway,
        positive numbers indicate numbered rooms.
        """
        self.numbered_map = np.array([
            [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1],
            [-1, 1, 1, 1, 1, 1,-1, 2, 2, 2, 2, 2,-1],
            [-1, 1, 1, 1, 1, 1,-1, 2, 2, 2, 2, 2,-1],
            [-1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2,-1],
            [-1, 1, 1, 1, 1, 1,-1, 2, 2, 2, 2, 2,-1],
            [-1, 1, 1, 1, 1, 1,-1, 2, 2, 2, 2, 2,-1],
            [-1,-1, 0,-1,-1,-1,-1, 2, 2, 2, 2, 2,-1],
            [-1, 3, 3, 3, 3, 3,-1,-1,-1, 0,-1,-1,-1],
            [-1, 3, 3, 3, 3, 3,-1, 4, 4, 4, 4, 4,-1],
            [-1, 3, 3, 3, 3, 3,-1, 4, 4, 4, 4, 4,-1],
            [-1, 3, 3, 3, 3, 3, 0, 4, 4, 4, 4, 4,-1],
            [-1, 3, 3, 3, 3, 3,-1, 4, 4, 4, 4, 4,-1],
            [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]])
        # 1 = walkable cell, 0 = wall
        self.walkability_map = np.array([
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
            [0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
            [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
            [0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
            [0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
            [0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0],
            [0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0],
            [0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
            [0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
            [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
            [0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
        self.state_space = np.argwhere(self.walkability_map)
        self.action_space = np.arange(4)
        self.goal_position = np.array(goal_position)
        self.action_success_rate = 1 - env_noise
        # Agents affect each other's observations, so they live in the env.
        self.agents = []
        # Rewards
        self.step_reward = 0.0  # was -0.1 (Sutton used 0 and depended on
                                # the discounting effect of gamma to push
                                # toward more efficient policies)
        self.collision_reward = 0.0  # was -0.1 at first, but spending a
                                     # timestep without moving is a penalty
        self.goal_reward = 1.0  # was 10.0
        self.invalid_plan_reward = 0.0  # was -10.0

    def add_agent(self, agent):
        """Adds an agent to the environment after giving it an identifier."""
        agent.sebango = len(self.agents) + 2
        self.agents.append(agent)

    def move_agent(self, direction, sebango=2):
        """Attempts to move an agent in the specified direction. If the move
        would put the agent in a wall, the agent stays where it is and the
        step is flagged as a collision (penalized via collision_reward).
        """
        agent = self.agents[sebango - 2]
        new_pos = agent.move(direction)
        if self.walkability_map[tuple(new_pos)]:
            agent.set_position(new_pos)
            collision = False
        else:
            collision = True
        return collision

    def evaluate_reward(self, sebango=2, collision=False):
        """Calculates the reward for the current timestep after an action
        has been taken.
        """
        agent = self.agents[sebango - 2]
        reward = self.step_reward
        done = False
        if collision:
            reward += self.collision_reward
        if (agent.get_position() == self.goal_position).all():
            reward += self.goal_reward
            done = True
        return reward, done

    def get_observation_map(self):
        """Returns the observation of the current state as a walkability map
        with agents (sebango) and the goal position (-1) labeled.
        """
        obs = copy.copy(self.walkability_map)
        for ag in self.agents:
            obs[tuple(ag.get_position())] = ag.sebango
        obs[tuple(self.goal_position)] = -1
        return obs

    def get_observation_pos(self, sebango):
        """Returns the observation of the current state as the position of
        the agent indicated by sebango. Assumes a single agent and a static
        goal location, so only the agent position is needed.
        """
        return self.agents[sebango - 2].get_position()

    def step(self, direction, sebango=2):
        """Takes one timestep with a specific direction. Only deals with
        primitive actions. The actual direction of motion is determined
        stochastically: with probability action_success_rate the intended
        direction is taken, otherwise one of the other three directions is
        chosen uniformly at random. Returns observation, reward, and done.
        The observation here is the agent position; in the map observation
        the agent is indicated by its sebango (a number >= 2) and the goal
        by -1.
        """
        roll = np.random.random()
        sr = self.action_success_rate
        fr = 1.0 - sr
        if roll <= sr:
            coll = self.move_agent(direction, sebango)
        elif roll <= sr + fr / 3.0:
            coll = self.move_agent((direction + 1) % 4, sebango)
        elif roll <= sr + fr * 2.0 / 3.0:
            coll = self.move_agent((direction + 2) % 4, sebango)
        else:
            coll = self.move_agent((direction + 3) % 4, sebango)
        obs = self.get_observation_pos(sebango)  # was hardcoded to sebango 2
        reward, done = self.evaluate_reward(sebango, collision=coll)
        return obs, reward, done

    def reset(self, random_placement=False):
        """Resets the state of the world, putting all registered agents back
        to their initial positions (set at instantiation), unless
        random_placement=True.
        """
        if random_placement:
            random_index = np.random.randint(
                low=0, high=self.state_space.shape[0], size=len(self.agents))
            for i, ag in enumerate(self.agents):
                ag.set_position(self.state_space[random_index[i]])
        else:
            for ag in self.agents:
                ag.set_position(ag.initial_position)
        obs = self.get_observation_pos(2)  # CURRENTLY ASSUMING ONE AGENT!
        return obs
```
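The snippet calls into an `Agent` class (`move`, `get_position`, `set_position`, `initial_position`, plus the `sebango` tag assigned in `add_agent`) that is not included above. Here is a minimal sketch of a compatible agent; the class name, the `(row, col)` step deltas, and the start position are assumptions inferred from how `RoomWorld` uses the agent, not from the thesis code itself:

```python
import numpy as np

# (row, col) deltas for RIGHT, UP, LEFT, DOWN. Only the 0-3 action indices
# are fixed by RoomWorld; this particular mapping is an assumption.
STEPS = {0: (0, 1), 1: (-1, 0), 2: (0, -1), 3: (1, 0)}


class Agent:
    """Minimal agent matching the interface RoomWorld expects."""

    def __init__(self, initial_position=(1, 1)):
        self.initial_position = np.array(initial_position)
        self.position = np.array(initial_position)
        self.sebango = None  # assigned by RoomWorld.add_agent

    def move(self, direction):
        # Proposes a new position; RoomWorld checks walkability and only
        # commits the move via set_position if the cell is not a wall.
        return self.position + np.array(STEPS[direction])

    def get_position(self):
        return self.position

    def set_position(self, pos):
        self.position = np.array(pos)
```

With that in place, a rollout under a random policy would look roughly like this:

```python
env = RoomWorld(goal_position=[7, 9], env_noise=0.1)
env.add_agent(Agent(initial_position=(1, 1)))
obs = env.reset()
for t in range(10_000):
    action = int(np.random.choice(env.action_space))  # random policy
    obs, reward, done = env.step(action, sebango=2)
    if done:
        print(f"Reached goal at step {t}, reward={reward}")
        break
```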