openai / gym

A toolkit for developing and comparing reinforcement learning algorithms.
https://www.gymlibrary.dev

[Bug Report] Unstable output for the state (aka observation) when using env.reset() #3179

Open · Abdelkader-gnichi opened this issue 1 year ago

Abdelkader-gnichi commented 1 year ago


Describe the bug
I run my code and every time I get a different output, so I just want an explanation of why the output differs between runs.

Code example

import numpy as np
import gym
import random
import matplotlib.pyplot as plt
from random import choice
from tqdm.notebook import tqdm
from kaggle_environments import evaluate, make

class ConnectX(gym.Env):
    def __init__(self, switch_prob=0.5):
        self.env = make('connectx', debug=True)
        self.pair = [None, 'negamax']
        self.trainer = self.env.train(self.pair)
        self.switch_prob = switch_prob

        # Define required gym fields (examples):
        config = self.env.configuration
        self.action_space = gym.spaces.Discrete(config.columns)
        self.observation_space = gym.spaces.Discrete(config.columns * config.rows)

    def switch_trainer(self):
        self.pair = self.pair[::-1]
        self.trainer = self.env.train(self.pair)

    def step(self, action):
        return self.trainer.step(action)

    def reset(self):
        if random.uniform(0, 1) < self.switch_prob:
            self.switch_trainer()
        return self.trainer.reset()

    def render(self, **kwargs):
        return self.env.render(**kwargs)

class QTable:
    def __init__(self, action_space):
        self.table = dict()
        self.action_space = action_space

    def add_item(self, state_key):
        self.table[state_key] = list(np.zeros(self.action_space.n))

    def __call__(self, state):
        board = state['board'][:]  # Get a copy
        board.append(state.mark)
        state_key = np.array(board).astype(str)
        state_key = hex(int(''.join(state_key), 3))[2:]
        if state_key not in self.table.keys():
            self.add_item(state_key)

        return self.table[state_key]

env = ConnectX()

alpha = 0.1
gamma = 0.6
epsilon = 0.99
min_epsilon = 0.1

episodes = 10000

alpha_decay_step = 1000
alpha_decay_rate = 0.9
epsilon_decay_rate = 0.9999

q_table = QTable(env.action_space)

all_epochs = []
all_total_rewards = []
all_avg_rewards = []  # Last 100 steps
all_qtable_rows = []
all_epsilons = []

state = env.reset()
print(state.board)  # this is the tricky part

Output

[gadour@fedora ns-3.37]$ /bin/python /home/gadour/Music/beginig.py
Loading environment lux_ai_s2 failed: No module named 'vec_noise'
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

[gadour@fedora ns-3.37]$ /bin/python /home/gadour/Music/beginig.py
Loading environment lux_ai_s2 failed: No module named 'vec_noise'
Traceback (most recent call last):
  File "/home/gadour/Music/beginig.py", line 78, in <module>
    print(state.board)
          ^^^^^^^^^^^
AttributeError: 'Struct' object has no attribute 'board'

(subsequent runs alternate unpredictably between the list output and the AttributeError above)

System Info

OS: Fedora Linux 37
Gym installed via pip (pip install gym)
Python versions: 3.11.1 and 3.10.9

My Question

So why does it sometimes return the list and other times give me an error saying 'Struct' object has no attribute 'board'? Why does that happen? I want a logical explanation, and thanks.

Note: I know that if I use state['board'] it works, but I want to know why the output is unstable. It's 04:48 pm now and I can't sleep thinking about it, so I'm really eager to know why it behaves like that.
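
A minimal diagnostic sketch (assuming the ConnectX class defined above; these lines are illustrative, not part of the original report) that makes the instability visible by printing exactly what reset() returns on each run:

state = env.reset()
print(type(state))                                    # concrete class of the observation
if hasattr(state, 'keys'):
    print(list(state.keys()))                         # which fields are present this run?
print(getattr(state, 'board', 'no board attribute'))  # getattr absorbs the AttributeError

Running this several times should show whether the returned object itself changes shape between runs.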

pseudo-rnd-thoughts commented 1 year ago

I can't reproduce your issue. Please include all of the necessary Python modules and properly indented code.

Abdelkader-gnichi commented 1 year ago

OK @pseudo-rnd-thoughts, this is the full code, and thank you.

import numpy as np
import gym
import random
import matplotlib.pyplot as plt
from random import choice
from tqdm.notebook import tqdm
from kaggle_environments import evaluate, make

class ConnectX(gym.Env):
    def __init__(self, switch_prob=0.5):
        self.env = make('connectx', debug=True)
        self.pair = [None, 'negamax']
        self.trainer = self.env.train(self.pair)
        self.switch_prob = switch_prob

        # Define required gym fields (examples):
        config = self.env.configuration
        self.action_space = gym.spaces.Discrete(config.columns)
        self.observation_space = gym.spaces.Discrete(config.columns * config.rows)

    def switch_trainer(self):
        self.pair = self.pair[::-1]
        self.trainer = self.env.train(self.pair)

    def step(self, action):
        return self.trainer.step(action)

    def reset(self):
        if random.uniform(0, 1) < self.switch_prob:
            self.switch_trainer()
        return self.trainer.reset()

    def render(self, **kwargs):
        return self.env.render(**kwargs)

class QTable:
    def __init__(self, action_space):
        self.table = dict()
        self.action_space = action_space

    def add_item(self, state_key):
        self.table[state_key] = list(np.zeros(self.action_space.n))

    def __call__(self, state):
        board = state.board[:]  # Get a copy
        board.append(state.mark)
        state_key = np.array(board).astype(str)
        state_key = hex(int(''.join(state_key), 3))[2:]
        if state_key not in self.table.keys():
            self.add_item(state_key)

        return self.table[state_key]

env = ConnectX()

q_table = QTable(env.action_space)

all_epochs = []
all_total_rewards = []
all_avg_rewards = []  # Last 100 steps
all_qtable_rows = []
all_epsilons = []

# Hyperparameters (same values as in the first snippet; they are used below)
alpha = 0.1
gamma = 0.6
epsilon = 0.99
min_epsilon = 0.1
episodes = 10000
alpha_decay_step = 1000
alpha_decay_rate = 0.9
epsilon_decay_rate = 0.9999

for i in tqdm(range(episodes)):
    state = env.reset()

    epsilon = max(min_epsilon, epsilon * epsilon_decay_rate)
    epochs, total_rewards = 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = choice([c for c in range(env.action_space.n) if state.board[c] == 0])
        else:
            row = q_table(state)[:]
            selected_items = []
            for j in range(env.action_space.n):
                if state.board[j] == 0:
                    selected_items.append(row[j])
                else:
                    selected_items.append(-1e7)
            action = int(np.argmax(selected_items))

        next_state, reward, done, info = env.step(action)

        # Apply new rules
        if done:
            if reward == 1:  # Won
                reward = 20
            elif reward == 0:  # Lost
                reward = -20
            else:  # Draw
                reward = 10
        else:
            reward = -0.05  # Try to prevent the agent from taking a long move

        old_value = q_table(state)[action]
        next_max = np.max(q_table(next_state))

        # Update Q-value
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table(state)[action] = new_value

        state = next_state
        epochs += 1
        total_rewards += reward

    all_epochs.append(epochs)
    all_total_rewards.append(total_rewards)
    avg_rewards = np.mean(all_total_rewards[max(0, i-100):(i+1)])
    all_avg_rewards.append(avg_rewards)
    all_qtable_rows.append(len(q_table.table))
    all_epsilons.append(epsilon)

    if (i+1) % alpha_decay_step == 0:
        alpha *= alpha_decay_rate

I use Python 3.10.9 in conda; only kaggle_environments was installed using pip.

jjshoots commented 1 year ago

@pseudo-rnd-thoughts @Abdelkader-gnichi It may be because you're not using a fixed seed for the environment. This results in the environment resetting to a new state every time.
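
A minimal sketch of what fixing the seed could look like (this assumes the nondeterminism comes from the unseeded random module that ConnectX.reset uses to decide whether to switch sides; the kaggle environment may have internal randomness of its own that this does not pin):

import random
import numpy as np

random.seed(0)       # pins random.uniform in ConnectX.reset
np.random.seed(0)    # pins any NumPy-based randomness in the training loop

env = ConnectX()
state = env.reset()  # whether the agent plays first is now reproducible

With the coin flip in reset() pinned, repeated runs should stop alternating between the two observation shapes.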

Abdelkader-gnichi commented 1 year ago

@jjshoots I will check it, thank you.