# GitHub issue transcript: opened by Deadsg, 9 months ago.
import gym
import numpy as np
class SelfIteratingAgent:
    """Tabular Q-learning agent with an epsilon-greedy action policy.

    Maintains a (num_states, num_actions) Q-table and updates it with the
    standard one-step Q-learning rule.
    """

    def __init__(self, num_actions, num_states, learning_rate=0.1,
                 discount_factor=0.9, exploration_prob=0.2):
        # BUG FIX: the original defined `init` (no dunder underscores), so this
        # constructor was never called and no attributes were ever created.
        self.num_actions = num_actions
        self.num_states = num_states
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        # One row per discrete state, one column per action, initialized to 0.
        self.q_table = np.zeros((num_states, num_actions))

    def choose_action(self, state):
        """Return an action for `state`: random with probability
        `exploration_prob`, otherwise the greedy (argmax-Q) action."""
        if np.random.rand() < self.exploration_prob:
            return np.random.choice(self.num_actions)
        return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update:
        Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
        best_next_action = np.argmax(self.q_table[next_state])
        target = reward + self.discount_factor * self.q_table[next_state, best_next_action]
        self.q_table[state, action] += self.learning_rate * (target - self.q_table[state, action])
# Create the CartPole environment.
env = gym.make('CartPole-v1')
num_actions = env.action_space.n

# BUG FIX: the original used observation_space.shape[0] (== 4, the observation
# dimensionality) as the number of discrete states and then indexed the
# Q-table with the raw continuous observation array, which cannot work for a
# tabular agent.  CartPole observations are continuous 4-vectors, so we
# discretize each dimension into NUM_BINS bins and flatten the bin
# coordinates into a single integer state index.
NUM_BINS = 6
obs_dim = env.observation_space.shape[0]
num_states = NUM_BINS ** obs_dim

# Clip ranges per dimension (cart position, cart velocity, pole angle, pole
# angular velocity).  Velocity dimensions are unbounded in the env spec, so
# they are clipped to a practical range before binning.
state_bounds = [(-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-3.5, 3.5)]


def discretize(observation):
    """Map a continuous CartPole observation to one integer in [0, num_states)."""
    index = 0
    for value, (low, high) in zip(observation, state_bounds):
        clipped = min(max(float(value), low), high)
        bin_i = int((clipped - low) / (high - low) * (NUM_BINS - 1))
        index = index * NUM_BINS + bin_i
    return index


agent = SelfIteratingAgent(num_actions, num_states)

num_episodes = 1000

for episode in range(num_episodes):
    reset_result = env.reset()
    # gym >= 0.26 returns (obs, info); older versions return just obs.
    observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = discretize(observation)
    done = False
    total_reward = 0

    while not done:
        action = agent.choose_action(state)
        step_result = env.step(action)
        if len(step_result) == 5:
            # gym >= 0.26: (obs, reward, terminated, truncated, info).
            observation, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            # Legacy API: (obs, reward, done, info).
            observation, reward, done, _ = step_result
        next_state = discretize(observation)
        total_reward += reward
        # Update the Q-table with the observed transition.
        agent.update_q_table(state, action, reward, next_state)
        state = next_state

    print(f"Episode {episode}, Total Reward: {total_reward}")

env.close()
import gym
import numpy as np


class SelfIteratingAgent:
    """Tabular Q-learning agent (epsilon-greedy).

    NOTE(review): this second definition in the paste is truncated — it only
    contains the constructor; the action-selection and update methods appear
    earlier in the issue.
    """

    def __init__(self, num_actions, num_states, learning_rate=0.1,
                 discount_factor=0.9, exploration_prob=0.2):
        # BUG FIX: the original named this `init` (no dunder underscores), so
        # it was never invoked as a constructor.
        self.num_actions = num_actions
        self.num_states = num_states
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        # One row per discrete state, one column per action, initialized to 0.
        self.q_table = np.zeros((num_states, num_actions))
# Create the CartPole environment.
env = gym.make('CartPole-v1')
num_actions = env.action_space.n
# NOTE(review): observation_space.shape[0] is the observation dimensionality
# (4 for CartPole), not a count of discrete states; sizing a tabular Q-table
# this way cannot index continuous observations — confirm intended usage.
num_states = env.observation_space.shape[0]

agent = SelfIteratingAgent(num_actions, num_states)
# Training loop.
num_episodes = 1000

for episode in range(num_episodes):
    # NOTE(review): the interaction body (action selection, env.step, Q-table
    # update) is truncated in this copy of the paste — only the per-episode
    # initialization survives; see the complete loop earlier in the issue.
    state = env.reset()
    done = False
    total_reward = 0

env.close()