Farama-Foundation / PettingZoo

An API standard for multi-agent reinforcement learning environments, with popular reference environments and related utilities
https://pettingzoo.farama.org

[Bug Report] pettingzoo.atari pong_v3: Pong reward never > 0 #1008

Closed xiezhipeng-git closed 1 year ago

xiezhipeng-git commented 1 year ago

Describe the bug

pettingzoo.atari pong_v3: the reward never appears to be > 0. Name: pettingzoo, Version: 1.23.1

Code example

import random
import numpy as np

from pettingzoo.atari import (
    basketball_pong_v3,
    boxing_v2,
    combat_plane_v2,
    combat_tank_v2,
    double_dunk_v3,
    entombed_cooperative_v3,
    flag_capture_v2,
    foozpong_v3,
    ice_hockey_v2,
    joust_v3,
    mario_bros_v3,
    maze_craze_v3,
    othello_v3,
    pong_v3,
    quadrapong_v4,
    space_invaders_v2,
    space_war_v2,
    surround_v2,
    tennis_v3,
    video_checkers_v4,
    volleyball_pong_v3,
    warlords_v3,
    wizard_of_wor_v3,
)
from pettingzoo.butterfly import (
    cooperative_pong_v5,
    knights_archers_zombies_v10,
    pistonball_v6,
)
from pettingzoo.classic import (
    chess_v5,
    connect_four_v3,
    gin_rummy_v4,
    go_v5,
    hanabi_v4,
    leduc_holdem_v4,
    rps_v2,
    texas_holdem_no_limit_v6,
    texas_holdem_v4,
    tictactoe_v3,
)
from pettingzoo.mpe import (
    simple_adversary_v3,
    simple_crypto_v3,
    simple_push_v3,
    simple_reference_v3,
    simple_speaker_listener_v4,
    simple_spread_v3,
    simple_tag_v3,
    simple_v3,
    simple_world_comm_v3,
)
from pettingzoo.sisl import multiwalker_v9, pursuit_v4, waterworld_v4

all_prefixes = ["atari", "classic", "butterfly", "mpe", "sisl"]

manual_environments = {
    "butterfly/knights_archers_zombies",
    "butterfly/pistonball",
    "butterfly/cooperative_pong",
    "sisl/pursuit",
}

all_environments = {
    "atari/basketball_pong_v3": basketball_pong_v3,
    "atari/boxing_v2": boxing_v2,
    "atari/combat_tank_v2": combat_tank_v2,
    "atari/combat_plane_v2": combat_plane_v2,
    "atari/double_dunk_v3": double_dunk_v3,
    "atari/entombed_cooperative_v3": entombed_cooperative_v3,
    "atari/flag_capture_v2": flag_capture_v2,
    "atari/foozpong_v3": foozpong_v3,
    "atari/joust_v3": joust_v3,
    "atari/ice_hockey_v2": ice_hockey_v2,
    "atari/maze_craze_v3": maze_craze_v3,
    "atari/mario_bros_v3": mario_bros_v3,
    "atari/othello_v3": othello_v3,
    "atari/pong_v3": pong_v3,
    "atari/quadrapong_v4": quadrapong_v4,
    "atari/space_invaders_v2": space_invaders_v2,
    "atari/space_war_v2": space_war_v2,
    "atari/surround_v2": surround_v2,
    "atari/tennis_v3": tennis_v3,
    "atari/video_checkers_v4": video_checkers_v4,
    "atari/volleyball_pong_v3": volleyball_pong_v3,
    "atari/wizard_of_wor_v3": wizard_of_wor_v3,
    "atari/warlords_v3": warlords_v3,
    "classic/chess_v5": chess_v5,
    "classic/rps_v2": rps_v2,
    "classic/connect_four_v3": connect_four_v3,
    "classic/tictactoe_v3": tictactoe_v3,
    "classic/leduc_holdem_v4": leduc_holdem_v4,
    "classic/texas_holdem_v4": texas_holdem_v4,
    "classic/texas_holdem_no_limit_v6": texas_holdem_no_limit_v6,
    "classic/gin_rummy_v4": gin_rummy_v4,
    "classic/go_v5": go_v5,
    "classic/hanabi_v4": hanabi_v4,
    "butterfly/knights_archers_zombies_v10": knights_archers_zombies_v10,
    "butterfly/pistonball_v6": pistonball_v6,
    "butterfly/cooperative_pong_v5": cooperative_pong_v5,
    "mpe/simple_adversary_v3": simple_adversary_v3,
    "mpe/simple_crypto_v3": simple_crypto_v3,
    "mpe/simple_push_v3": simple_push_v3,
    "mpe/simple_reference_v3": simple_reference_v3,
    "mpe/simple_speaker_listener_v4": simple_speaker_listener_v4,
    "mpe/simple_spread_v3": simple_spread_v3,
    "mpe/simple_tag_v3": simple_tag_v3,
    "mpe/simple_world_comm_v3": simple_world_comm_v3,
    "mpe/simple_v3": simple_v3,
    "sisl/multiwalker_v9": multiwalker_v9,
    "sisl/waterworld_v4": waterworld_v4,
    "sisl/pursuit_v4": pursuit_v4,
}

def getRandomActions(env, observations, terminations, truncations):
    """Sample a random (mask-aware) action for every live agent.

    Returns the sampled actions together with the observations used to sample them.
    """
    actions = {}
    new_observations = {}
    for agent in env.agents:
        if terminations[agent] or truncations[agent]:
            print("an agent has terminated or truncated")

        # Prefer the observation that was passed in; otherwise fall back to env.observe().
        obs = None
        if observations is not None and agent in observations:
            obs = observations[agent]
        elif hasattr(env, "observe"):
            obs = env.observe(agent)

        if isinstance(obs, dict) and "action_mask" in obs:
            if obs["action_mask"].max() != 0:
                action = random.choice(np.flatnonzero(obs["action_mask"]).tolist())
            else:
                action = None
        else:
            action = env.action_space(agent).sample()

        actions[agent] = action
        new_observations[agent] = obs
    return actions, new_observations

def getRandomAndStep(env, step=0):
    """Iterate an AEC env with random (mask-aware) actions, collecting per-agent results."""
    rewards = {}
    terminations = {}
    truncations = {}
    for agent in env.agent_iter():
        obs, reward, termination, truncation, _ = env.last()
        if termination or truncation:
            action = None
        elif isinstance(obs, dict) and "action_mask" in obs:
            action = random.choice(np.flatnonzero(obs["action_mask"]).tolist())
        else:
            action = env.action_space(agent).sample()
        env.step(action)
        rewards[agent] = reward
        terminations[agent] = termination
        truncations[agent] = truncation
    return rewards, terminations, truncations

def random_demo(env, render=True, episodes=1, parallel=True):
    """Runs an env object with random actions."""
    total_reward = 0
    completed_episodes = 0
    # uniformly use getRandomActions for the parallel code path
    while completed_episodes < episodes:
        observations = env.reset()

        terminations = {agent: False for agent in env.possible_agents}
        truncations = {agent: False for agent in env.possible_agents}
        rewards = None
        test_cycles = 2000  # allows environment to do more than max_cycles if it so wishes
        if render:
            env.render()
        for step in range(test_cycles):
            if parallel:
                # Observations are also gathered here; note that they may be turn-based observations.
                actions, observations = getRandomActions(env, observations, terminations, truncations)
                if hasattr(env, "agent_selection"):
                    action = actions[env.agent_selection]
                    env.step(action)
                    rewards = env.env.rewards
                    terminations = env.env.terminations
                    truncations = env.env.truncations
                    infos = env.env.infos
                else:
                    observations, rewards, terminations, truncations, infos = env.step(actions)
                if rewards is not None:
                    for agent in rewards:
                        total_reward += rewards[agent] / float(len(rewards))
                # terminations is the equivalent of done
                if all([x or y for x, y in zip(terminations.values(), truncations.values())]):
                    print("episode ended early")
                    break
            else:
                # Observations are also gathered here; note that they may be turn-based observations.
                rewards, terminations, truncations = getRandomAndStep(env, step)
                if rewards is not None:
                    for agent in rewards:
                        total_reward += rewards[agent] / float(len(rewards))
                if all([x or y for x, y in zip(terminations.values(), truncations.values())]):
                    break

        if rewards is not None:
            print("Episode", completed_episodes, ":", rewards, total_reward)

        completed_episodes += 1

    if render:
        env.close()

    print("Average total reward", total_reward / episodes)

    return total_reward

if __name__ == "__main__":
    # pip install autorom
    # AutoROM

    # Using environments in PettingZoo is very similar to Gym, i.e. you initialize an environment via:
    # model = multiwalker_v9
    model = pong_v3

    # model = chess_v5
    # model = all_environments["atari/foozpong_v3"]
    # model = all_environments["sisl/multiwalker_v9"]

    # model = all_environments["atari/mario_bros_v3"]

    # from pettingzoo.butterfly import pistonball_v6
    # env = pistonball_v6.env(render_mode="human")
    # env = chess_v5.env(render_mode="human")
    # env = multiwalker_v9.env(render_mode="human")
    if hasattr(model,"parallel_env"):
        env = model.parallel_env(render_mode="human")
        parallel = True
    else:
        env = model.env(render_mode="human")
        parallel = False
        # for now, always use the parallel code path
    random_demo(env, render=True, episodes=10, parallel=parallel)

    # random_demo(env, render=True, episodes=10)

System info

Describe how PettingZoo was installed (pip, source, ...): pip
Version (pettingzoo.__version__): 1.23.1
OS/version (note that while we will accept PRs to improve Windows support, we do not officially support it): WSL Ubuntu 22.04 on Windows 11
Python version: 3.10.6

Additional context

No response


elliottower commented 1 year ago

Hi, thanks for the bug report. Your code is a bit complicated, so I tried to reproduce the problem in as simple an example as possible, and I found the rewards were working fine. The following code prints any rewards which are nonzero.

env = pong_v3.env()
env.reset(seed=42)

turn = 0

for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()

    if reward != 0:
        print(f"TURN {turn} | {agent}: {reward}")

    if termination or truncation:
        break

    else:
        if "action_mask" in info:
            mask = info["action_mask"]
        elif isinstance(observation, dict) and "action_mask" in observation:
            mask = observation["action_mask"]
        else:
            mask = None

        action = env.action_space(agent).sample(mask) # this is where you would insert your policy

    env.step(action)
    turn += 1
env.close()

This should have an output similar to this:

TURN 266 | first_0: 1
TURN 267 | second_0: -1
TURN 548 | first_0: 1
TURN 549 | second_0: -1
TURN 830 | first_0: 1
TURN 831 | second_0: -1

However, I see that our average_total_reward utility doesn't actually work with zero-sum games, which most of the environments are, so I think I will make a PR adding a utility that tracks the scores and returns them after running games. As a basic way to track scores, try this code:

env = pong_v3.env()
env.reset(seed=42)

total_rewards = {agent: 0 for agent in env.agents}

for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()

    # When the enemy scores, your agent gets -1 return, but the total score for your agent stays the same
    # The environment ends when one player reaches 20 points
    if reward > 0:
        total_rewards[agent] += reward

    if termination or truncation:
        print(f"Final rewards: {total_rewards}")
        break

    else:
        if "action_mask" in info:
            mask = info["action_mask"]
        elif isinstance(observation, dict) and "action_mask" in observation:
            mask = observation["action_mask"]
        else:
            mask = None

        action = env.action_space(agent).sample(mask) # this is where you would insert your policy

    env.step(action)
env.close()

This will tally up all scores received by agents over the course of the game:

Final rewards: {'first_0': 14, 'second_0': 20}

xiezhipeng-git commented 1 year ago

I found that the cause of the problem was test_cycles = 2000: 2000 cycles were not enough for Pong to complete a round, but when I changed it to 5000, I saw rewards of (1, -1). I had assumed that the rewards should match the numbers displayed on screen, so I thought the rewards were always zero. In that case, let's close this issue.
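For reference, a minimal sketch along the lines of the snippet above (using pong_v3 and an arbitrary seed) that removes the fixed test_cycles cap and simply runs until the episode terminates, printing every nonzero reward as it arrives:

from pettingzoo.atari import pong_v3

env = pong_v3.env()
env.reset(seed=42)

# Run until the episode actually ends instead of stopping after a fixed
# number of cycles; points show up during the game, not only at the end.
for agent in env.agent_iter():
    obs, reward, termination, truncation, info = env.last()
    if reward != 0:
        print(agent, reward)
    action = None if termination or truncation else env.action_space(agent).sample()
    env.step(action)
env.close()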

xiezhipeng-git commented 1 year ago

The cause of the problem has been found, but it is still a problem: if a Pong game has to complete a full round to receive rewards of 1 and -1, then the training of this Pong game would have to be twice as complex as n squared, so I am reopening this issue.

elliottower commented 1 year ago

The cause of the problem has been found, but it is still a problem: if a Pong game has to complete a full round to receive rewards of 1 and -1, then the training of this Pong game would have to be twice as complex as n squared, so I am reopening this issue.

Not sure where you're getting N squared for training, but I think you're misunderstanding how the rewards work.

Running the code that I sent, you can put a breakpoint inside the if termination or truncation: block and view it with human rendering, and you will see that it plays a single round to 20 points; each time an agent scores, the rewards for that timestep will be 1 for the agent who scored and -1 for the other agent. We know this is true because my code adds up these individual rewards to calculate the total reward, and the total reward is the same as what you see on the screen. Here are some screenshots just to show what I mean. [Screenshots attached to the original issue.]

The last reward value which you see will just correspond to whoever scores the winning point (reward in the above screenshot), so you are correct that when the game is complete the rewards will be (1, -1), but you are incorrect that there is only a single reward at the end of the round; as I said above, each point that is scored results in a reward of (-1, 1) or vice versa.

xiezhipeng-git commented 1 year ago

@elliottower You are right. During the process, there will be rewards for whoever scores. The mistake is that this is not really a zero-sum game; a score of (1, 0) would be reasonable. Okay, let's change the title.

elliottower commented 1 year ago

I talked with some other devs internally, and the way it currently works is as intended. The reward allows the agent to distinguish between three possible scenarios: you score (+1), nobody scores (0), and the opponent scores (-1). Two players cannot score in the same timestep, so there are no other options to consider. If the rewards returned were (1, 0) rather than (1, -1), then agents would have no way of telling whether the opponent scored. Giving a negative reward provides negative feedback so the agent can learn not to let the opponent score again in the future.
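To make that concrete, here is a minimal, hypothetical helper (not part of PettingZoo) showing what a single agent can infer from its own per-timestep reward under the current scheme, and what would be lost with (1, 0):

def interpret_reward(reward: int) -> str:
    """What one agent can conclude from its own per-timestep reward in pong_v3."""
    if reward == 1:
        return "we scored"
    if reward == -1:
        return "the opponent scored"
    return "nobody scored"

# Under a (1, 0) scheme, a reward of 0 would be ambiguous: it could mean
# either "nobody scored" or "the opponent scored".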

As a side note, the way you calculate the final score is entirely up to you. You could tally up the points and only count the game as a 1 if your agent wins and a -1 if it loses, or you could use the actual values, calculated by tallying like I showed above (e.g., 20 points to 12 points). I am considering copying this wrapper into PettingZoo so we have some better utilities to track statistics, total rewards, and such; if you think this would be helpful, please let me know.
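For instance, a tallied score dict like the one produced by the earlier snippet could be reduced to a win/loss outcome along these lines (a sketch; game_outcome is a hypothetical helper, not a PettingZoo utility):

def game_outcome(total_rewards):
    """Map tallied points to +1 for the winner, -1 for the loser, 0 for a tie."""
    scores = list(total_rewards.values())
    best = max(scores)
    tie = scores.count(best) > 1
    return {
        agent: 0 if tie else (1 if score == best else -1)
        for agent, score in total_rewards.items()
    }

print(game_outcome({"first_0": 14, "second_0": 20}))
# {'first_0': -1, 'second_0': 1}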

xiezhipeng-git commented 1 year ago

This wrapper is useful. However, the reason I was initially confused is that the scores displayed in Pong are inconsistent with the rewards provided by the environment. For AI learning, it seems that (+1, -1) and (+1, 0) would work out about the same, because during training only a single reward value is provided and the sign of the reward has little impact (this would need dedicated experiments to validate). Therefore, I still believe that aligning the rewards with the scores displayed in the game would be the best approach, but the current solution is also acceptable.

elliottower commented 1 year ago

This wrapper is useful. However, the reason I was initially confused is that the scores displayed in Pong are inconsistent with the rewards provided by the environment. For AI learning, it seems that (+1, -1) and (+1, 0) would work out about the same, because during training only a single reward value is provided and the sign of the reward has little impact (this would need dedicated experiments to validate). Therefore, I still believe that aligning the rewards with the scores displayed in the game would be the best approach, but the current solution is also acceptable.

Are you suggesting that the scores be returned as the totals, so that when the final point is scored the winning agent receives a reward of 20? That would likely mess with the learning, as the scale of rewards would change so much over the course of the game. Otherwise, if you are suggesting that the rewards be (+1, 0) rather than (+1, -1), we could do that, but I see no strong reason to: at best it's the same and at worst it loses some information. I'll also note that if you use the parallel wrappers, the rewards are returned for both agents each timestep, so having (+1, -1) makes the most sense for that scenario.
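For example, a minimal parallel-API loop (a sketch, assuming pong_v3; the seed is arbitrary) shows both agents' rewards arriving in a single dict every timestep:

from pettingzoo.atari import pong_v3

env = pong_v3.parallel_env()
reset_out = env.reset(seed=42)
# Recent PettingZoo versions return (observations, infos) from reset;
# older ones return observations only.
observations = reset_out[0] if isinstance(reset_out, tuple) else reset_out

while env.agents:
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}
    observations, rewards, terminations, truncations, infos = env.step(actions)
    if any(r != 0 for r in rewards.values()):
        # Both agents' rewards arrive together, e.g. {'first_0': 1, 'second_0': -1}
        print(rewards)
env.close()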

And yeah, I was going to (or already did?) link that wrapper; I am planning to implement something like it in PettingZoo as well. I talked with the Gymnasium devs a bit about it and how it should be implemented. If you're interested in helping out with that, we're always looking for new contributors.

Appreciate you opening the issue, definitely good to take a look into things like this and ensure they all make sense and are done the right way.

xiezhipeng-git commented 1 year ago

Otherwise, if you are suggesting that the rewards be (+1, 0) rather than (+1, -1), we could do that, but I see no strong reason to: at best it's the same and at worst it loses some information. I'll also note that if you use the parallel wrappers, the rewards are returned for both agents each timestep, so having (+1, -1) makes the most sense for that scenario.

This makes a lot of sense. I originally thought it would be better to return (1, 0) and let users accumulate the scores themselves to match the display, but what you said seems more reasonable.