Unity-Technologies / ml-agents

The Unity Machine Learning Agents Toolkit (ML-Agents) is an open-source project that enables games and simulations to serve as environments for training intelligent agents using deep reinforcement learning and imitation learning.
https://unity.com/products/machine-learning-agents

Two agents with different behaviors: one agent calling EndEpisode() triggers an env reset that affects the other agent #6133

Closed LiuWhale closed 2 months ago

LiuWhale commented 2 months ago

**Describe the bug**
With two agents that have different behaviors, when one of them calls EndEpisode() the environment is reset and the other agent is affected.

**To Reproduce**
Steps to reproduce the behavior:

  1. Open the RollerBall example: https://unity-technologies.github.io/ml-agents/Learning-Environment-Create-New/
  2. Add the Target as an agent with the following script:
    
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Actuators;

public class ObstacleAgent : Agent
{

Transform m_Transform;

// Start is called before the first frame update
void Start()
{
    m_Transform = GetComponent<Transform>();
}

public Transform m_Sphere;
// Called at the start of each episode
public override void OnEpisodeBegin()
{
    // Move the target to a new spot
    m_Transform.transform.localPosition = new Vector3(Random.value * 8 - 4,
                                       0.5f,
                                       Random.value * 8 - 4);
}

// Collect all the observations
public override void CollectObservations(VectorSensor sensor)
{
    sensor.AddObservation(m_Transform.transform.localPosition);
    sensor.AddObservation(m_Sphere.localPosition);
    sensor.AddObservation(m_Sphere.GetComponent<Rigidbody>().velocity.x);
    sensor.AddObservation(m_Sphere.GetComponent<Rigidbody>().velocity.z);
}

// Called each time the agent receives an action
public override void OnActionReceived(ActionBuffers actionBuffers)
{
    m_Sphere.position = new Vector3(actionBuffers.ContinuousActions[0], m_Sphere.position.y, actionBuffers.ContinuousActions[1]);

    // Rewards
    float distanceToTarget = Vector3.Distance(this.transform.localPosition, m_Sphere.localPosition);
    SetReward(-0.01f);
    // Reached target
    if (distanceToTarget > 3.0f)
    {
        SetReward(1.0f);
        EndEpisode();
    }
    else
    {
        SetReward(-1.0f);
        EndEpisode();
    }
}

}

3. Change the RollerAgent script as follows (a sketch for verifying both behaviors from Python follows this script):

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Actuators;

public class RollerAgent : Agent
{

Rigidbody rBody;

// Start is called before the first frame update
void Start()
{
    rBody = GetComponent<Rigidbody>();
}

public Transform Target;
// Called at the start of each episode
public override void OnEpisodeBegin()
{
    if (this.transform.localPosition.y < 0)
    {
        this.rBody.angularVelocity = Vector3.zero;
        this.rBody.velocity = Vector3.zero;
        this.transform.localPosition = new Vector3( 0, 0.5f, 0);
    }
}

// Collect all the observations
public override void CollectObservations(VectorSensor sensor)
{
    sensor.AddObservation(Target.localPosition);
    sensor.AddObservation(this.transform.localPosition);
    sensor.AddObservation(rBody.velocity.x);
    sensor.AddObservation(rBody.velocity.z);
}

// Called each time the agent receives an action
public override void OnActionReceived(ActionBuffers actionBuffers)
{
    Vector3 controlSignal = Vector3.zero;
    controlSignal.x = actionBuffers.ContinuousActions[0];
    controlSignal.z = actionBuffers.ContinuousActions[1];
    rBody.AddForce(controlSignal * 10);

    // Rewards
    float distanceToTarget = Vector3.Distance(this.transform.localPosition, Target.localPosition);
    SetReward(-0.01f);
    // Reached target
    if (distanceToTarget < 1.42f)
    {
        SetReward(1.0f);
        EndEpisode();
    }

    // Fell off platform
    else if (this.transform.localPosition.y < 0)
    {
        SetReward(-1.0f);
        EndEpisode();
    }
}

}
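
Before moving on to training, it can help to confirm that both Behavior Names are visible from the Python API. The following is only a minimal sketch (not part of the repro); the build path is a placeholder and must point at a standalone build of the scene above:

```python
from mlagents_envs.environment import UnityEnvironment

# Placeholder path: replace with the standalone build of the RollerBall scene.
env = UnityEnvironment(file_name="path/to/your/build", base_port=5415)
env.reset()

# Every Behavior Name set in the agents' Behavior Parameters should be listed here,
# i.e. one entry for the RollerAgent and one for the ObstacleAgent.
for name, spec in env.behavior_specs.items():
    print(name,
          "obs shapes:", [obs_spec.shape for obs_spec in spec.observation_specs],
          "continuous actions:", spec.action_spec.continuous_size)

env.close()
```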

4. Use the train.py below (a more defensive variant of its get_states helper is sketched after the script):

import argparse
import numpy as np
import torch
import TD3
import utils
from typing import Dict, List
from mlagents_envs.environment import ActionTuple
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

def get_states(env, behaviors, indexs):
    observation: Dict[str, List[np.ndarray]] = {}
    reward: Dict[str, float] = {}
    terminated: Dict[str, bool] = {}
    for behavior_name in behaviors:
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        for agent_id_decisions in decision_steps:
            observation[behavior_name] = decision_steps[agent_id_decisions].obs[indexs[behavior_name]]
            reward[behavior_name] = decision_steps[agent_id_decisions].reward
            terminated[behavior_name] = False
        for agent_id_terminated in terminal_steps:
            observation[behavior_name] = terminal_steps[agent_id_terminated].obs[indexs[behavior_name]]
            reward[behavior_name] = terminal_steps[agent_id_terminated].reward
            terminated[behavior_name] = not terminal_steps[agent_id_terminated].interrupted
    return observation, reward, terminated

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Training arguments')
    parser.add_argument('--no-render', action='store_true', default=False, help='Render the environment')
    parser.add_argument('--seed', type=int, default=0, help='Seed for random number generator')
    parser.add_argument('--time-scale', type=float, default=20.0, help='Time scale for unity')
    parser.add_argument('--quality-level', type=int, default=0, help='Quality level for unity')
    args = parser.parse_args()

try:
    env.close()
except:
    pass
# action mapping function 
# am = ActionMappingClass()
channel = EngineConfigurationChannel()

env = UnityEnvironment(file_name="/home/whale/下载/UnityBuild/USV.x86_64", \
    seed=args.seed, no_graphics=args.no_render, side_channels=[channel], \
    base_port=5415)
channel.set_configuration_parameters(time_scale = args.time_scale, quality_level=args.quality_level)
print("USV environment created.")
env.reset()

# set parameters
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]
state_dim = spec.observation_specs[0].shape[0]
action_dim = spec.action_spec.continuous_size
max_action = 1

# obstacle
obs_behavior_name = list(env.behavior_specs)[1]
obs_spec = env.behavior_specs[obs_behavior_name]
obs_state_dim = obs_spec.observation_specs[0].shape[0]
obs_action_dim = obs_spec.action_spec.continuous_size
obs_max_action = 1

args = {
    'start_timesteps':1e4, 
    'eval_freq': 5e3,
    'expl_noise': 0.1, 
    'batch_size': 256,
    'discount': 0.99,
    'tau': 0.005,
    'policy_noise': 0.2,
    'noise_clip': 0.5,
    'policy_freq': 2   # was 2
}

kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": args['discount'],
    "tau": args['tau'],
}

# Target policy smoothing is scaled wrt the action scale
kwargs["policy_noise"] = args['policy_noise'] * max_action
kwargs["noise_clip"] = args['noise_clip'] * max_action
kwargs["policy_freq"] = args['policy_freq']
policy = TD3.TD3(**kwargs)

obs_kwargs = {
    "state_dim": obs_state_dim,
    "action_dim": obs_action_dim,
    "max_action": obs_max_action,
    "discount": args['discount'],
    "tau": args['tau'],
}

# Target policy smoothing is scaled wrt the action scale
obs_kwargs["policy_noise"] = args['policy_noise'] * obs_max_action
obs_kwargs["noise_clip"] = args['noise_clip'] * obs_max_action
obs_kwargs["policy_freq"] = args['policy_freq']
obs_policy = TD3.TD3(**obs_kwargs)

try:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, max_size=int(5e6))
    obs_replay_buffer = utils.ReplayBuffer(obs_state_dim, obs_action_dim, max_size=int(5e6))
    # set training counters
    stepcounter = 0
    traincounter = 1
    savecounter = 1
    trainlog = []
    finished_count = 0
    collision_count = 0
    agent_max_step = 5

    behavior_names = list(env.behavior_specs)
    indexs = {behavior_names[0]: 0, behavior_names[1]: 0}
    for episode in range(100000):
        env.reset()
        ob, r, done = get_states(env, behavior_names, indexs)
        done = False
        saved = False
        episode_reward = 0
        obs_episode_reward = 0
        episode_timesteps = 0
        agent_step = 0
        for step in range(100000):
            stepcounter += 1
            # generate action for usv and obstacle
            if stepcounter < args['start_timesteps']:
                action = np.random.uniform(-max_action, max_action, action_dim)
                obs_action = np.random.uniform(-obs_max_action, obs_max_action, obs_action_dim)
            else:
                noise = np.random.normal(0, max_action * args['expl_noise'], size=action_dim)
                action = (policy.select_action(ob[behavior_name]) + noise).clip(-max_action, max_action)  # clip here
                obs_noise = np.random.normal(0, obs_max_action * args['expl_noise'], size=obs_action_dim)
                obs_action = (obs_policy.select_action(ob[obs_behavior_name]) + obs_noise).clip(-obs_max_action, obs_max_action)  # clip here

            # usv action
            action_in = np.array([action])
            action_tuple = ActionTuple()
            action_tuple.add_continuous(action_in)
            # obstacle action
            obs_action_in = np.array([obs_action])
            obs_action_tuple = ActionTuple()
            obs_action_tuple.add_continuous(obs_action_in)
            # set actions and perform
            env.set_actions(behavior_name, action_tuple)
            env.set_actions(obs_behavior_name, obs_action_tuple)
            env.step()
            # get states
            next_ob, r, done = get_states(env, behavior_names, indexs)
            # receive message
            if behavior_name in next_ob:
                # store replay buffer
                replay_buffer.add(ob[behavior_name], action, next_ob[behavior_name], r[behavior_name], done[behavior_name])
                ob[behavior_name] = next_ob[behavior_name]
                episode_reward += r[behavior_name]
                if done[behavior_name]: break
                if stepcounter > args['start_timesteps'] and agent_step < agent_max_step:
                    policy.train(replay_buffer, args['batch_size'])
                    traincounter += 1
            # obstacle
            if obs_behavior_name in next_ob:
                obs_replay_buffer.add(ob[obs_behavior_name], obs_action, next_ob[obs_behavior_name], r[obs_behavior_name], done[obs_behavior_name])
                ob[obs_behavior_name] = next_ob[obs_behavior_name]
                obs_episode_reward += r[obs_behavior_name]
                # check if episode is done
                # if done[obs_behavior_name]: break
                # train the model
                if stepcounter > args['start_timesteps'] and agent_step >= agent_max_step \
                    and agent_step < agent_max_step*2:
                    obs_policy.train(obs_replay_buffer, args['batch_size'])
                    agent_step = 0

            # deal the invalid data
            if len(next_ob)==0: 
                print("Both agent num is zero!!!")
                break
            # save TD3 model
            if traincounter % 100000 == 0 and not saved:
                policy.save('model_'+str(savecounter))
                obs_policy.save('obs_model_'+str(savecounter))
                savecounter += 1
                saved = True
                print('TD3AM model', savecounter, 'saved!')
        # End of Episode
        print('episode:', episode, 'reward:', episode_reward, 'obs_episode_reward:', obs_episode_reward, 'step:', step, \
            'finished_count:', finished_count, 'collision_count:', collision_count, 'stepcounter:',stepcounter, 'traincounter:', traincounter)
        # save reward log
        # if saved:
        #     np.save('trainlog.npy', trainlog)
except KeyboardInterrupt:
    print("\nTraining interrupted, continue to next cell to save to save the model.")
finally:
    env.close()
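
As an aside, a more defensive variant of `get_states` can be sketched that carries the previous observation forward when a behavior shows up in neither `decision_steps` nor `terminal_steps` for a step. This is only a sketch of a possible workaround (the `get_states_safe` name and the `prev_observation` argument are hypothetical), not a fix for the underlying behavior:

```python
def get_states_safe(env, behaviors, indexs, prev_observation=None):
    # Start from the previous observations so a behavior that produced neither
    # decision nor terminal steps this step keeps its last known observation
    # instead of disappearing from the returned dict.
    observation = dict(prev_observation or {})
    reward = {name: 0.0 for name in behaviors}
    terminated = {name: False for name in behaviors}
    for behavior_name in behaviors:
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        for agent_id in decision_steps:
            observation[behavior_name] = decision_steps[agent_id].obs[indexs[behavior_name]]
            reward[behavior_name] = decision_steps[agent_id].reward
            terminated[behavior_name] = False
        for agent_id in terminal_steps:
            observation[behavior_name] = terminal_steps[agent_id].obs[indexs[behavior_name]]
            reward[behavior_name] = terminal_steps[agent_id].reward
            terminated[behavior_name] = not terminal_steps[agent_id].interrupted
    return observation, reward, terminated
```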

5. The following error appears:
**Console logs / stack traces**
<img width="740" alt="image" src="https://github.com/user-attachments/assets/bc3e791b-b392-45e4-a981-ddb84cfffb12">

This means an agent is not detected on the Python side in the step where the other agent receives terminal-step data.
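
A small loop like the following (a sketch only; the build path is a placeholder) makes this visible by printing, for every behavior and step, how many agents appear in `decision_steps` versus `terminal_steps`. Per the report above, on the step where one behavior only reports terminal data, the other behavior ends up with zero decision agents:

```python
from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name="path/to/your/build")  # placeholder path
env.reset()

for step in range(20):
    for name, spec in env.behavior_specs.items():
        decision_steps, terminal_steps = env.get_steps(name)
        print(step, name,
              "decision agents:", len(decision_steps),
              "terminal agents:", len(terminal_steps))
        if len(decision_steps) > 0:
            # Random continuous actions for the agents that requested a decision.
            env.set_actions(name, spec.action_spec.random_action(len(decision_steps)))
    env.step()

env.close()
```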
**Environment:**
- Unity Version: Unity 2023.2.12f1
- OS + version: Ubuntu 22.04 LTS
- _ML-Agents version_: ml-agents 1.0.0
- _Torch version_: 2.2.1+cu121
- _Environment_: RollerBall
LiuWhale commented 2 months ago

https://github.com/Unity-Technologies/ml-agents/issues/5792. Resolved.