Unity-Technologies / ml-agents

The Unity Machine Learning Agents Toolkit (ML-Agents) is an open-source project that enables games and simulations to serve as environments for training intelligent agents using deep reinforcement learning and imitation learning.
https://unity.com/products/machine-learning-agents

Two agents with different behaviors: one agent calling EndEpisode() triggers an env reset that affects the other agent #6133

Closed LiuWhale closed 2 months ago

LiuWhale commented 2 months ago

**Describe the bug**
With two agents that have different behaviors, when one of them calls EndEpisode() the environment is reset and the other agent is affected.

**To Reproduce**
Steps to reproduce the behavior:

  1. Open the RollerBall example: https://unity-technologies.github.io/ml-agents/Learning-Environment-Create-New/
  2. Add the Target as an agent with the following script:
    
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Actuators;

public class ObstacleAgent : Agent
{

Transform m_Transform;

// Start is called before the first frame update
void Start()
{
    m_Transform = GetComponent<Transform>();
}

public Transform m_Sphere;
// Called at the start of each episode
public override void OnEpisodeBegin()
{
    // Move the target to a new spot
    m_Transform.transform.localPosition = new Vector3(Random.value * 8 - 4,
                                       0.5f,
                                       Random.value * 8 - 4);
}

// Collect all the observations
public override void CollectObservations(VectorSensor sensor)
{
    sensor.AddObservation(m_Transform.transform.localPosition);
    sensor.AddObservation(m_Sphere.localPosition);
    sensor.AddObservation(m_Sphere.GetComponent<Rigidbody>().velocity.x);
    sensor.AddObservation(m_Sphere.GetComponent<Rigidbody>().velocity.z);
}

// Called each time the agent receives an action
public override void OnActionReceived(ActionBuffers actionBuffers)
{
    m_Sphere.position = new Vector3(actionBuffers.ContinuousActions[0], m_Sphere.position.y, actionBuffers.ContinuousActions[1]);

    // Rewards
    float distanceToTarget = Vector3.Distance(this.transform.localPosition, m_Sphere.localPosition);
    SetReward(-0.01f);
    // Reached target
    if (distanceToTarget > 3.0f)
    {
        SetReward(1.0f);
        EndEpisode();
    }
    else
    {
        SetReward(-1.0f);
        EndEpisode();
    }
}

}

3. Change the RollerAgent script as follows (a sketch for verifying both behaviors from Python follows this script):

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Actuators;

public class RollerAgent : Agent
{

Rigidbody rBody;

// Start is called before the first frame update
void Start()
{
    rBody = GetComponent<Rigidbody>();
}

public Transform Target;
// Called at the start of each episode
public override void OnEpisodeBegin()
{
    if (this.transform.localPosition.y < 0)
    {
        this.rBody.angularVelocity = Vector3.zero;
        this.rBody.velocity = Vector3.zero;
        this.transform.localPosition = new Vector3( 0, 0.5f, 0);
    }
}

// Collect all the observations
public override void CollectObservations(VectorSensor sensor)
{
    sensor.AddObservation(Target.localPosition);
    sensor.AddObservation(this.transform.localPosition);
    sensor.AddObservation(rBody.velocity.x);
    sensor.AddObservation(rBody.velocity.z);
}

// Called each time the agent receives an action
public override void OnActionReceived(ActionBuffers actionBuffers)
{
    Vector3 controlSignal = Vector3.zero;
    controlSignal.x = actionBuffers.ContinuousActions[0];
    controlSignal.z = actionBuffers.ContinuousActions[1];
    rBody.AddForce(controlSignal * 10);

    // Rewards
    float distanceToTarget = Vector3.Distance(this.transform.localPosition, Target.localPosition);
    SetReward(-0.01f);
    // Reached target
    if (distanceToTarget < 1.42f)
    {
        SetReward(1.0f);
        EndEpisode();
    }

    // Fell off platform
    else if (this.transform.localPosition.y < 0)
    {
        SetReward(-1.0f);
        EndEpisode();
    }
}

}
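
Before moving on to training, it can help to confirm that both Behavior Names are visible from the Python API. The following is only a minimal sketch (not part of the repro); the build path is a placeholder and must point at a standalone build of the scene above:

```python
from mlagents_envs.environment import UnityEnvironment

# Placeholder path: replace with the standalone build of the RollerBall scene.
env = UnityEnvironment(file_name="path/to/your/build", base_port=5415)
env.reset()

# Every Behavior Name set in the agents' Behavior Parameters should be listed here,
# i.e. one entry for the RollerAgent and one for the ObstacleAgent.
for name, spec in env.behavior_specs.items():
    print(name,
          "obs shapes:", [obs_spec.shape for obs_spec in spec.observation_specs],
          "continuous actions:", spec.action_spec.continuous_size)

env.close()
```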

4. Use the train.py below (a more defensive variant of its get_states helper is sketched after the script):

import argparse
import numpy as np
import torch
import TD3
import utils
from typing import Dict, List
from mlagents_envs.environment import ActionTuple
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

def get_states(env, behaviors, indexs):
    observation: Dict[str, List[np.ndarray]] = {}
    reward: Dict[str, float] = {}
    terminated: Dict[str, bool] = {}
    for behavior_name in behaviors:
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        for agent_id_decisions in decision_steps:
            observation[behavior_name] = decision_steps[agent_id_decisions].obs[indexs[behavior_name]]
            reward[behavior_name] = decision_steps[agent_id_decisions].reward
            terminated[behavior_name] = False
        for agent_id_terminated in terminal_steps:
            observation[behavior_name] = terminal_steps[agent_id_terminated].obs[indexs[behavior_name]]
            reward[behavior_name] = terminal_steps[agent_id_terminated].reward
            terminated[behavior_name] = not terminal_steps[agent_id_terminated].interrupted
    return observation, reward, terminated

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Training arguments')
    parser.add_argument('--no-render', action='store_true', default=False, help='Render the environment')
    parser.add_argument('--seed', type=int, default=0, help='Seed for random number generator')
    parser.add_argument('--time-scale', type=float, default=20.0, help='Time scale for unity')
    parser.add_argument('--quality-level', type=int, default=0, help='Quality level for unity')
    args = parser.parse_args()

try:
    env.close()
except:
    pass
# action mapping function 
# am = ActionMappingClass()
channel = EngineConfigurationChannel()

env = UnityEnvironment(file_name="/home/whale/下载/UnityBuild/USV.x86_64", \
    seed=args.seed, no_graphics=args.no_render, side_channels=[channel], \
    base_port=5415)
channel.set_configuration_parameters(time_scale = args.time_scale, quality_level=args.quality_level)
print("USV environment created.")
env.reset()

# set parameters
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]
state_dim = spec.observation_specs[0].shape[0]
action_dim = spec.action_spec.continuous_size
max_action = 1

# obstacle
obs_behavior_name = list(env.behavior_specs)[1]
obs_spec = env.behavior_specs[obs_behavior_name]
obs_state_dim = obs_spec.observation_specs[0].shape[0]
obs_action_dim = obs_spec.action_spec.continuous_size
obs_max_action = 1

args = {
    'start_timesteps':1e4, 
    'eval_freq': 5e3,
    'expl_noise': 0.1, 
    'batch_size': 256,
    'discount': 0.99,
    'tau': 0.005,
    'policy_noise': 0.2,
    'noise_clip': 0.5,
    'policy_freq': 2   # was 2
}

kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": args['discount'],
    "tau": args['tau'],
}

# Target policy smoothing is scaled wrt the action scale
kwargs["policy_noise"] = args['policy_noise'] * max_action
kwargs["noise_clip"] = args['noise_clip'] * max_action
kwargs["policy_freq"] = args['policy_freq']
policy = TD3.TD3(**kwargs)

obs_kwargs = {
    "state_dim": obs_state_dim,
    "action_dim": obs_action_dim,
    "max_action": obs_max_action,
    "discount": args['discount'],
    "tau": args['tau'],
}

# Target policy smoothing is scaled wrt the action scale
obs_kwargs["policy_noise"] = args['policy_noise'] * obs_max_action
obs_kwargs["noise_clip"] = args['noise_clip'] * obs_max_action
obs_kwargs["policy_freq"] = args['policy_freq']
obs_policy = TD3.TD3(**obs_kwargs)

try:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, max_size=int(5e6))
    obs_replay_buffer = utils.ReplayBuffer(obs_state_dim, obs_action_dim, max_size=int(5e6))
    # set training counters
    stepcounter = 0
    traincounter = 1
    savecounter = 1
    trainlog = []
    finished_count = 0
    collision_count = 0
    agent_max_step = 5

    behavior_names = list(env.behavior_specs)
    indexs = {behavior_names[0]: 0, behavior_names[1]: 0}
    for episode in range(100000):
        env.reset()
        ob, r, done = get_states(env, behavior_names, indexs)
        done = False
        saved = False
        episode_reward = 0
        obs_episode_reward = 0
        episode_timesteps = 0
        agent_step = 0
        for step in range(100000):
            stepcounter += 1
            # generate action for usv and obstacle
            if stepcounter < args['start_timesteps']:
                action = np.random.uniform(-max_action, max_action, action_dim)
                obs_action = np.random.uniform(-obs_max_action, obs_max_action, obs_action_dim)
            else:
                noise = np.random.normal(0, max_action * args['expl_noise'], size=action_dim)
                action = (policy.select_action(ob[behavior_name]) + noise).clip(-max_action, max_action)  # clip here
                obs_noise = np.random.normal(0, obs_max_action * args['expl_noise'], size=obs_action_dim)
                obs_action = (obs_policy.select_action(ob[obs_behavior_name]) + obs_noise).clip(-obs_max_action, obs_max_action)  # clip here

            # usv action
            action_in = np.array([action])
            action_tuple = ActionTuple()
            action_tuple.add_continuous(action_in)
            # obstacle action
            obs_action_in = np.array([obs_action])
            obs_action_tuple = ActionTuple()
            obs_action_tuple.add_continuous(obs_action_in)
            # set actions and perform
            env.set_actions(behavior_name, action_tuple)
            env.set_actions(obs_behavior_name, obs_action_tuple)
            env.step()
            # get states
            next_ob, r, done = get_states(env, behavior_names, indexs)
            # receive message
            if behavior_name in next_ob:
                # store replay buffer
                replay_buffer.add(ob[behavior_name], action, next_ob[behavior_name], r[behavior_name], done[behavior_name])
                ob[behavior_name] = next_ob[behavior_name]
                episode_reward += r[behavior_name]
                if done[behavior_name]: break
                if stepcounter > args['start_timesteps'] and agent_step < agent_max_step:
                    policy.train(replay_buffer, args['batch_size'])
                    traincounter += 1
            # obstacle
            if obs_behavior_name in next_ob:
                obs_replay_buffer.add(ob[obs_behavior_name], obs_action, next_ob[obs_behavior_name], r[obs_behavior_name], done[obs_behavior_name])
                ob[obs_behavior_name] = next_ob[obs_behavior_name]
                obs_episode_reward += r[obs_behavior_name]
                # check if episode is done
                # if done[obs_behavior_name]: break
                # train the model
                if stepcounter > args['start_timesteps'] and agent_step >= agent_max_step \
                    and agent_step < agent_max_step*2:
                    obs_policy.train(obs_replay_buffer, args['batch_size'])
                    agent_step = 0

            # deal the invalid data
            if len(next_ob)==0: 
                print("Both agent num is zero!!!")
                break
            # save TD3 model
            if traincounter % 100000 == 0 and not saved:
                policy.save('model_'+str(savecounter))
                obs_policy.save('obs_model_'+str(savecounter))
                savecounter += 1
                saved = True
                print('TD3AM model', savecounter, 'saved!')
        # End of Episode
        print('episode:', episode, 'reward:', episode_reward, 'obs_episode_reward:', obs_episode_reward, 'step:', step, \
            'finished_count:', finished_count, 'collision_count:', collision_count, 'stepcounter:',stepcounter, 'traincounter:', traincounter)
        # save reward log
        # if saved:
        #     np.save('trainlog.npy', trainlog)
except KeyboardInterrupt:
    print("\nTraining interrupted, continue to next cell to save to save the model.")
finally:
    env.close()
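
As an aside, a more defensive variant of `get_states` can be sketched that carries the previous observation forward when a behavior shows up in neither `decision_steps` nor `terminal_steps` for a step. This is only a sketch of a possible workaround (the `get_states_safe` name and the `prev_observation` argument are hypothetical), not a fix for the underlying behavior:

```python
def get_states_safe(env, behaviors, indexs, prev_observation=None):
    # Start from the previous observations so a behavior that produced neither
    # decision nor terminal steps this step keeps its last known observation
    # instead of disappearing from the returned dict.
    observation = dict(prev_observation or {})
    reward = {name: 0.0 for name in behaviors}
    terminated = {name: False for name in behaviors}
    for behavior_name in behaviors:
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        for agent_id in decision_steps:
            observation[behavior_name] = decision_steps[agent_id].obs[indexs[behavior_name]]
            reward[behavior_name] = decision_steps[agent_id].reward
            terminated[behavior_name] = False
        for agent_id in terminal_steps:
            observation[behavior_name] = terminal_steps[agent_id].obs[indexs[behavior_name]]
            reward[behavior_name] = terminal_steps[agent_id].reward
            terminated[behavior_name] = not terminal_steps[agent_id].interrupted
    return observation, reward, terminated
```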

5. The following error appears:
**Console logs / stack traces**
<img width="740" alt="image" src="https://github.com/user-attachments/assets/bc3e791b-b392-45e4-a981-ddb84cfffb12">

This means an agent is not detected on the Python side in the step where the other agent receives terminal-step data.
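
A small loop like the following (a sketch only; the build path is a placeholder) makes this visible by printing, for every behavior and step, how many agents appear in `decision_steps` versus `terminal_steps`. Per the report above, on the step where one behavior only reports terminal data, the other behavior ends up with zero decision agents:

```python
from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name="path/to/your/build")  # placeholder path
env.reset()

for step in range(20):
    for name, spec in env.behavior_specs.items():
        decision_steps, terminal_steps = env.get_steps(name)
        print(step, name,
              "decision agents:", len(decision_steps),
              "terminal agents:", len(terminal_steps))
        if len(decision_steps) > 0:
            # Random continuous actions for the agents that requested a decision.
            env.set_actions(name, spec.action_spec.random_action(len(decision_steps)))
    env.step()

env.close()
```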
**Environment:**
- Unity Version: Unity 2023.2.12f1
- OS + version: Ubuntu 22.04 LTS
- _ML-Agents version_: ml-agents 1.0.0
- _Torch version_: 2.2.1+cu121
- _Environment_: RollerBall
LiuWhale commented 2 months ago

https://github.com/Unity-Technologies/ml-agents/issues/5792. Resolved.