The Unity Machine Learning Agents Toolkit (ML-Agents) is an open-source project that enables games and simulations to serve as environments for training intelligent agents using deep reinforcement learning and imitation learning.
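For orientation, here is a minimal sketch of the low-level Python API (`mlagents_envs`) that the training script below is built on. The build path `./Build/Env.x86_64` is a placeholder, not part of the original setup:

```python
# Minimal mlagents_envs loop: connect to a build, step with random actions, close.
from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name="./Build/Env.x86_64")  # placeholder path
env.reset()
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]
for _ in range(10):
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    # random_action() builds an ActionTuple that matches this behavior's action spec
    env.set_actions(behavior_name, spec.action_spec.random_action(len(decision_steps)))
    env.step()
env.close()
```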
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Actuators;
public class ObstacleAgent : Agent
{
    Transform m_Transform;
    public Transform m_Sphere;

    // Start is called before the first frame update
    void Start()
    {
        m_Transform = GetComponent<Transform>();
    }

    // Called at the beginning of each episode
    public override void OnEpisodeBegin()
    {
        // Move the obstacle to a new random spot
        m_Transform.localPosition = new Vector3(Random.value * 8 - 4,
                                                0.5f,
                                                Random.value * 8 - 4);
    }

    // Collect all the observations
    public override void CollectObservations(VectorSensor sensor)
    {
        sensor.AddObservation(m_Transform.localPosition);
        sensor.AddObservation(m_Sphere.localPosition);
        sensor.AddObservation(m_Sphere.GetComponent<Rigidbody>().velocity.x);
        sensor.AddObservation(m_Sphere.GetComponent<Rigidbody>().velocity.z);
    }

    // Called every time the agent receives an action
    public override void OnActionReceived(ActionBuffers actionBuffers)
    {
        m_Sphere.position = new Vector3(actionBuffers.ContinuousActions[0], m_Sphere.position.y, actionBuffers.ContinuousActions[1]);

        // Rewards
        float distanceToTarget = Vector3.Distance(this.transform.localPosition, m_Sphere.localPosition);
        SetReward(-0.01f);
        // Far enough from the sphere
        if (distanceToTarget > 3.0f)
        {
            SetReward(1.0f);
            EndEpisode();
        }
        // Note: this branch also ends the episode, so the obstacle's episode lasts a single action step
        else
        {
            SetReward(-1.0f);
            EndEpisode();
        }
    }
}
3. Change the Ball Agent
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Actuators;
public class RollerAgent : Agent
{
    Rigidbody rBody;
    public Transform Target;

    // Start is called before the first frame update
    void Start()
    {
        rBody = GetComponent<Rigidbody>();
    }

    // Called at the beginning of each episode
    public override void OnEpisodeBegin()
    {
        // If the agent fell off the platform, zero its momentum and reset its position
        if (this.transform.localPosition.y < 0)
        {
            this.rBody.angularVelocity = Vector3.zero;
            this.rBody.velocity = Vector3.zero;
            this.transform.localPosition = new Vector3(0, 0.5f, 0);
        }
    }

    // Collect all the observations
    public override void CollectObservations(VectorSensor sensor)
    {
        sensor.AddObservation(Target.localPosition);
        sensor.AddObservation(this.transform.localPosition);
        sensor.AddObservation(rBody.velocity.x);
        sensor.AddObservation(rBody.velocity.z);
    }

    // Called every time the agent receives an action
    public override void OnActionReceived(ActionBuffers actionBuffers)
    {
        Vector3 controlSignal = Vector3.zero;
        controlSignal.x = actionBuffers.ContinuousActions[0];
        controlSignal.z = actionBuffers.ContinuousActions[1];
        rBody.AddForce(controlSignal * 10);

        // Rewards
        float distanceToTarget = Vector3.Distance(this.transform.localPosition, Target.localPosition);
        SetReward(-0.01f);
        // Reached target
        if (distanceToTarget < 1.42f)
        {
            SetReward(1.0f);
            EndEpisode();
        }
        // Fell off platform
        else if (this.transform.localPosition.y < 0)
        {
            SetReward(-1.0f);
            EndEpisode();
        }
    }
}
4. Write train.py as below
import argparse
import numpy as np
import torch
import TD3
import utils
from typing import Dict, List
from mlagents_envs.environment import ActionTuple
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
def get_states(env, behaviors, indexs):
    observation: Dict[str, List[np.ndarray]] = {}
    reward: Dict[str, float] = {}
    terminated: Dict[str, bool] = {}
    for behavior_name in behaviors:
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        for agent_id_decisions in decision_steps:
            observation[behavior_name] = decision_steps[agent_id_decisions].obs[indexs[behavior_name]]
            reward[behavior_name] = decision_steps[agent_id_decisions].reward
            terminated[behavior_name] = False
        for agent_id_terminated in terminal_steps:
            observation[behavior_name] = terminal_steps[agent_id_terminated].obs[indexs[behavior_name]]
            reward[behavior_name] = terminal_steps[agent_id_terminated].reward
            terminated[behavior_name] = not terminal_steps[agent_id_terminated].interrupted
    return observation, reward, terminated
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Training arguments')
    parser.add_argument('--no-render', action='store_true', default=False, help='Do not render the environment')
    parser.add_argument('--seed', type=int, default=0, help='Seed for random number generator')
    parser.add_argument('--time-scale', type=float, default=20.0, help='Time scale for unity')
    parser.add_argument('--quality-level', type=int, default=0, help='Quality level for unity')
    args = parser.parse_args()
    # Close any environment left over from a previous run (no-op on a fresh start)
    try:
        env.close()
    except Exception:
        pass
    # action mapping function
    # am = ActionMappingClass()
    channel = EngineConfigurationChannel()
    env = UnityEnvironment(file_name="/home/whale/下载/UnityBuild/USV.x86_64",
                           seed=args.seed, no_graphics=args.no_render, side_channels=[channel],
                           base_port=5415)
    channel.set_configuration_parameters(time_scale=args.time_scale, quality_level=args.quality_level)
    print("USV environment created.")
    env.reset()

    # set parameters
    behavior_name = list(env.behavior_specs)[0]
    spec = env.behavior_specs[behavior_name]
    state_dim = spec.observation_specs[0].shape[0]
    action_dim = spec.action_spec.continuous_size
    max_action = 1

    # obstacle
    obs_behavior_name = list(env.behavior_specs)[1]
    obs_spec = env.behavior_specs[obs_behavior_name]
    obs_state_dim = obs_spec.observation_specs[0].shape[0]
    obs_action_dim = obs_spec.action_spec.continuous_size
    obs_max_action = 1

    # TD3 hyperparameters (note: this rebinds `args`, shadowing the argparse namespace above)
    args = {
        'start_timesteps': 1e4,
        'eval_freq': 5e3,
        'expl_noise': 0.1,
        'batch_size': 256,
        'discount': 0.99,
        'tau': 0.005,
        'policy_noise': 0.2,
        'noise_clip': 0.5,
        'policy_freq': 2
    }
    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args['discount'],
        "tau": args['tau'],
    }
    # Target policy smoothing is scaled wrt the action scale
    kwargs["policy_noise"] = args['policy_noise'] * max_action
    kwargs["noise_clip"] = args['noise_clip'] * max_action
    kwargs["policy_freq"] = args['policy_freq']
    policy = TD3.TD3(**kwargs)

    obs_kwargs = {
        "state_dim": obs_state_dim,
        "action_dim": obs_action_dim,
        "max_action": obs_max_action,
        "discount": args['discount'],
        "tau": args['tau'],
    }
    # Target policy smoothing is scaled wrt the action scale
    obs_kwargs["policy_noise"] = args['policy_noise'] * obs_max_action
    obs_kwargs["noise_clip"] = args['noise_clip'] * obs_max_action
    obs_kwargs["policy_freq"] = args['policy_freq']
    obs_policy = TD3.TD3(**obs_kwargs)
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim, max_size=int(5e6))
        obs_replay_buffer = utils.ReplayBuffer(obs_state_dim, obs_action_dim, max_size=int(5e6))

        # set training counters
        stepcounter = 0
        traincounter = 1
        savecounter = 1
        trainlog = []
        finished_count = 0
        collision_count = 0
        agent_max_step = 5
        behavior_names = list(env.behavior_specs)
        indexs = {behavior_names[0]: 0, behavior_names[1]: 0}
        for episode in range(100000):
            env.reset()
            ob, r, done = get_states(env, behavior_names, indexs)
            done = False
            saved = False
            episode_reward = 0
            obs_episode_reward = 0
            episode_timesteps = 0
            agent_step = 0
            for step in range(100000):
                stepcounter += 1
                # generate actions for the USV and the obstacle
                if stepcounter < args['start_timesteps']:
                    action = np.random.uniform(-max_action, max_action, action_dim)
                    obs_action = np.random.uniform(-obs_max_action, obs_max_action, obs_action_dim)
                else:
                    noise = np.random.normal(0, max_action * args['expl_noise'], size=action_dim)
                    action = (policy.select_action(ob[behavior_name]) + noise).clip(-max_action, max_action)
                    obs_noise = np.random.normal(0, obs_max_action * args['expl_noise'], size=obs_action_dim)
                    obs_action = (obs_policy.select_action(ob[obs_behavior_name]) + obs_noise).clip(-obs_max_action, obs_max_action)
                # usv action
                action_in = np.array([action])
                action_tuple = ActionTuple()
                action_tuple.add_continuous(action_in)
                # obstacle action
                obs_action_in = np.array([obs_action])
                obs_action_tuple = ActionTuple()
                obs_action_tuple.add_continuous(obs_action_in)
                # set actions and perform
                env.set_actions(behavior_name, action_tuple)
                env.set_actions(obs_behavior_name, obs_action_tuple)
                env.step()
                # get states
                next_ob, r, done = get_states(env, behavior_names, indexs)
                # usv: only update if this behavior reported a step
                if behavior_name in next_ob:
                    # store the transition in the replay buffer
                    replay_buffer.add(ob[behavior_name], action, next_ob[behavior_name], r[behavior_name], done[behavior_name])
                    ob[behavior_name] = next_ob[behavior_name]
                    episode_reward += r[behavior_name]
                    if done[behavior_name]:
                        break
                    if stepcounter > args['start_timesteps'] and agent_step < agent_max_step:
                        policy.train(replay_buffer, args['batch_size'])
                        traincounter += 1
                # obstacle
                if obs_behavior_name in next_ob:
                    obs_replay_buffer.add(ob[obs_behavior_name], obs_action, next_ob[obs_behavior_name], r[obs_behavior_name], done[obs_behavior_name])
                    ob[obs_behavior_name] = next_ob[obs_behavior_name]
                    obs_episode_reward += r[obs_behavior_name]
                    # check if episode is done
                    # if done[obs_behavior_name]: break
                    # train the model
                    if stepcounter > args['start_timesteps'] and agent_max_step <= agent_step < agent_max_step * 2:
                        obs_policy.train(obs_replay_buffer, args['batch_size'])
                        agent_step = 0
                # deal with the invalid data
                if len(next_ob) == 0:
                    print("Both agent num is zero!!!")
                    break
                # save TD3 model
                if traincounter % 100000 == 0 and not saved:
                    policy.save('model_' + str(savecounter))
                    obs_policy.save('obs_model_' + str(savecounter))
                    savecounter += 1
                    saved = True
                    print('TD3AM model', savecounter, 'saved!')
            # End of Episode
            print('episode:', episode, 'reward:', episode_reward, 'obs_episode_reward:', obs_episode_reward, 'step:', step,
                  'finished_count:', finished_count, 'collision_count:', collision_count, 'stepcounter:', stepcounter, 'traincounter:', traincounter)
            # save reward log
            # if saved:
            #     np.save('trainlog.npy', trainlog)
    except KeyboardInterrupt:
        print("\nTraining interrupted, continue to the next cell to save the model.")
    finally:
        env.close()
5. The error shows up
**Console logs / stack traces**
<img width="740" alt="image" src="https://github.com/user-attachments/assets/bc3e791b-b392-45e4-a981-ddb84cfffb12">
This means one agent is not detected on the Python side while another agent is delivering its terminal step data; see the diagnostic sketch below.
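The symptom can be checked from the Python side with a small loop. This is a sketch that reuses `env` and `behavior_names` from train.py and assumes the same two-behavior scene:

```python
# Diagnostic sketch: after each step, list behaviors that produced neither
# decision steps nor terminal steps. When this prints, get_states() returns
# nothing for that behavior, so `behavior_name in next_ob` is False in train.py.
env.reset()
for i in range(200):
    env.step()  # stepping without set_actions sends default actions
    for name in behavior_names:
        decision_steps, terminal_steps = env.get_steps(name)
        if len(decision_steps) == 0 and len(terminal_steps) == 0:
            print(f"step {i}: behavior '{name}' reported no agents this step")
```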
**Environment:**
- Unity Version: [Unity 2023.2.12f1]
- OS + version: [Ubuntu 22.04 LTS]
- _ML-Agents version_: ml-agents 1.0.0
- _Torch version_: 2.2.1+cu121
- _Environment_: RollerBall
**Describe the bug:** two agents with different behaviors; when one of them executes EndEpisode(), the environment reset is triggered and the other agent is affected.
**To Reproduce:** attach the ObstacleAgent and RollerAgent scripts above to the two agents in the RollerBall scene and run the train.py above.
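A possible Python-side workaround, sketched under the assumption that the missing behavior simply reports on a later step (not a confirmed fix): after `env.reset()`, keep stepping until every behavior has produced an observation before touching `ob[...]`. The helper name `wait_for_all_behaviors` is mine, not from the original script:

```python
# Workaround sketch: populate ob/r/done for every behavior before the step loop.
def wait_for_all_behaviors(env, behavior_names, indexs, max_wait=50):
    ob, r, done = {}, {}, {}
    for _ in range(max_wait):
        o, rr, d = get_states(env, behavior_names, indexs)
        ob.update(o); r.update(rr); done.update(d)
        if all(name in ob for name in behavior_names):
            return ob, r, done
        env.step()  # give the missing agent a chance to reach its next decision point
    raise RuntimeError("behaviors never reported: "
                       + str([n for n in behavior_names if n not in ob]))

# usage in the episode loop, replacing the plain get_states() call after env.reset():
# ob, r, done = wait_for_all_behaviors(env, behavior_names, indexs)
```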