isaac-sim / IsaacLab

Unified framework for robot learning built on NVIDIA Isaac Sim
https://isaac-sim.github.io/IsaacLab

[Bug Report] Wandb sweep agent cuts into the Isaac Sim thread before app.update() is called, causing the process to hang forever #658

Open breadli428 opened 2 months ago

breadli428 commented 2 months ago

Describe the bug

The Wandb sweep agent cuts into the Isaac Sim thread before app.update() is called, which causes the process to hang forever.

Steps to reproduce

# Copyright (c) 2022-2023, The ORBIT Project Developers.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

"""Script to finetune RL agent with RSL-RL."""

from __future__ import annotations

"""Launch Isaac Sim Simulator first."""

import argparse
import os

from omni.isaac.orbit.app import AppLauncher

# local imports
import cli_args  # isort: skip

# add argparse arguments
parser = argparse.ArgumentParser(description="Finetune an RL agent with RSL-RL.")
parser.add_argument("--video", action="store_true", default=False, help="Record videos during finetuning.")
parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).")
parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).")
parser.add_argument("--cpu", action="store_true", default=False, help="Use CPU pipeline.")
parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.")
parser.add_argument("--task", type=str, default=None, help="Name of the task.")
parser.add_argument("--seed", type=int, default=None, help="Seed used for the environment")
parser.add_argument("--run_num", type=int, default=None, help="Run number for the experiment on the cluster.")
# append RSL-RL cli arguments
cli_args.add_rsl_rl_args(parser)
# append AppLauncher cli args
AppLauncher.add_app_launcher_args(parser)
args_cli = parser.parse_args()

# overwrite args for cluster training
args_cli.headless = True
args_cli.task = "Isaac-Velocity-Flat-Anymal-D-Finetune-v0"
args_cli.load_run = "2024-04-29_15-05-57"
args_cli.logger = "wandb"
run_num = args_cli.run_num

# load cheaper kit config in headless
if args_cli.headless:
    app_experience = f"{os.environ['EXP_PATH']}/omni.isaac.sim.python.gym.headless.kit"
else:
    app_experience = f"{os.environ['EXP_PATH']}/omni.isaac.sim.python.kit"

# launch omniverse app
app_launcher = AppLauncher(args_cli, experience=app_experience)
simulation_app = app_launcher.app

"""Rest everything follows."""

import gymnasium as gym
import os
import torch
import traceback
from datetime import datetime

import carb
from rsl_rl.runners import OnPolicyRunner, MBPOOnPolicyRunner

from omni.isaac.orbit.envs import RLTaskEnvCfg
from omni.isaac.orbit.utils.dict import print_dict
from omni.isaac.orbit.utils.io import dump_pickle, dump_yaml

import omni.isaac.contrib_tasks  # noqa: F401
import omni.isaac.orbit_tasks  # noqa: F401
from omni.isaac.orbit_tasks.utils import get_checkpoint_path, parse_env_cfg
from omni.isaac.orbit_tasks.utils.wrappers.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlVecEnvWrapper

from sweep_config import sweep_config, update_config_from_sweep
import wandb
import time

SWEEP_ID_FILE = "logs/rsl_rl/anymal_d_flat/sweep_id.txt"

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False

def train_sweep():
    # parse configuration
    env_cfg: RLTaskEnvCfg = parse_env_cfg(args_cli.task, use_gpu=not args_cli.cpu, num_envs=args_cli.num_envs)
    agent_cfg: RslRlOnPolicyRunnerCfg = cli_args.parse_rsl_rl_cfg(args_cli.task, args_cli)
    wandb.init(project="orbit", entity=os.environ["WANDB_USERNAME"])
    env_cfg, agent_cfg = update_config_from_sweep(env_cfg, agent_cfg, wandb.config)
    run_experiment(env_cfg, agent_cfg)

def run_sweep():
    # run 0 creates the sweep and publishes its ID; the other cluster jobs attach agents to it
    if run_num == 0:
        sweep_id = wandb.sweep(sweep_config, project="orbit")
        # Save the sweep ID to a shared location
        with open(SWEEP_ID_FILE, "w") as f:
            f.write(sweep_id)
    else:
        # Wait for the sweep ID file to be available
        print("[Wandb] Waiting for sweep ID file")
        while not os.path.exists(SWEEP_ID_FILE):
            time.sleep(1)  # Wait until the file exists
        with open(SWEEP_ID_FILE, "r") as f:
            sweep_id = f.read().strip()
        wandb.agent(sweep_id, function=train_sweep, project="orbit", count=1)

def run_experiment(env_cfg, agent_cfg):
    """Finetune with RSL-RL agent."""

    # specify directory for logging experiments
    log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
    log_root_path = os.path.abspath(log_root_path)
    print(f"[INFO] Logging experiment in directory: {log_root_path}")
    # specify directory for logging runs: {time-stamp}_{run_name}
    log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if agent_cfg.run_name:
        log_dir += f"_{agent_cfg.run_name}"
    log_dir = os.path.join(log_root_path, log_dir)

    # create isaac environment
    env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None)
    # wrap for video recording
    if args_cli.video:
        video_kwargs = {
            "video_folder": os.path.join(log_dir, "videos"),
            "step_trigger": lambda step: step % args_cli.video_interval == 0,
            "video_length": args_cli.video_length,
            "disable_logger": True,
        }
        print("[INFO] Recording videos during finetuning.")
        print_dict(video_kwargs, nesting=4)
        env = gym.wrappers.RecordVideo(env, **video_kwargs)
    # wrap around environment for rsl-rl
    env = RslRlVecEnvWrapper(env)

    # create runner from rsl-rl
    runner = MBPOOnPolicyRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)
    # write git state to logs
    runner.add_git_repo_to_log(__file__)
    # save resume path before creating a new log_dir
    if agent_cfg.resume:
        # get path to previous checkpoint
        resume_path = get_checkpoint_path(log_root_path, agent_cfg.load_run, agent_cfg.load_checkpoint)
        print(f"[INFO]: Loading model checkpoint from: {resume_path}")
        # load previously trained model
        runner.load(resume_path, load_optimizer=False, load_system_dynamics=False)

    # set seed of the environment
    env.seed(agent_cfg.seed)

    # dump the configuration into log-directory
    dump_yaml(os.path.join(log_dir, "params", "env.yaml"), env_cfg)
    dump_yaml(os.path.join(log_dir, "params", "agent.yaml"), agent_cfg)
    dump_pickle(os.path.join(log_dir, "params", "env.pkl"), env_cfg)
    dump_pickle(os.path.join(log_dir, "params", "agent.pkl"), agent_cfg)

    # run finetuning
    runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)

    # close the simulator
    env.close()

if __name__ == "__main__":
    try:
        # run the main execution
        run_sweep()
    except Exception as err:
        carb.log_error(err)
        carb.log_error(traceback.format_exc())
        raise
    finally:
        # close sim app
        simulation_app.close()

System Info

Describe the characteristics of your environment:

Additional context

Mayank has an explanation and a solution: the correct call order should be (1) initialize the sweep agent, (2) create the simulation app, (3) reset/update the simulation.
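A minimal sketch of that ordering (an illustration of the intended call sequence only, not a verified fix; args_cli and sweep_id are the objects from the reproduction script above):

import wandb

from omni.isaac.orbit.app import AppLauncher

def train_sweep():
    # 2. create the simulation app inside the function run by the agent ...
    app_launcher = AppLauncher(args_cli)
    simulation_app = app_launcher.app
    # 3. ... so that environment creation, and hence reset/app.update(), happens afterwards
    ...

# 1. initialize the sweep agent first; it calls train_sweep for each run
wandb.agent(sweep_id, function=train_sweep, project="orbit", count=1)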

Checklist

Acceptance Criteria

Add the criteria for which this task is considered done. If not known at issue creation time, you can add this once the issue is assigned.

AlessioMosca commented 1 month ago

Hi @breadli428, I have run into the same problem. Did you solve it? If so, may I ask how?

Best regards, Alessio

Mayankm96 commented 1 month ago

The current solution is to move the creation of the simulation app inside the train_sweep function; all the imports that depend on a running app then need to be deferred accordingly.

AlessioMosca commented 1 month ago

@Mayankm96 Many thanks. May I ask which imports I should edit? Should I edit the imports inside the "sweep" file, or the imports used inside IsaacLab, such as source/extensions/omni.isaac.lab/omni/isaac/lab/utils/array.py?

Best regards, Alessio

AlessioMosca commented 1 month ago

Hi @Mayankm96, I have tried to modify my code according to your suggestion; however, I am still not able to use wandb. Here is my code:

# Copyright (c) 2022-2024, The Isaac Lab Project Developers.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

"""Script to train RL agent with RSL-RL."""

"""Launch Isaac Sim Simulator first."""

import argparse

from omni.isaac.lab.app import AppLauncher

# local imports
import cli_args  # isort: skip
import os

# add argparse arguments
parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.")
parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.")
parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).")
parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).")
parser.add_argument("--cpu", action="store_true", default=False, help="Use CPU pipeline.")
parser.add_argument(
    "--disable_fabric", action="store_true", default=False, help="Disable fabric and use USD I/O operations."
)
parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.")
parser.add_argument("--task", type=str, default=None, help="Name of the task.")
parser.add_argument("--seed", type=int, default=None, help="Seed used for the environment")
parser.add_argument("--max_iterations", type=int, default=None, help="RL Policy training iterations.")

parser.add_argument("--run_num", type=int, default=None, help="Run number for the experiment on the cluster.")

# append RSL-RL cli arguments
cli_args.add_rsl_rl_args(parser)
# append AppLauncher cli args
AppLauncher.add_app_launcher_args(parser)
args_cli = parser.parse_args()

# # overwrite args for cluster training
# args_cli.headless = True
# args_cli.task = "grace-rough-train"
# # args_cli.load_run = "2024-04-29_15-05-57"
# args_cli.logger = "wandb"
run_num = args_cli.run_num

# load cheaper kit config in headless
if args_cli.headless:
    app_experience = f"{os.environ['EXP_PATH']}/omni.isaac.sim.python.gym.headless.kit"
else:
    app_experience = f"{os.environ['EXP_PATH']}/omni.isaac.sim.python.kit"

# always enable cameras to record video
if args_cli.video:
    args_cli.enable_cameras = True

# import sys
# path_old = set(sys.path)
# app_launcher = AppLauncher(args_cli)
# # simulation_app = app_launcher.app
# path = set(sys.path)

import gymnasium as gym
import os
import torch
from datetime import datetime

import carb
from rsl_rl.runners import OnPolicyRunner

# from omni.isaac.lab.envs import ManagerBasedRLEnvCfg
from omni.isaac.lab.utils.dict import print_dict
from omni.isaac.lab.utils.io import dump_pickle, dump_yaml

# import omni.isaac.lab_tasks  # noqa: F401
# from omni.isaac.lab_tasks.utils import get_checkpoint_path, parse_env_cfg
# from omni.isaac.lab_tasks.utils.wrappers.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlVecEnvWrapper

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False

import wandb
# from sweep_config import sweep_config, update_config_from_sweep  # TODO: figure out how to handle this

import traceback
import time

SWEEP_ID_FILE = "logs/rsl_rl/grace_rough/sweep_id.txt"

sweep_config = {
    "method": "bayes",
    "name": "grace_isaaclab",
    "metric": {"goal": "maximize", "name": "reward"},
    "parameters": {
        "stiffness": {"max": 80, "min": 30},
        "damping": {"max": 7., "min": 0.5},
    },
}

def update_config_from_sweep(env_cfg, agent_cfg, wandb_config):
    # env_cfg.rewards.action_rate_l2.weight =
    env_cfg.scene.robot.actuators["j1"].stiffness['.*'] = wandb_config.stiffness
    return (env_cfg, agent_cfg)

# env_cfg.rewards.action_rate_l2.weight
# env_cfg.scene.robot.actuators["j1"].stiffness['.*']
def train_sweep():
    # parse configuration
    app_launcher = AppLauncher(args_cli)
    simulation_app = app_launcher.app

    from omni.isaac.lab.envs import ManagerBasedRLEnvCfg
    import omni.isaac.lab_tasks  # noqa: F401
    from omni.isaac.lab_tasks.utils import get_checkpoint_path, parse_env_cfg
    from omni.isaac.lab_tasks.utils.wrappers.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlVecEnvWrapper
    # NOTE: these imports are local to train_sweep, but run_experiment below also uses
    # RslRlVecEnvWrapper and get_checkpoint_path; those names must be made visible at
    # module scope (e.g. assigned to globals) or run_experiment will raise a NameError.

    env_cfg: ManagerBasedRLEnvCfg = parse_env_cfg(
        args_cli.task, use_gpu=not args_cli.cpu, num_envs=args_cli.num_envs, use_fabric=not args_cli.disable_fabric
    )
    agent_cfg: RslRlOnPolicyRunnerCfg = cli_args.parse_rsl_rl_cfg(args_cli.task, args_cli)

    wandb.init(project="grace_isaaclab") #, entity=os.environ["WANDB_USERNAME"]) config=sweep_config

    env_cfg, agent_cfg = update_config_from_sweep(env_cfg, agent_cfg, wandb.config)
    run_experiment(env_cfg, agent_cfg)
    simulation_app.close()

def run_sweep():
    if run_num == 0:
        sweep_id = wandb.sweep(sweep_config, project="grace_isaaclab")
        # Save the sweep ID to a shared location
        with open(SWEEP_ID_FILE, "w") as f:
            f.write(sweep_id)
    else:
        # Wait for the sweep ID file to be available
        print("[Wandb] Waiting for sweep ID file")
        while not os.path.exists(SWEEP_ID_FILE):
            time.sleep(1)  # Wait until the file exists
        with open(SWEEP_ID_FILE, "r") as f:
            sweep_id = f.read().strip()
        wandb.agent(sweep_id, function=train_sweep, project="grace_isaaclab", count=1)

def run_experiment(env_cfg, agent_cfg):
    """Train with RSL-RL agent."""
    # parse configuration
    # env_cfg: ManagerBasedRLEnvCfg = parse_env_cfg(
    #     args_cli.task, use_gpu=not args_cli.cpu, num_envs=args_cli.num_envs, use_fabric=not args_cli.disable_fabric
    # )
    # agent_cfg: RslRlOnPolicyRunnerCfg = cli_args.parse_rsl_rl_cfg(args_cli.task, args_cli)

    # specify directory for logging experiments
    log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
    log_root_path = os.path.abspath(log_root_path)
    print(f"[INFO] Logging experiment in directory: {log_root_path}")
    # specify directory for logging runs: {time-stamp}_{run_name}
    log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if agent_cfg.run_name:
        log_dir += f"_{agent_cfg.run_name}"
    log_dir = os.path.join(log_root_path, log_dir)

    # max iterations for training
    if args_cli.max_iterations:
        agent_cfg.max_iterations = args_cli.max_iterations

    # create isaac environment
    env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None)
    # wrap for video recording
    if args_cli.video:
        video_kwargs = {
            "video_folder": os.path.join(log_dir, "videos"),
            "step_trigger": lambda step: step % args_cli.video_interval == 0,
            "video_length": args_cli.video_length,
            "disable_logger": True,
        }
        print("[INFO] Recording videos during training.")
        print_dict(video_kwargs, nesting=4)
        env = gym.wrappers.RecordVideo(env, **video_kwargs)
    # wrap around environment for rsl-rl
    env = RslRlVecEnvWrapper(env)

    # create runner from rsl-rl
    runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)
    # write git state to logs
    runner.add_git_repo_to_log(__file__)
    # save resume path before creating a new log_dir
    if agent_cfg.resume:
        # get path to previous checkpoint
        resume_path = get_checkpoint_path(log_root_path, agent_cfg.load_run, agent_cfg.load_checkpoint)
        print(f"[INFO]: Loading model checkpoint from: {resume_path}")
        # load previously trained model
        runner.load(resume_path)

    # set seed of the environment
    env.seed(agent_cfg.seed)

    # dump the configuration into log-directory
    dump_yaml(os.path.join(log_dir, "params", "env.yaml"), env_cfg)
    dump_yaml(os.path.join(log_dir, "params", "agent.yaml"), agent_cfg)
    dump_pickle(os.path.join(log_dir, "params", "env.pkl"), env_cfg)
    dump_pickle(os.path.join(log_dir, "params", "agent.pkl"), agent_cfg)

    # run training
    runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)

    # close the simulator
    env.close()

if __name__ == "__main__":
    try:
        # run the main execution
        run_sweep()
    except Exception as err:
        carb.log_error(err)
        carb.log_error(traceback.format_exc())
        raise
    finally:
        print("fine")
        # close sim app
        # simulation_app.close()

The error I get is the following:

Traceback (most recent call last):
  File "/home/lab/anaconda3/envs/isaaclab/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 307, in _run_job
    self._function()
  File "/home/lab/IsaacLab/source/standalone/workflows/rsl_rl/train_sweep.py", line 116, in train_sweep
    app_launcher = AppLauncher(args_cli)
  File "/home/lab/IsaacLab/source/extensions/omni.isaac.lab/omni/isaac/lab/app/app_launcher.py", line 113, in __init__
    self._create_app()
  File "/home/lab/IsaacLab/source/extensions/omni.isaac.lab/omni/isaac/lab/app/app_launcher.py", line 532, in _create_app
    self._app = SimulationApp(self._sim_app_config, experience=self._sim_experience_file)
  File "/home/lab/anaconda3/envs/isaaclab/lib/python3.10/site-packages/isaacsim/exts/omni.isaac.kit/omni/isaac/kit/simulation_app.py", line 208, in __init__
    signal.signal(signal.SIGINT, signal_handler)
  File "/home/lab/anaconda3/envs/isaaclab/lib/python3.10/signal.py", line 56, in signal
    handler = _signal.signal(_enum_to_int(signalnum), _enum_to_int(handler))
ValueError: signal only works in main thread of the main interpreter

The wandb agent then echoes the same traceback, prefixed with wandb: ERROR Run 6jiqul36 errored:.
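The root cause is visible in the traceback: wandb's pyagent runs the sweep function in a worker thread, while CPython only allows installing signal handlers from the main thread, and SimulationApp.__init__ installs a SIGINT handler unconditionally. The constraint can be reproduced without Isaac Sim at all (a minimal sketch):

import signal
import threading

def install_handler():
    # SimulationApp.__init__ performs an equivalent call during construction
    signal.signal(signal.SIGINT, signal.default_int_handler)

t = threading.Thread(target=install_handler)
t.start()
t.join()
# the worker thread dies with:
# ValueError: signal only works in main thread of the main interpreter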

May I ask if you have any suggestions?

Best regards, Alessio

AlessioMosca commented 2 weeks ago

Hi @Mayankm96,

I attempted to resolve the issue with signal handling by commenting out lines 200 to 209 in the file:

/home/<USER>/anaconda3/envs/isaaclab/lib/python3.10/site-packages/isaacsim/exts/omni.isaac.kit/omni/isaac/kit/simulation_app.py

Here is the code section I commented out:

def signal_handler(signal, frame):
    # disable logging warnings as we are going to terminate the process
    _logging = carb.logging.acquire_logging()
    _logging.set_level_threshold(carb.logging.LEVEL_FATAL)
    self._framework.unload_all_plugins()
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)

By doing this, I was able to start the first sweep with Weights & Biases (wandb). However, after the first simulation, when the run_experiment() function ends, the call to simulation_app.close() causes the entire program to exit.
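A less invasive variant of the same patch, as a hedged aside: rather than deleting the handler, guard its installation so that SIGINT handling is preserved whenever the app is created from the main thread (this still requires editing simulation_app.py):

import threading

# only the main thread may install signal handlers
if threading.current_thread() is threading.main_thread():
    signal.signal(signal.SIGINT, signal_handler)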

The function called by the wandb agent is as follows:

def train_sweep():

    app_launcher = AppLauncher(args_cli)
    simulation_app = app_launcher.app

    from omni.isaac.lab.envs import ManagerBasedRLEnvCfg
    from omni.isaac.lab.utils.dict import print_dict
    from omni.isaac.lab.utils.io import dump_pickle, dump_yaml

    import omni.isaac.lab_tasks  # noqa: F401
    from omni.isaac.lab_tasks.utils import get_checkpoint_path, parse_env_cfg
    from omni.isaac.lab_tasks.utils.wrappers.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlVecEnvWrapper

    env_cfg: ManagerBasedRLEnvCfg = parse_env_cfg(
        args_cli.task, use_gpu=not args_cli.cpu, num_envs=args_cli.num_envs, use_fabric=not args_cli.disable_fabric
    )
    agent_cfg: RslRlOnPolicyRunnerCfg = cli_args.parse_rsl_rl_cfg(args_cli.task, args_cli)

    wandb.init(project=args_cli.log_project_name, config=sweep_config)  # entity=os.environ["WANDB_USERNAME"] omitted

    env_cfg, agent_cfg = update_config_from_sweep(env_cfg, agent_cfg, wandb.config)

    run_experiment(env_cfg, agent_cfg)

    simulation_app.close()

I believe that the reason the entire program exits is the call to self._framework.unload_all_plugins() at line 546 in the aforementioned file (where the close() method is defined).

The output printed in the terminal is:

[102.334s] Simulation App Shutting Down

Process finished with exit code 0

It seems that the agent is killed whenever simulation_app.close() is called.
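One hedged mitigation, untested: since count=1 means each agent process handles a single run anyway, calling wandb.finish() before simulation_app.close() should at least flush logs and mark the run as finished before the Kit teardown terminates the process:

def train_sweep():
    ...
    run_experiment(env_cfg, agent_cfg)
    wandb.finish()          # flush logs and mark the run complete first
    simulation_app.close()  # Kit teardown (unload_all_plugins) ends the process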

May I ask if you have a solution?

Best regards, Alessio