Ray is a unified framework for scaling AI and Python applications. Ray consists of a core distributed runtime and a set of AI Libraries for accelerating ML workloads.
I am new to Ray and want to launch hyperparameter tuning on our SLURM cluster. However, I run into the following error and I don't know how to fix it. It would be super nice if you could help me, since I cannot resume my work otherwise.
srun: job 50448585 queued and waiting for resources
srun: job 50448585 has been allocated resources
IP Head: 10.205.179.189:6379
STARTING HEAD at
srun: job 50448588 queued and waiting for resources
srun: job 50448588 has been allocated resources
Traceback (most recent call last):
File "/cluster/home/malmansto/IVF_final/Code_Marius/code/test4.py", line 80, in
main(True)
File "/cluster/home/malmansto/IVF_final/Code_Marius/code/test4.py", line 42, in main
ray.init(address="auto")
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/worker.py", line 1537, in init
bootstrap_address = services.canonicalize_bootstrap_address(address, _temp_dir)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/services.py", line 530, in canonicalize_bootstrap_address
addr = get_ray_address_from_environment(addr, temp_dir)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/services.py", line 423, in get_ray_address_from_environment
raise ConnectionError(
ConnectionError: Could not find any running Ray instance. Please specify the one to connect to by setting --address flag or RAY_ADDRESS environment variable.
(base) [user@eu-login-04 code]$ 2024-03-07 10:11:30,722 INFO usage_lib.py:449 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add --disable-usage-stats to the command that starts the cluster, or run the following command: ray disable-usage-stats before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.
2024-03-07 10:11:30,722 INFO scripts.py:744 -- Local node IP: 10.205.179.189
Traceback (most recent call last):
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/bin/ray", line 8, in
sys.exit(main())
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/scripts/scripts.py", line 2498, in main
return cli()
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 1128, in call
return self.main(*args, **kwargs)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 1053, in main
rv = self.invoke(ctx)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 1659, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 1395, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 754, in invoke
return __callback(*args, **kwargs)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/autoscaler/_private/cli_logger.py", line 856, in wrapper
return f(*args, **kwargs)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/scripts/scripts.py", line 771, in start
node = ray._private.node.Node(
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/node.py", line 306, in init
self.start_head_processes()
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/node.py", line 1310, in start_head_processes
assert self.get_gcs_client() is not None
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/node.py", line 677, in get_gcs_client
self._init_gcs_client()
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/node.py", line 722, in _init_gcs_client
raise RuntimeError(
RuntimeError: Failed to start GCS. Last 0 lines of error files:
Versions / Dependencies
ray 2.9.3
python 3.10.3
Reproduction script
from __future__ import annotations
import os
from typing import Any
import click
import optuna
import ray
from ray import tune
from ray.air import CheckpointConfig, RunConfig
from ray.air.integrations.wandb import WandbLoggerCallback
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from ray.util.joblib import register_ray
class Trainable(tune.Trainable):
    """Toy trainable whose objective is simply a + b (to be minimized)."""

    def setup(self, config):
        # Cache the two hyperparameters sampled for this trial.
        self.a = config["a"]
        self.b = config["b"]

    def step(self):
        # Report the objective; Tune treats "score" as the metric to minimize.
        return {"score": self.a + self.b}

    def save_checkpoint(self, checkpoint_dir):
        # Nothing to persist for this toy problem; hand the directory back.
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_path):
        # No state to restore.
        pass
def suggest_config(trial: optuna.Trial) -> dict[str, Any]:
    """Sample one hyperparameter configuration ("a" and "b") for a trial."""
    return {
        "a": trial.suggest_float("a", -10, 10),
        "b": trial.suggest_float("b", -10, 10),
    }
def main(do_tune=False):
    """Connect to the running Ray cluster and launch the a+b toy experiment.

    Args:
        do_tune: When True, run an Optuna-driven hyperparameter search with
            ASHA early stopping; otherwise run a single fixed-config trial.
    """
    study_name = "ray-tune-minimize-a-plus-b"
    # Attach to the cluster started by `ray start` in the batch script.
    # NOTE(review): "auto" requires a head node to already be reachable from
    # this host; if the head failed to start, this raises ConnectionError.
    ray.init(address="auto")
    register_ray()
    run_config = RunConfig(
        callbacks=[
            WandbLoggerCallback(api_key_file="~/.wandb_api_key", project=study_name),
        ],
        sync_config=ray.train.SyncConfig(),
        # Stop each trial after 5 reported iterations.
        stop={"training_iteration": 5},
        checkpoint_config=CheckpointConfig(checkpoint_at_end=True),
        name=study_name,
    )
    if do_tune:
        optuna_search = OptunaSearch(
            suggest_config,
            metric="score",
            mode="min",
        )
        tuner = tune.Tuner(
            Trainable,
            tune_config=tune.TuneConfig(
                scheduler=ASHAScheduler(metric="score", mode="min"),
                # BUG FIX: cluster_resources() reports the CPU count as a
                # float; TuneConfig.num_samples expects an int, so cast it.
                num_samples=int(ray.cluster_resources()["CPU"]),
                search_alg=optuna_search,
            ),
            run_config=run_config,
        )
        tuner.fit()
    else:
        # Single trial with a fixed configuration (no search).
        tuner = tune.Tuner(
            Trainable,
            param_space={"a": 0, "b": 0},
            run_config=run_config,
        )
        tuner.fit()
if __name__ == "__main__":
    # Guard the entry point so importing this module never launches a run.
    main(True)
Batch script:
#!/bin/bash
# shellcheck disable=SC2206
# THIS FILE IS GENERATED BY AUTOMATION SCRIPT! PLEASE REFER TO ORIGINAL SCRIPT!
# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE RUNNABLE!
#SBATCH --job-name=Hyperparametertuning_0306-2128
#SBATCH --output=Hyperparametertuning_0306-2128.log
### This script works for any number of nodes, Ray will find and manage all resources
#SBATCH --nodes=2
#SBATCH --exclusive
# nodelist=$(awk '{print $1}' nodes.txt | paste -sd, -)
# #SBATCH --nodelist=$nodelist
### Give all resources to a single Ray task, ray can manage the resources internally
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-task=0
# Load modules or your own conda environment here
# module load pytorch/v1.4.0-gpu
. ~/miniconda/etc/profile.d/conda.sh
conda activate IVF_conda
# ===== DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING =====
# Expand the SLURM nodelist into individual hostnames; the first entry
# becomes the Ray head node, the rest become workers.
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names
nodes_array=($nodes)
node_1=${nodes_array[0]}
# Resolve the head node's IP by running `hostname --ip-address` on it.
# NOTE(review): on some systems this prints several addresses separated by
# spaces — the block below picks one; verify it selects a reachable IPv4.
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # making redis-address
# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<< "$ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
ip=${ADDR[1]}
else
ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $ip"
fi
# GCS/bootstrap port for the Ray head; workers and the driver connect here.
port=6379
ip_head=$ip:$port
export ip_head
echo "IP Head: $ip_head"
ray_executable_path='/cluster/home/malmansto/miniconda/envs/IVF_conda/bin/ray'
echo "STARTING HEAD at $node_1"
# Start the head in the background (&) with --block so the srun step stays
# alive for the duration of the job; the env is re-activated inside bash -c
# because srun does not inherit the conda activation from this shell.
srun --nodes=1 --ntasks=1 -w "$node_1" \
bash -c "source ~/miniconda/etc/profile.d/conda.sh; conda activate IVF_conda; $ray_executable_path start --head --node-ip-address=\"$ip\" --port=$port --block" &
# Give the head time to bring up GCS before workers/driver try to connect.
sleep 30
worker_num=$((SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node
# Launch one Ray worker per remaining allocated node, each attached to the head.
for ((i = 1; i <= worker_num; i++)); do
node_i=${nodes_array[$i]}
echo "STARTING WORKER $i at $node_i"
srun --nodes=1 --ntasks=1 -w "$node_i" bash -c "source ~/miniconda/etc/profile.d/conda.sh; conda activate IVF_conda; $ray_executable_path start --address \"$ip_head\" --block" &
sleep 5
done
# ===== Call your code below =====
# Run the driver script; it calls ray.init(address="auto") and therefore
# requires the head started above to be up and reachable.
~/miniconda/envs/IVF_conda/bin/python test4.py
What happened + What you expected to happen
Hello,
I am new to Ray and want to launch hyperparameter tuning on our SLURM cluster. However, I run into the following error and I don't know how to fix it. It would be super nice if you could help me, since I cannot resume my work otherwise.
srun: job 50448585 queued and waiting for resources srun: job 50448585 has been allocated resources IP Head: 10.205.179.189:6379 STARTING HEAD at srun: job 50448588 queued and waiting for resources srun: job 50448588 has been allocated resources Traceback (most recent call last): File "/cluster/home/malmansto/IVF_final/Code_Marius/code/test4.py", line 80, in
main(True)
File "/cluster/home/malmansto/IVF_final/Code_Marius/code/test4.py", line 42, in main
ray.init(address="auto")
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/worker.py", line 1537, in init
bootstrap_address = services.canonicalize_bootstrap_address(address, _temp_dir)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/services.py", line 530, in canonicalize_bootstrap_address
addr = get_ray_address_from_environment(addr, temp_dir)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/services.py", line 423, in get_ray_address_from_environment
raise ConnectionError(
ConnectionError: Could not find any running Ray instance. Please specify the one to connect to by setting
sys.exit(main())
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/scripts/scripts.py", line 2498, in main
return cli()
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 1128, in call
return self.main(*args, **kwargs)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 1053, in main
rv = self.invoke(ctx)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 1659, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 1395, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/click/core.py", line 754, in invoke
return __callback(*args, **kwargs)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/autoscaler/_private/cli_logger.py", line 856, in wrapper
return f(*args, **kwargs)
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/scripts/scripts.py", line 771, in start
node = ray._private.node.Node(
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/node.py", line 306, in init
self.start_head_processes()
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/node.py", line 1310, in start_head_processes
assert self.get_gcs_client() is not None
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/node.py", line 677, in get_gcs_client
self._init_gcs_client()
File "/cluster/home/malmansto/miniconda/envs/IVF_conda/lib/python3.10/site-packages/ray/_private/node.py", line 722, in _init_gcs_client
raise RuntimeError(
RuntimeError: Failed to start GCS. Last 0 lines of error files:
--address
flag or `RAY_ADDRESS`
environment variable. (base) [user@eu-login-04 code]$ 2024-03-07 10:11:30,722 INFO usage_lib.py:449 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add --disable-usage-stats
to the command that starts the cluster, or run the following command: ray disable-usage-stats
before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details. 2024-03-07 10:11:30,722 INFO scripts.py:744 -- Local node IP: 10.205.179.189 Traceback (most recent call last): File "/cluster/home/malmansto/miniconda/envs/IVF_conda/bin/ray", line 8, inVersions / Dependencies
ray 2.9.3
python 3.10.3
Reproduction script
Batch script:
Issue Severity
High: It blocks me from completing my task.