ray-project / ray

Ray is an AI compute engine. Ray consists of a core distributed runtime and a set of AI Libraries for accelerating ML workloads.
https://ray.io
Apache License 2.0
34k stars 5.78k forks source link

Permission denied error when calling ray.init() #3899

Closed robertnishihara closed 5 years ago

robertnishihara commented 5 years ago

Someone ran into the following error today when calling ray.init().

The issue was that Ray was attempting to log to /tmp/ray and the user was on a shared machine in which a different user owned /tmp/ray. The solution was to call

ray.init(temp_dir='/tmp/something_else')

to force ray to log somewhere else (to a directory that the user could create).

---------------------------------------------------------------------------
PermissionError                           Traceback (most recent call last)
<ipython-input-2-3f68a533b944> in <module>
----> 1 ray.init()

/data/nileshtrip/miniconda3/lib/python3.6/site-packages/ray/worker.py in init(redis_address, num_cpus, num_gpus, resources, object_store_memory, redis_max_memory, node_ip_address, object_id_seed, num_workers, local_mode, driver_mode, redirect_worker_output, redirect_output, ignore_reinit_error, num_redis_shards, redis_max_clients, redis_password, plasma_directory, huge_pages, include_webui, driver_id, configure_logging, logging_level, logging_format, plasma_store_socket_name, raylet_socket_name, temp_dir, _internal_config, use_raylet)
   1452         global _global_node
   1453         _global_node = ray.node.Node(
-> 1454             head=True, shutdown_at_exit=False, ray_params=ray_params)
   1455         address_info["redis_address"] = _global_node.redis_address
   1456         address_info[

/data/nileshtrip/miniconda3/lib/python3.6/site-packages/ray/node.py in __init__(self, ray_params, head, shutdown_at_exit)
     84         self._webui_url = None
     85 
---> 86         self.start_ray_processes()
     87 
     88         if shutdown_at_exit:

/data/nileshtrip/miniconda3/lib/python3.6/site-packages/ray/node.py in start_ray_processes(self)
    272         logger.info(
    273             "Process STDOUT and STDERR is being redirected to {}.".format(
--> 274                 get_logs_dir_path()))
    275 
    276         # If this is the head node, start the relevant head node processes.

/data/nileshtrip/miniconda3/lib/python3.6/site-packages/ray/tempfile_services.py in get_logs_dir_path()
    104 def get_logs_dir_path():
    105     """Get a temp dir for logging."""
--> 106     logs_dir = os.path.join(get_temp_root(), "logs")
    107     try_to_create_directory(logs_dir)
    108     return logs_dir

/data/nileshtrip/miniconda3/lib/python3.6/site-packages/ray/tempfile_services.py in get_temp_root()
     92                 pid=os.getpid(), date_str=date_str),
     93             directory_name="/tmp/ray")
---> 94     try_to_create_directory(_temp_root)
     95     return _temp_root
     96 

/data/nileshtrip/miniconda3/lib/python3.6/site-packages/ray/tempfile_services.py in try_to_create_directory(directory_path)
     59         except OSError as e:
     60             if e.errno != os.errno.EEXIST:
---> 61                 raise e
     62             logger.warning(
     63                 "Attempted to create '{}', but the directory already "

/data/nileshtrip/miniconda3/lib/python3.6/site-packages/ray/tempfile_services.py in try_to_create_directory(directory_path)
     56     if not os.path.exists(directory_path):
     57         try:
---> 58             os.makedirs(directory_path)
     59         except OSError as e:
     60             if e.errno != os.errno.EEXIST:

/data/nileshtrip/miniconda3/lib/python3.6/os.py in makedirs(name, mode, exist_ok)
    218             return
    219     try:
--> 220         mkdir(name, mode)
    221     except OSError:
    222         # Cannot rely on checking for EEXIST, since the operating system

PermissionError: [Errno 13] Permission denied: '/tmp/ray/session_2019-01-29_16-18-38_28339'
robertnishihara commented 5 years ago

I posted this in case anyone runs into the same error. Closing because the workaround is stated above.

iglimanaj commented 3 years ago

For the latest version of ray you need to pass the temp_dir argument like the following: ray.init(_temp_dir='/tmp/something_else')

jdwillard19 commented 2 years ago

I get the same error when using ray.init(_temp_dir='./tmp/ray')

more specifically this was executed on host node, ray start --head --port=6379

Then this on worker node ray.init(address='128.55.68.181:6379', _redis_password='5241590000000000',_temp_dir='~tmp/ray')

Same error when using the CLI instead of Python: ray start --address='128.55.68.181:6379' --redis-password='5241590000000000' --temp-dir='~tmp/ray'

stack trace below

jwillard@perlmutter:nid001629:~> ray start --address='128.55.68.181:6379' --redis-password='5241590000000000' --temp-dir='~tmp/ray'
Local node IP: 128.55.69.198
Traceback (most recent call last):
  File "/global/homes/j/jwillard/miniconda3/envs/stml/bin/ray", line 8, in <module>
    sys.exit(main())
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/ray/scripts/scripts.py", line 1938, in main
    return cli()
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/click/core.py", line 1128, in __call__
    return self.main(*args, **kwargs)
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/click/core.py", line 1053, in main
    rv = self.invoke(ctx)
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/click/core.py", line 1659, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/click/core.py", line 1395, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/click/core.py", line 754, in invoke
    return __callback(*args, **kwargs)
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/ray/autoscaler/_private/cli_logger.py", line 808, in wrapper
    return f(*args, **kwargs)
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/ray/scripts/scripts.py", line 724, in start
    node = ray.node.Node(
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/ray/node.py", line 151, in __init__
    self._init_temp()
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/ray/node.py", line 280, in _init_temp
    try_to_create_directory(self._temp_dir)
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/site-packages/ray/_private/utils.py", line 835, in try_to_create_directory
    os.makedirs(directory_path, exist_ok=True)
  File "/global/homes/j/jwillard/miniconda3/envs/stml/lib/python3.9/os.py", line 225, in makedirs
    mkdir(name, mode)
PermissionError: [Errno 13] Permission denied: '/tmp/ray'

xbinglzh commented 2 years ago

ray.init(address="auto", _temp_dir='/tmp/s1')

This also produces an error:

$ /opt/python38/bin/python3 ray-demo1.py
Traceback (most recent call last):
  File "ray-demo1.py", line 12, in <module>
    ray.init(address="auto", _temp_dir='/tmp/s1')
  File "/opt/python38/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/opt/python38/lib/python3.8/site-packages/ray/worker.py", line 1067, in init
    _global_node = ray.node.Node(
  File "/opt/python38/lib/python3.8/site-packages/ray/node.py", line 243, in __init__
    self.metrics_agent_port = self._get_cached_port(
  File "/opt/python38/lib/python3.8/site-packages/ray/node.py", line 800, in _get_cached_port
    with FileLock(file_path + ".lock"):
  File "/opt/python38/lib/python3.8/site-packages/filelock/_api.py", line 210, in __enter__
    self.acquire()
  File "/opt/python38/lib/python3.8/site-packages/filelock/_api.py", line 166, in acquire
    self._acquire()
  File "/opt/python38/lib/python3.8/site-packages/filelock/_unix.py", line 29, in _acquire
    fd = os.open(self._lock_file, open_mode)
PermissionError: [Errno 13] Permission denied: '/tmp/ray/session_2022-07-14_14-48-23_952236_116205/ports_by_node.json.lock'

ARDivekar commented 2 years ago

This issue should be reopened, I am also getting this issue with ray 1.13.0.

The param _temp_dir is ignored when calling ray.init() from an Amazon EC2 machine.

Setup:

Error: PermissionError: [Errno 13] Permission denied: '/tmp/ray/session_2022-08-20_09-00-51_347113_37451/ports_by_node.json.lock'

Clearly this is ignoring _temp_dir

ARDivekar commented 2 years ago

I can get around this by using chmod 777 -R /tmp/ray/* but I feel like I really should not have to.

ARDivekar commented 2 years ago

Pinging @robertnishihara , hopefully this issue can be reopened since users are still experiencing it?

vincentliuheyang commented 2 years ago

I can get around this by using chmod 777 -R /tmp/ray/* but I feel like I really should not have to.

I got the same issue. The same script raises a permission error saying the files under the ray folder do not have the right permissions, while other deployed scripts work just fine.

robertnishihara commented 2 years ago

@ARDivekar can you try with Ray 2.0? I just tried locally (on a Macbook) but wasn't able to reproduce the issue.

Break00 commented 2 years ago

@ARDivekar can you try with Ray 2.0? I just tried locally (on a Macbook) but wasn't able to reproduce the issue.

I got the same issue with Ray 2.0.0

Code ray.init(address='auto')

Error

File /opt/tljh/user/lib/python3.9/site-packages/ray/_private/client_mode_hook.py:105, in client_mode_hook.<locals>.wrapper(*args, **kwargs) 103 if func.__name__ != "init" or is_client_mode_enabled_by_default: 104 return getattr(ray, func.__name__)(*args, **kwargs) --> 105 return func(*args, **kwargs)

File /opt/tljh/user/lib/python3.9/site-packages/ray/_private/worker.py:1475, in init(address, num_cpus, num_gpus, resources, object_store_memory, local_mode, ignore_reinit_error, include_dashboard, dashboard_host, dashboard_port, job_config, configure_logging, logging_level, logging_format, log_to_driver, namespace, runtime_env, storage, **kwargs) 1462 ray_params = ray._private.parameter.RayParams( 1463 node_ip_address=node_ip_address, 1464 raylet_ip_address=raylet_ip_address, (...) 1472 metrics_export_port=_metrics_export_port, 1473 ) 1474 try: -> 1475 _global_node = ray._private.node.Node( 1476 ray_params, 1477 head=False, 1478 shutdown_at_exit=False, 1479 spawn_reaper=False, 1480 connect_only=True, 1481 ) 1482 except ConnectionError: 1483 if gcs_address == ray._private.utils.read_ray_address(_temp_dir):

File /opt/tljh/user/lib/python3.9/site-packages/ray/_private/node.py:244, in Node.__init__(self, ray_params, head, shutdown_at_exit, spawn_reaper, connect_only) 237 self._plasma_store_socket_name = self._prepare_socket_file( 238 self._ray_params.plasma_store_socket_name, default_prefix="plasma_store" 239 ) 240 self._raylet_socket_name = self._prepare_socket_file( 241 self._ray_params.raylet_socket_name, default_prefix="raylet" 242 ) --> 244 self.metrics_agent_port = self._get_cached_port( 245 "metrics_agent_port", default_port=ray_params.metrics_agent_port 246 ) 247 self._metrics_export_port = self._get_cached_port( 248 "metrics_export_port", default_port=ray_params.metrics_export_port 249 ) 251 ray_params.update_if_absent( 252 metrics_agent_port=self.metrics_agent_port, 253 metrics_export_port=self._metrics_export_port, 254 )

File /opt/tljh/user/lib/python3.9/site-packages/ray/_private/node.py:801, in Node._get_cached_port(self, port_name, default_port) 798 # Maps a Node.unique_id to a dict that maps port names to port numbers. 799 ports_by_node: Dict[str, Dict[str, int]] = defaultdict(dict) --> 801 with FileLock(file_path + ".lock"): 802 if not os.path.exists(file_path): 803 with open(file_path, "w") as f:

File /opt/tljh/user/lib/python3.9/site-packages/filelock/_api.py:220, in BaseFileLock.__enter__(self) 214 def __enter__(self) -> BaseFileLock: 215 """ 216 Acquire the lock. 217 218 :return: the lock object 219 """ --> 220 self.acquire() 221 return self

File /opt/tljh/user/lib/python3.9/site-packages/filelock/_api.py:173, in BaseFileLock.acquire(self, timeout, poll_interval, poll_intervall, blocking) 171 if not self.is_locked: 172 _LOGGER.debug("Attempting to acquire lock %s on %s", lock_id, lock_filename) --> 173 self._acquire() 175 if self.is_locked: 176 _LOGGER.debug("Lock %s acquired on %s", lock_id, lock_filename)

File /opt/tljh/user/lib/python3.9/site-packages/filelock/_unix.py:35, in UnixFileLock._acquire(self) 33 def _acquire(self) -> None: 34 open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC ---> 35 fd = os.open(self._lock_file, open_mode) 36 try: 37 fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)

PermissionError: [Errno 13] Permission denied: '/tmp/ray/session_2022-10-17_23-56-04_168517_260622/ports_by_node.json.lock'

JalinWang commented 1 year ago

@robertnishihara hi~ Is there any way to specify tmp_dir for worker node by CLI? The doc says:

--temp-dir manually specify the root temporary dir of the Ray process, only works when --head is specified

I already specify it for the head node; however, the worker node ignores it.

  ray start --head --node-ip-address="$head_node_ip" \
      --port=$port \
      --include-dashboard=true --dashboard-port=$dashboard_port --dashboard-host=0.0.0.0  \
      --ray-client-server-port=$client_server_port \
      --temp-dir="/tmp/ray_tmp" 

ray start --address "$ip_head" \
        --temp-dir="/tmp/ray_tmp"  # OMITTED!!!  I got `PermissionError: [Errno 13] Permission denied: '/tmp/ray/ray_current_cluster'`

By default, the permission of ray_current_cluster is private to the creator. image