dask / dask-gateway

A multi-tenant server for securely deploying and managing Dask clusters.
https://gateway.dask.org/
BSD 3-Clause "New" or "Revised" License
135 stars 87 forks source link

SlurmClusterConfig should have option for shebang #832

Open pandalanax opened 3 weeks ago

pandalanax commented 3 weeks ago

Describe the issue: I am trying to spawn a cluster via dask-gateway with the following config:

# Configure the gateway to use slurm
c.Proxy.address = '0.0.0.0:8000'
c.Proxy.tcp_address = '0.0.0.0:8001'

c.DaskGateway.backend_class = (
            "dask_gateway_server.backends.jobqueue.slurm.SlurmBackend"
            )
#c.SlurmClusterConfig.scheduler_cmd = "~/miniconda/bin/dask-scheduler"
#c.SlurmClusterConfig.worker_cmd = "~/miniconda/bin/dask-worker"
c.SlurmClusterConfig.partition = "heavy"
c.SlurmClusterConfig.staging_directory = '/tmp/.dask-gateway/'

from dask_gateway_server.options import Options, String

c.JobQueueClusterConfig.scheduler_setup = 'source /opt/dask/MyEnv/bin/activate'
c.JobQueueClusterConfig.worker_setup = 'source /opt/dask/MyEnv/bin/activate'
c.SlurmClusterConfig.scheduler_cmd = "/opt/dask/MyEnv/bin/dask-scheduler"
c.SlurmClusterConfig.worker_cmd = "/opt/dask/MyEnv/bin/dask-worker"
#resource limits for a worker
c.JobQueueClusterConfig.worker_memory = '100G'
c.JobQueueClusterConfig.worker_cores = 16

Slurm accepts the submission and tries to execute the job, which fails with:

---------------------------------------------------------------------------
GatewayClusterError                       Traceback (most recent call last)
Cell In[176], line 1
----> 1 cluster = gateway.new_cluster(options)

File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:641, in Gateway.new_cluster(self, cluster_options, shutdown_on_close, **kwargs)
    618 def new_cluster(self, cluster_options=None, shutdown_on_close=True, **kwargs):
    619     """Submit a new cluster to the gateway, and wait for it to be started.
    620 
    621     Same as calling ``submit`` and ``connect`` in one go.
   (...)
    639     cluster : GatewayCluster
    640     """
--> 641     return GatewayCluster(
    642         address=self.address,
    643         proxy_address=self.proxy_address,
    644         public_address=self._public_address,
    645         auth=self.auth,
    646         asynchronous=self.asynchronous,
    647         loop=self.loop,
    648         cluster_options=cluster_options,
    649         shutdown_on_close=shutdown_on_close,
    650         **kwargs,
    651     )

File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:816, in GatewayCluster.__init__(self, address, proxy_address, public_address, auth, cluster_options, shutdown_on_close, asynchronous, loop, **kwargs)
    804 def __init__(
    805     self,
    806     address=None,
   (...)
    814     **kwargs,
    815 ):
--> 816     self._init_internal(
    817         address=address,
    818         proxy_address=proxy_address,
    819         public_address=public_address,
    820         auth=auth,
    821         cluster_options=cluster_options,
    822         cluster_kwargs=kwargs,
    823         shutdown_on_close=shutdown_on_close,
    824         asynchronous=asynchronous,
    825         loop=loop,
    826     )

File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:921, in GatewayCluster._init_internal(self, address, proxy_address, public_address, auth, cluster_options, cluster_kwargs, shutdown_on_close, asynchronous, loop, name)
    919     self.status = "starting"
    920 if not self.asynchronous:
--> 921     self.gateway.sync(self._start_internal)

File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:344, in Gateway.sync(self, func, *args, **kwargs)
    340 future = asyncio.run_coroutine_threadsafe(
    341     func(*args, **kwargs), self.loop.asyncio_loop
    342 )
    343 try:
--> 344     return future.result()
    345 except BaseException:
    346     future.cancel()

File /opt/conda/lib/python3.10/concurrent/futures/_base.py:458, in Future.result(self, timeout)
    456     raise CancelledError()
    457 elif self._state == FINISHED:
--> 458     return self.__get_result()
    459 else:
    460     raise TimeoutError()

File /opt/conda/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self)
    401 if self._exception:
    402     try:
--> 403         raise self._exception
    404     finally:
    405         # Break a reference cycle with the exception in self._exception
    406         self = None

File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:935, in GatewayCluster._start_internal(self)
    933     self._start_task = asyncio.ensure_future(self._start_async())
    934 try:
--> 935     await self._start_task
    936 except BaseException:
    937     # On exception, cleanup
    938     await self._stop_internal()

File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:953, in GatewayCluster._start_async(self)
    951 # Connect to cluster
    952 try:
--> 953     report = await self.gateway._wait_for_start(self.name)
    954 except GatewayClusterError:
    955     raise

File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:581, in Gateway._wait_for_start(self, cluster_name)
    579     return report
    580 elif report.status is ClusterStatus.FAILED:
--> 581     raise GatewayClusterError(
    582         "Cluster %r failed to start, see logs for "
    583         "more information" % cluster_name
    584     )
    585 elif report.status is ClusterStatus.STOPPED:
    586     raise GatewayClusterError(
    587         "Cluster %r is already stopped" % cluster_name
    588     )

GatewayClusterError: Cluster '904e0c326cd5484cacf9d343bd1c5226' failed to start, see logs for more information

The logs from that submission contain:

/opt/slurm/slurmd/job00068/slurm_script: 3: source: not found
/opt/slurm/slurmd/job00068/slurm_script: 5: dask-scheduler: not found

And if I check the script that the gateway put together with

$ scontrol write batch_script 68
$ cat slurm-68.sh

#!/bin/sh
source /opt/dask/MyEnv/bin/activate
/opt/dask/MyEnv/bin/dask-scheduler --protocol tls --port 0 --host 0.0.0.0 --dashboard-address 0.0.0.0:0 --preload dask_gateway.scheduler_preload --dg-api-address 0.0.0.0:0 --dg-heartbeat-period 15 --dg-adaptive-period 3.0 --dg-idle-timeout 0.0

`/bin/sh` does not know the `source` command, and changing it to `.` results in an "Unrecognized Shell" error, which I think means conda is not happy with `sh`. So the example at https://gateway.dask.org/install-jobqueue.html is not working for me.

Minimal Complete Verifiable Example:

from dask_gateway import Gateway,BasicAuth
auth = BasicAuth(username="dask")

gateway = Gateway("http://IP.OF.OUR.GATEWAY:8000",auth=auth)

cluster = gateway.new_cluster(options)

Anything else we need to know?: The suggested change is to expose a config option for the Slurm script shebang, like you are already doing for dask_jobqueue.SLURMCluster.

Affected lines are: https://github.com/dask/dask-gateway/blob/bf04d65def1c6d6ca758d14b849735e2368789c8/dask-gateway-server/dask_gateway_server/backends/jobqueue/slurm.py#L72 https://github.com/dask/dask-gateway/blob/bf04d65def1c6d6ca758d14b849735e2368789c8/dask-gateway-server/dask_gateway_server/backends/jobqueue/slurm.py#L84

Environment: