rapidsai / deployment

RAPIDS Deployment Documentation
https://docs.rapids.ai/deployment/stable/

https://docs.rapids.ai/deployment/nightly/platforms/databricks/ #361

Open · jacobtomlinson opened this issue 3 months ago

jacobtomlinson commented 3 months ago

When following the multi-node instructions, I'm seeing an error.

Reproducer

  1. Create an init script with the following content:
#!/bin/bash
set -e

# The Databricks Python directory isn't on the path in
# databricksruntime/gpu-tensorflow:cuda11.8 for some reason
export PATH="/databricks/python/bin:$PATH"

# Install RAPIDS (cudf & dask-cudf) and dask-databricks
/databricks/python/bin/pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \
      "cudf-cu12>=24.4.0a0,<=24.4" "dask-cudf-cu12>=24.4.0a0,<=24.4" \
      "cuml-cu12>=24.4.0a0,<=24.4" "cugraph-cu12>=24.4.0a0,<=24.4" \
      "cuspatial-cu12>=24.4.0a0,<=24.4" "cuproj-cu12>=24.4.0a0,<=24.4" \
      "cuxfilter-cu12>=24.4.0a0,<=24.4" "cucim-cu12>=24.4.0a0,<=24.4" \
      "pylibraft-cu12>=24.4.0a0,<=24.4" "raft-dask-cu12>=24.4.0a0,<=24.4" \
      "dask-cuda>=24.4.0a0,<=24.4" \
      dask[complete] \
      dask-databricks

# Start the Dask cluster with CUDA workers
dask databricks run --cuda
  2. Choose the 14.2 (Scala 2.12, Spark 3.5.0) runtime

  3. Choose the databricksruntime/gpu-pytorch:cuda11.8 container image

  4. Run the example task cudf code (reconstructed below)
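
The example task code isn't quoted in full in the issue, but it can be pieced together from the notebook frames in the traceback; roughly the following (the import cudf line is an assumption, since the traceback only shows notebook lines 2, 5, and 6):

# Reconstructed from the traceback below; `import cudf` is assumed because
# the traceback only shows notebook lines 2, 5, and 6.
import cudf
import dask

# Build a demo pandas-backed timeseries and convert each partition to cudf
df = dask.datasets.timeseries().map_partitions(cudf.from_pandas)

# The reduction triggers CuPy kernel compilation, which tries to load NVRTC
df.x.mean().compute()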

RuntimeError: CuPy failed to load libnvrtc.so.12: OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
File cupy_backends/cuda/_softlink.pyx:25, in cupy_backends.cuda._softlink.SoftLink.__init__()

File /usr/lib/python3.10/ctypes/__init__.py:374, in CDLL.__init__(self, name, mode, handle, use_errno, use_last_error, winmode)
    373 if handle is None:
--> 374     self._handle = _dlopen(self._name, mode)
    375 else:

OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
File <command-2671937974741064>, line 6
      2 import dask
      5 df = dask.datasets.timeseries().map_partitions(cudf.from_pandas)
----> 6 df.x.mean().compute()

File /databricks/python/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs)
    113 @wraps(func)
    114 def inner(*args, **kwargs):
    115     libnvtx_push_range(self.attributes, self.domain.handle)
--> 116     result = func(*args, **kwargs)
    117     libnvtx_pop_range(self.domain.handle)
    118     return result

File /databricks/python/lib/python3.10/site-packages/dask_cudf/core.py:367, in Series.mean(self, split_every)
    365 @_dask_cudf_nvtx_annotate
    366 def mean(self, split_every=False):
--> 367     sum = self.sum(split_every=split_every)
    368     n = self.count(split_every=split_every)
    369     return sum / n

File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:256, in _dummy_numpy_dispatcher.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    253         if kwargs.pop(name, None) is not None:
    254             raise ValueError(f"the '{name}' keyword is not supported")
--> 256 return func(*args, **kwargs)

File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:2369, in _Frame.sum(self, axis, skipna, split_every, dtype, out, min_count, numeric_only)
   2357 @_dummy_numpy_dispatcher("dtype", "out", deprecated=True)
   2358 @derived_from(pd.DataFrame)
   2359 def sum(
   (...)
   2367     numeric_only=None,
   2368 ):
-> 2369     result = self._reduction_agg(
   2370         "sum",
   2371         axis=axis,
   2372         skipna=skipna,
   2373         split_every=split_every,
   2374         out=out,
   2375         numeric_only=numeric_only,
   2376     )
   2377     if min_count:
   2378         cond = self.notnull().sum(axis=axis) >= min_count

File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:2286, in _Frame._reduction_agg(self, name, axis, skipna, split_every, out, numeric_only, none_is_zero)
   2274 def _reduction_agg(
   2275     self,
   2276     name,
   (...)
   2282     none_is_zero=True,
   2283 ):
   2284     axis = self._validate_axis(axis, none_is_zero=none_is_zero)
-> 2286     if has_keyword(getattr(self._meta_nonempty, name), "numeric_only"):
   2287         numeric_only_kwargs = {"numeric_only": numeric_only}
   2288     else:

File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:635, in _Frame._meta_nonempty(self)
    632 @property
    633 def _meta_nonempty(self):
    634     """A non-empty version of `_meta` with fake data."""
--> 635     return meta_nonempty(self._meta)

File /databricks/python/lib/python3.10/site-packages/dask/utils.py:767, in Dispatch.__call__(self, arg, *args, **kwargs)
    763 """
    764 Call the corresponding method based on type of argument.
    765 """
    766 meth = self.dispatch(type(arg))
--> 767 return meth(arg, *args, **kwargs)

File /databricks/python/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs)
    113 @wraps(func)
    114 def inner(*args, **kwargs):
    115     libnvtx_push_range(self.attributes, self.domain.handle)
--> 116     result = func(*args, **kwargs)
    117     libnvtx_pop_range(self.domain.handle)
    118     return result

File /databricks/python/lib/python3.10/site-packages/dask_cudf/backends.py:153, in _nonempty_series(s, idx)
    151 if idx is None:
    152     idx = _nonempty_index(s.index)
--> 153 data = _get_non_empty_data(s._column)
    155 return cudf.Series(data, name=s.name, index=idx)

File /databricks/python/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs)
    113 @wraps(func)
    114 def inner(*args, **kwargs):
    115     libnvtx_push_range(self.attributes, self.domain.handle)
--> 116     result = func(*args, **kwargs)
    117     libnvtx_pop_range(self.domain.handle)
    118     return result

File /databricks/python/lib/python3.10/site-packages/dask_cudf/backends.py:139, in _get_non_empty_data(s)
    136 else:
    137     if pd.api.types.is_numeric_dtype(s.dtype):
    138         data = cudf.core.column.as_column(
--> 139             cp.arange(start=0, stop=2, dtype=s.dtype)
    140         )
    141     else:
    142         data = cudf.core.column.as_column(
    143             cp.arange(start=0, stop=2, dtype="int64")
    144         ).astype(s.dtype)

File /databricks/python/lib/python3.10/site-packages/cupy/_creation/ranges.py:60, in arange(start, stop, step, dtype)
     58 ret = cupy.empty((size,), dtype=dtype)
     59 typ = numpy.dtype(dtype).type
---> 60 _arange_ufunc(typ(start), typ(step), ret, dtype=dtype)
     61 return ret

File cupy/_core/_kernel.pyx:1375, in cupy._core._kernel.ufunc.__call__()

File cupy/_core/_kernel.pyx:1402, in cupy._core._kernel.ufunc._get_ufunc_kernel()

File cupy/_core/_kernel.pyx:1082, in cupy._core._kernel._get_ufunc_kernel()

File cupy/_core/_kernel.pyx:94, in cupy._core._kernel._get_simple_elementwise_kernel()

File cupy/_core/_kernel.pyx:82, in cupy._core._kernel._get_simple_elementwise_kernel_from_code()

File cupy/_core/core.pyx:2254, in cupy._core.core.compile_with_cache()

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:484, in _compile_module_with_cache(source, options, arch, cache_dir, extra_source, backend, enable_cooperative_groups, name_expressions, log_stream, jitify)
    480     return _compile_with_cache_hip(
    481         source, options, arch, cache_dir, extra_source, backend,
    482         name_expressions, log_stream, cache_in_memory)
    483 else:
--> 484     return _compile_with_cache_cuda(
    485         source, options, arch, cache_dir, extra_source, backend,
    486         enable_cooperative_groups, name_expressions, log_stream,
    487         cache_in_memory, jitify)

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:499, in _compile_with_cache_cuda(source, options, arch, cache_dir, extra_source, backend, enable_cooperative_groups, name_expressions, log_stream, cache_in_memory, jitify)
    497     cache_dir = get_cache_dir()
    498 if arch is None:
--> 499     arch = _get_arch()
    501 options += ('-ftz=true',)
    503 if enable_cooperative_groups:
    504     # `cooperative_groups` requires relocatable device code.

File cupy/_util.pyx:64, in cupy._util.memoize.decorator.ret()

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:148, in _get_arch()
    144 @_util.memoize(for_each_device=True)
    145 def _get_arch():
    146     # See Supported Compile Options section of NVRTC User Guide for
    147     # the maximum value allowed for `--gpu-architecture`.
--> 148     nvrtc_max_compute_capability = _get_max_compute_capability()
    150     arch = device.Device().compute_capability
    151     if arch in _tegra_archs:

File cupy/_util.pyx:64, in cupy._util.memoize.decorator.ret()

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:126, in _get_max_compute_capability()
    124 @_util.memoize()
    125 def _get_max_compute_capability():
--> 126     major, minor = _get_nvrtc_version()
    127     if major < 11:
    128         # CUDA 10.2
    129         nvrtc_max_compute_capability = '75'

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:115, in _get_nvrtc_version()
    113 global _nvrtc_version
    114 if _nvrtc_version is None:
--> 115     _nvrtc_version = nvrtc.getVersion()
    117 return _nvrtc_version

File cupy_backends/cuda/libs/nvrtc.pyx:56, in cupy_backends.cuda.libs.nvrtc.getVersion()

File cupy_backends/cuda/libs/nvrtc.pyx:57, in cupy_backends.cuda.libs.nvrtc.getVersion()

File cupy_backends/cuda/libs/_cnvrtc.pxi:72, in cupy_backends.cuda.libs.nvrtc.initialize()

File cupy_backends/cuda/libs/_cnvrtc.pxi:76, in cupy_backends.cuda.libs.nvrtc._initialize()

File cupy_backends/cuda/libs/_cnvrtc.pxi:143, in cupy_backends.cuda.libs.nvrtc._get_softlink()

File cupy_backends/cuda/_softlink.pyx:32, in cupy_backends.cuda._softlink.SoftLink.__init__()

RuntimeError: CuPy failed to load libnvrtc.so.12: OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory
jacobtomlinson commented 3 months ago

Same thing happens with the databricksruntime/gpu-tensorflow:cuda11.8 image.
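
A quick way to check which NVRTC soname these images can actually resolve is to mirror the ctypes load that CuPy attempts in the traceback above. A minimal sketch, not from the thread; the libnvrtc.so.11.2 candidate is an assumption based on the images' cuda11.8 tag (CUDA 11.2 through 11.8 ship NVRTC under that soname):

# Minimal check: try to dlopen the CUDA 12 and CUDA 11 NVRTC sonames, the
# same way CuPy's SoftLink does via ctypes. Candidate names are assumptions.
import ctypes

for soname in ("libnvrtc.so.12", "libnvrtc.so.11.2"):
    try:
        ctypes.CDLL(soname)
        print(f"loaded {soname}")
    except OSError as exc:
        print(f"could not load {soname}: {exc}")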