Open jacobtomlinson opened 3 months ago
When following the multi-node instructions I'm seeing an error.
Reproducer
#!/bin/bash
set -e

# The Databricks Python directory isn't on the path in
# databricksruntime/gpu-tensorflow:cuda11.8 for some reason
export PATH="/databricks/python/bin:$PATH"

# Install RAPIDS (cudf & dask-cudf) and dask-databricks
/databricks/python/bin/pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \
    "cudf-cu12>=24.4.0a0,<=24.4" "dask-cudf-cu12>=24.4.0a0,<=24.4" \
    "cuml-cu12>=24.4.0a0,<=24.4" "cugraph-cu12>=24.4.0a0,<=24.4" \
    "cuspatial-cu12>=24.4.0a0,<=24.4" "cuproj-cu12>=24.4.0a0,<=24.4" \
    "cuxfilter-cu12>=24.4.0a0,<=24.4" "cucim-cu12>=24.4.0a0,<=24.4" \
    "pylibraft-cu12>=24.4.0a0,<=24.4" "raft-dask-cu12>=24.4.0a0,<=24.4" \
    "dask-cuda>=24.4.0a0,<=24.4" \
    dask[complete] \
    dask-databricks

# Start the Dask cluster with CUDA workers
dask databricks run --cuda
Choose the 14.2 (Scala 2.12, Spark 3.5.0) runtime
14.2 (Scala 2.12, Spark 3.5.0)
Choose the databricksruntime/gpu-pytorch:cuda11.8 container image
databricksruntime/gpu-pytorch:cuda11.8
Run the example task cudf code
RuntimeError: CuPy failed to load libnvrtc.so.12: OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory --------------------------------------------------------------------------- OSError Traceback (most recent call last) File cupy_backends/cuda/_softlink.pyx:25, in cupy_backends.cuda._softlink.SoftLink.__init__() File /usr/lib/python3.10/ctypes/__init__.py:374, in CDLL.__init__(self, name, mode, handle, use_errno, use_last_error, winmode) 373 if handle is None: --> 374 self._handle = _dlopen(self._name, mode) 375 else: OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory The above exception was the direct cause of the following exception: RuntimeError Traceback (most recent call last) File <command-2671937974741064>, line 6 2 import dask 5 df = dask.datasets.timeseries().map_partitions(cudf.from_pandas) ----> 6 df.x.mean().compute() File /databricks/python/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs) 113 @wraps(func) 114 def inner(*args, **kwargs): 115 libnvtx_push_range(self.attributes, self.domain.handle) --> 116 result = func(*args, **kwargs) 117 libnvtx_pop_range(self.domain.handle) 118 return result File /databricks/python/lib/python3.10/site-packages/dask_cudf/core.py:367, in Series.mean(self, split_every) 365 @_dask_cudf_nvtx_annotate 366 def mean(self, split_every=False): --> 367 sum = self.sum(split_every=split_every) 368 n = self.count(split_every=split_every) 369 return sum / n File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:256, in _dummy_numpy_dispatcher.<locals>.decorator.<locals>.wrapper(*args, **kwargs) 253 if kwargs.pop(name, None) is not None: 254 raise ValueError(f"the '{name}' keyword is not supported") --> 256 return func(*args, **kwargs) File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:2369, in _Frame.sum(self, axis, skipna, split_every, dtype, out, min_count, numeric_only) 
2357 @_dummy_numpy_dispatcher("dtype", "out", deprecated=True) 2358 @derived_from(pd.DataFrame) 2359 def sum( (...) 2367 numeric_only=None, 2368 ): -> 2369 result = self._reduction_agg( 2370 "sum", 2371 axis=axis, 2372 skipna=skipna, 2373 split_every=split_every, 2374 out=out, 2375 numeric_only=numeric_only, 2376 ) 2377 if min_count: 2378 cond = self.notnull().sum(axis=axis) >= min_count File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:2286, in _Frame._reduction_agg(self, name, axis, skipna, split_every, out, numeric_only, none_is_zero) 2274 def _reduction_agg( 2275 self, 2276 name, (...) 2282 none_is_zero=True, 2283 ): 2284 axis = self._validate_axis(axis, none_is_zero=none_is_zero) -> 2286 if has_keyword(getattr(self._meta_nonempty, name), "numeric_only"): 2287 numeric_only_kwargs = {"numeric_only": numeric_only} 2288 else: File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:635, in _Frame._meta_nonempty(self) 632 @property 633 def _meta_nonempty(self): 634 """A non-empty version of `_meta` with fake data.""" --> 635 return meta_nonempty(self._meta) File /databricks/python/lib/python3.10/site-packages/dask/utils.py:767, in Dispatch.__call__(self, arg, *args, **kwargs) 763 """ 764 Call the corresponding method based on type of argument. 
765 """ 766 meth = self.dispatch(type(arg)) --> 767 return meth(arg, *args, **kwargs) File /databricks/python/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs) 113 @wraps(func) 114 def inner(*args, **kwargs): 115 libnvtx_push_range(self.attributes, self.domain.handle) --> 116 result = func(*args, **kwargs) 117 libnvtx_pop_range(self.domain.handle) 118 return result File /databricks/python/lib/python3.10/site-packages/dask_cudf/backends.py:153, in _nonempty_series(s, idx) 151 if idx is None: 152 idx = _nonempty_index(s.index) --> 153 data = _get_non_empty_data(s._column) 155 return cudf.Series(data, name=s.name, index=idx) File /databricks/python/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs) 113 @wraps(func) 114 def inner(*args, **kwargs): 115 libnvtx_push_range(self.attributes, self.domain.handle) --> 116 result = func(*args, **kwargs) 117 libnvtx_pop_range(self.domain.handle) 118 return result File /databricks/python/lib/python3.10/site-packages/dask_cudf/backends.py:139, in _get_non_empty_data(s) 136 else: 137 if pd.api.types.is_numeric_dtype(s.dtype): 138 data = cudf.core.column.as_column( --> 139 cp.arange(start=0, stop=2, dtype=s.dtype) 140 ) 141 else: 142 data = cudf.core.column.as_column( 143 cp.arange(start=0, stop=2, dtype="int64") 144 ).astype(s.dtype) File /databricks/python/lib/python3.10/site-packages/cupy/_creation/ranges.py:60, in arange(start, stop, step, dtype) 58 ret = cupy.empty((size,), dtype=dtype) 59 typ = numpy.dtype(dtype).type ---> 60 _arange_ufunc(typ(start), typ(step), ret, dtype=dtype) 61 return ret File cupy/_core/_kernel.pyx:1375, in cupy._core._kernel.ufunc.__call__() File cupy/_core/_kernel.pyx:1402, in cupy._core._kernel.ufunc._get_ufunc_kernel() File cupy/_core/_kernel.pyx:1082, in cupy._core._kernel._get_ufunc_kernel() File cupy/_core/_kernel.pyx:94, in cupy._core._kernel._get_simple_elementwise_kernel() File 
cupy/_core/_kernel.pyx:82, in cupy._core._kernel._get_simple_elementwise_kernel_from_code() File cupy/_core/core.pyx:2254, in cupy._core.core.compile_with_cache() File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:484, in _compile_module_with_cache(source, options, arch, cache_dir, extra_source, backend, enable_cooperative_groups, name_expressions, log_stream, jitify) 480 return _compile_with_cache_hip( 481 source, options, arch, cache_dir, extra_source, backend, 482 name_expressions, log_stream, cache_in_memory) 483 else: --> 484 return _compile_with_cache_cuda( 485 source, options, arch, cache_dir, extra_source, backend, 486 enable_cooperative_groups, name_expressions, log_stream, 487 cache_in_memory, jitify) File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:499, in _compile_with_cache_cuda(source, options, arch, cache_dir, extra_source, backend, enable_cooperative_groups, name_expressions, log_stream, cache_in_memory, jitify) 497 cache_dir = get_cache_dir() 498 if arch is None: --> 499 arch = _get_arch() 501 options += ('-ftz=true',) 503 if enable_cooperative_groups: 504 # `cooperative_groups` requires relocatable device code. File cupy/_util.pyx:64, in cupy._util.memoize.decorator.ret() File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:148, in _get_arch() 144 @_util.memoize(for_each_device=True) 145 def _get_arch(): 146 # See Supported Compile Options section of NVRTC User Guide for 147 # the maximum value allowed for `--gpu-architecture`. 
--> 148 nvrtc_max_compute_capability = _get_max_compute_capability() 150 arch = device.Device().compute_capability 151 if arch in _tegra_archs: File cupy/_util.pyx:64, in cupy._util.memoize.decorator.ret() File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:126, in _get_max_compute_capability() 124 @_util.memoize() 125 def _get_max_compute_capability(): --> 126 major, minor = _get_nvrtc_version() 127 if major < 11: 128 # CUDA 10.2 129 nvrtc_max_compute_capability = '75' File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:115, in _get_nvrtc_version() 113 global _nvrtc_version 114 if _nvrtc_version is None: --> 115 _nvrtc_version = nvrtc.getVersion() 117 return _nvrtc_version File cupy_backends/cuda/libs/nvrtc.pyx:56, in cupy_backends.cuda.libs.nvrtc.getVersion() File cupy_backends/cuda/libs/nvrtc.pyx:57, in cupy_backends.cuda.libs.nvrtc.getVersion() File cupy_backends/cuda/libs/_cnvrtc.pxi:72, in cupy_backends.cuda.libs.nvrtc.initialize() File cupy_backends/cuda/libs/_cnvrtc.pxi:76, in cupy_backends.cuda.libs.nvrtc._initialize() File cupy_backends/cuda/libs/_cnvrtc.pxi:143, in cupy_backends.cuda.libs.nvrtc._get_softlink() File cupy_backends/cuda/_softlink.pyx:32, in cupy_backends.cuda._softlink.SoftLink.__init__() RuntimeError: CuPy failed to load libnvrtc.so.12: OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory
Same thing happens with the databricksruntime/gpu-tensorflow:cuda11.8 image.
databricksruntime/gpu-tensorflow:cuda11.8
When following the multi-node instructions I'm seeing an error.
Reproducer
Choose the
14.2 (Scala 2.12, Spark 3.5.0)
runtime. Choose the
databricksruntime/gpu-pytorch:cuda11.8
container image. Run the example task cudf code.