xsdk-project / xsdk-issues

A repository under which GitHub issues not related to a specific xSDK repo can be filed.
7 stars 0 forks source link

sundials build issue on perlmutter with `sundials +magma+cuda` #243

Closed balos1 closed 8 months ago

balos1 commented 9 months ago
Related issue on Perlmutter with `sundials +cuda`.
It cannot find `cublas` or `cusparse` (needed for `magma`), which on Perlmutter are installed in a separate folder from the core cuda libraries.
[spack-build-out.txt](https://github.com/xsdk-project/xsdk-issues/files/12921772/spack-build-out.txt)

Originally posted by @pghysels in https://github.com/xsdk-project/xsdk-issues/issues/239#issuecomment-1765262032

balos1 commented 9 months ago

Here is a patch that fixes the build with sundials+magma+cuda on Perlmutter.

diff --git a/cmake/tpl/FindMAGMA.cmake b/cmake/tpl/FindMAGMA.cmake
index d244c504b..da6c384b2 100644
--- a/cmake/tpl/FindMAGMA.cmake
+++ b/cmake/tpl/FindMAGMA.cmake
@@ -72,7 +72,6 @@ if(MAGMA_LIBRARY AND MAGMA_INCLUDE_DIR)

         # Remove -l only from the beginning of the string
         string(REPLACE "^-l" "" lib ${lib})
-        list(APPEND _interface_libraires ${lib})

         # Check if we need to find roc::hipblas or roc::hipsparse
         if(SUNDIALS_MAGMA_BACKENDS MATCHES "HIP")
@@ -83,6 +82,16 @@ if(MAGMA_LIBRARY AND MAGMA_INCLUDE_DIR)
             find_package(hipsparse REQUIRED)
           endif()
         endif()
+        
+        # Check if we need to find cusparse or cublas
+        if(SUNDIALS_MAGMA_BACKENDS MATCHES "CUDA")
+          if (NOT TARGET CUDA::cublas)
+            find_package(CUDAToolkit)
+          endif()
+          continue()
+        endif()
+        
+        list(APPEND _interface_libraires ${lib})

       endif()
     endforeach()
diff --git a/src/sundials/sundials_cuda.h b/src/sundials/sundials_cuda.h
index 244e6e76e..75c0749da 100644
--- a/src/sundials/sundials_cuda.h
+++ b/src/sundials/sundials_cuda.h
@@ -21,8 +21,6 @@
 #include <stdio.h>

 #include <cuda_runtime.h>
-#include <cusolverSp.h>
-#include <cusparse.h>

 #include <sundials/sundials_types.h>

@@ -38,8 +36,6 @@ extern "C" {
  * ---------------------------------------------------------------------------*/

 #define SUNDIALS_CUDA_VERIFY(cuerr) SUNDIALS_CUDA_Assert(cuerr, __FILE__, __LINE__)
-#define SUNDIALS_CUSPARSE_VERIFY(cuerr) SUNDIALS_CUSPARSE_Assert(cuerr, __FILE__, __LINE__)
-#define SUNDIALS_CUSOLVER_VERIFY(cuerr) SUNDIALS_CUSOLVER_Assert(cuerr, __FILE__, __LINE__)

 #define SUNDIALS_KERNEL_NAME(...) __VA_ARGS__
 #ifndef SUNDIALS_DEBUG_CUDA_LASTERROR
@@ -75,42 +71,9 @@ inline booleantype SUNDIALS_CUDA_Assert(cudaError_t cuerr, const char *file, int
   return SUNTRUE; /* Assert OK */
 }

-inline booleantype SUNDIALS_CUSPARSE_Assert(cusparseStatus_t status, const char *file, int line)
-{
-  if (status != CUSPARSE_STATUS_SUCCESS)
-  {
-#ifdef SUNDIALS_DEBUG
-    fprintf(stderr,
-            "ERROR in cuSPARSE runtime operation: cusparseStatus_t = %d %s:%d\n",
-            status, file, line);
-#ifdef SUNDIALS_DEBUG_ASSERT
-    assert(false);
-#endif
-#endif
-    return SUNFALSE; /*  Assert failed */
-  }
-  return SUNTRUE; /* Assert OK */
-}
-
-inline booleantype SUNDIALS_CUSOLVER_Assert(cusolverStatus_t status, const char *file, int line)
-{
-  if (status != CUSOLVER_STATUS_SUCCESS)
-  {
-#ifdef SUNDIALS_DEBUG
-    fprintf(stderr,
-            "ERROR in cuSOLVER runtime operation: cusolverStatus_t = %d %s:%d\n",
-            status, file, line);
-#ifdef SUNDIALS_DEBUG_ASSERT
-    assert(false);
-#endif
-#endif
-    return SUNFALSE; /*  Assert failed */
-  }
-  return SUNTRUE; /* Assert OK */
-}

 #ifdef __cplusplus  /* wrapper to enable C++ usage */
 }
 #endif

-#endif /* _SUNDIALS_CUDA_H */
\ No newline at end of file
+#endif /* _SUNDIALS_CUDA_H */
diff --git a/src/sundials/sundials_cusolver.h b/src/sundials/sundials_cusolver.h
new file mode 100644
index 000000000..b1aee9b26
--- /dev/null
+++ b/src/sundials/sundials_cusolver.h
@@ -0,0 +1,68 @@
+/*
+ * -----------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------
+ * This header files defines internal utility functions and macros
+ * for working with CUDA.
+ * -----------------------------------------------------------------
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+#include <cusolverSp.h>
+
+#include <sundials/sundials_types.h>
+
+#ifndef _SUNDIALS_CUSOLVER_H
+#define _SUNDIALS_CUSOLVER_H
+
+#ifdef __cplusplus  /* wrapper to enable C++ usage */
+extern "C" {
+#endif
+
+/* ---------------------------------------------------------------------------
+ * Utility macros
+ * ---------------------------------------------------------------------------*/
+
+#define SUNDIALS_CUSOLVER_VERIFY(cuerr) SUNDIALS_CUSOLVER_Assert(cuerr, __FILE__, __LINE__)
+
+
+/* ---------------------------------------------------------------------------
+ * Utility functions
+ * ---------------------------------------------------------------------------*/
+
+inline booleantype SUNDIALS_CUSOLVER_Assert(cusolverStatus_t status, const char *file, int line)
+{
+  if (status != CUSOLVER_STATUS_SUCCESS)
+  {
+#ifdef SUNDIALS_DEBUG
+    fprintf(stderr,
+            "ERROR in cuSOLVER runtime operation: cusolverStatus_t = %d %s:%d\n",
+            status, file, line);
+#ifdef SUNDIALS_DEBUG_ASSERT
+    assert(false);
+#endif
+#endif
+    return SUNFALSE; /*  Assert failed */
+  }
+  return SUNTRUE; /* Assert OK */
+}
+
+#ifdef __cplusplus  /* wrapper to enable C++ usage */
+}
+#endif
+
+#endif /* _SUNDIALS_CUSOLVER_H */
diff --git a/src/sundials/sundials_cusparse.h b/src/sundials/sundials_cusparse.h
new file mode 100644
index 000000000..3f9c3ae49
--- /dev/null
+++ b/src/sundials/sundials_cusparse.h
@@ -0,0 +1,66 @@
+/*
+ * -----------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------
+ * This header files defines internal utility functions and macros
+ * for working with CUDA.
+ * -----------------------------------------------------------------
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <cuda_runtime.h>
+#include <cusparse.h>
+
+#include <sundials/sundials_types.h>
+
+#ifndef _SUNDIALS_CUSPARSE_H
+#define _SUNDIALS_CUSPARSE_H
+
+#ifdef __cplusplus  /* wrapper to enable C++ usage */
+extern "C" {
+#endif
+
+/* ---------------------------------------------------------------------------
+ * Utility macros
+ * ---------------------------------------------------------------------------*/
+
+#define SUNDIALS_CUSPARSE_VERIFY(cuerr) SUNDIALS_CUSPARSE_Assert(cuerr, __FILE__, __LINE__)
+
+/* ---------------------------------------------------------------------------
+ * Utility functions
+ * ---------------------------------------------------------------------------*/
+
+inline booleantype SUNDIALS_CUSPARSE_Assert(cusparseStatus_t status, const char *file, int line)
+{
+  if (status != CUSPARSE_STATUS_SUCCESS)
+  {
+#ifdef SUNDIALS_DEBUG
+    fprintf(stderr,
+            "ERROR in cuSPARSE runtime operation: cusparseStatus_t = %d %s:%d\n",
+            status, file, line);
+#ifdef SUNDIALS_DEBUG_ASSERT
+    assert(false);
+#endif
+#endif
+    return SUNFALSE; /*  Assert failed */
+  }
+  return SUNTRUE; /* Assert OK */
+}
+
+#ifdef __cplusplus  /* wrapper to enable C++ usage */
+}
+#endif
+
+#endif /* _SUNDIALS_CUSPARSE_H */
diff --git a/src/sunlinsol/cusolversp/sunlinsol_cusolversp_batchqr.cu b/src/sunlinsol/cusolversp/sunlinsol_cusolversp_batchqr.cu
index d7a351f97..915f6163e 100644
--- a/src/sunlinsol/cusolversp/sunlinsol_cusolversp_batchqr.cu
+++ b/src/sunlinsol/cusolversp/sunlinsol_cusolversp_batchqr.cu
@@ -23,6 +23,7 @@
 #include <sunlinsol/sunlinsol_cusolversp_batchqr.h>

 #include "sundials_cuda.h"
+#include "sundials_cusolver.h"
 #include "sundials_debug.h"

 #define ZERO RCONST(0.0)
diff --git a/src/sunlinsol/magmadense/CMakeLists.txt b/src/sunlinsol/magmadense/CMakeLists.txt
index 116dc2d42..e7a658d7a 100644
--- a/src/sunlinsol/magmadense/CMakeLists.txt
+++ b/src/sunlinsol/magmadense/CMakeLists.txt
@@ -16,10 +16,10 @@ install(CODE "MESSAGE(\"\nInstall SUNLINSOL_MAGMADENSE\n\")")

 if(SUNDIALS_MAGMA_BACKENDS MATCHES "CUDA")
   set_source_files_properties(sunlinsol_magmadense.cpp PROPERTIES LANGUAGE CUDA)
-  set(_libs_needed sundials_sunmatrixmagmadense sundials_nveccuda)
+  set(_libs_needed sundials_sunmatrixmagmadense sundials_nveccuda CUDA::cublas CUDA::cusolver)
 elseif(SUNDIALS_MAGMA_BACKENDS MATCHES "HIP")
   set_source_files_properties(sunlinsol_magmadense.cpp PROPERTIES LANGUAGE CXX)
-  set(_libs_needed sundials_sunmatrixmagmadense sundials_nvechip)
+  set(_libs_needed sundials_sunmatrixmagmadense sundials_nvechip hip::device)
 endif()

 # Add the sunlinsol_magmadense library
diff --git a/src/sunmatrix/cusparse/CMakeLists.txt b/src/sunmatrix/cusparse/CMakeLists.txt
index 21ee8a88f..2efa8a8f7 100644
--- a/src/sunmatrix/cusparse/CMakeLists.txt
+++ b/src/sunmatrix/cusparse/CMakeLists.txt
@@ -28,8 +28,7 @@ sundials_add_library(sundials_sunmatrixcusparse
     sundials_generic_obj
     sundials_sunmemcuda_obj
   LINK_LIBRARIES
-    PUBLIC CUDA::cusparse
-    PRIVATE CUDA::cusolver
+    PUBLIC CUDA::cusparse CUDA::cusolver
   OUTPUT_NAME
     sundials_sunmatrixcusparse
   VERSION
diff --git a/src/sunmatrix/cusparse/sunmatrix_cusparse.cu b/src/sunmatrix/cusparse/sunmatrix_cusparse.cu
index a986fe704..70fbc47da 100644
--- a/src/sunmatrix/cusparse/sunmatrix_cusparse.cu
+++ b/src/sunmatrix/cusparse/sunmatrix_cusparse.cu
@@ -24,6 +24,7 @@
 #include <sunmatrix/sunmatrix_cusparse.h>

 #include "sundials_cuda.h"
+#include "sundials_cusparse.h"
 #include "sundials_debug.h"
 #include "cusparse_kernels.cuh"

diff --git a/src/sunmatrix/magmadense/CMakeLists.txt b/src/sunmatrix/magmadense/CMakeLists.txt
index 40612e4ae..5e6b6464c 100644
--- a/src/sunmatrix/magmadense/CMakeLists.txt
+++ b/src/sunmatrix/magmadense/CMakeLists.txt
@@ -16,7 +16,7 @@ install(CODE "MESSAGE(\"\nInstall SUNMATRIX_MAGMADENSE with ${SUNDIALS_MAGMA_BAC

 if(SUNDIALS_MAGMA_BACKENDS MATCHES "CUDA")
   set_source_files_properties(sunmatrix_magmadense.cpp PROPERTIES LANGUAGE CUDA)
-  set(_libs_needed sundials_nveccuda ${CUDA_CUBLAS_LIBRARIES})
+  set(_libs_needed sundials_nveccuda CUDA::cublas)
 elseif(SUNDIALS_MAGMA_BACKENDS MATCHES "HIP")
   set_source_files_properties(sunmatrix_magmadense.cpp PROPERTIES LANGUAGE CXX)
   set(_libs_needed sundials_nvechip hip::device)
balos1 commented 9 months ago

Part of the issue was that MAGMA's pc file, which we consume in sundials, is incorrect:

prefix=/global/u1/b/balos1/Workspaces/xsdk/spack-xsdk/opt/spack/linux-sles15-zen3/gcc-11.2.0/magma-2.7.2-qa2xz2psv5eiazd2rkzfaqfa5jcwbcaz
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include

Name: magma
Description: Matrix Algebra on GPU and Multicore Architectures
Version: 2.7.2
Cflags: -I${includedir}  -std=c++11 -fopenmp -Wall -Wno-unused-function -I/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/cuda/11.7/include -I/tmp/balos1/spack-stage/spack-stage-magma-2.7.2-qa2xz2psv5eiazd2rkzfaqfa5jcwbcaz/spack-build-qa2xz2p/include -I/tmp/balos1/spack-stage/spack-stage-magma-2.7.2-qa2xz2psv5eiazd2rkzfaqfa5jcwbcaz/spack-src/include -I/tmp/balos1/spack-stage/spack-stage-magma-2.7.2-qa2xz2psv5eiazd2rkzfaqfa5jcwbcaz/spack-src/control -I/tmp/balos1/spack-stage/spack-stage-magma-2.7.2-qa2xz2psv5eiazd2rkzfaqfa5jcwbcaz/spack-src/magmablas -I/tmp/balos1/spack-stage/spack-stage-magma-2.7.2-qa2xz2psv5eiazd2rkzfaqfa5jcwbcaz/spack-src/sparse/include -I/tmp/balos1/spack-stage/spack-stage-magma-2.7.2-qa2xz2psv5eiazd2rkzfaqfa5jcwbcaz/spack-src/sparse/control -I/tmp/balos1/spack-stage/spack-stage-magma-2.7.2-qa2xz2psv5eiazd2rkzfaqfa5jcwbcaz/spack-src/testing
Libs: -L${libdir} -lmagma_sparse -lmagma  /opt/cray/pe/libsci/23.02.1.1/GNU/9.1/x86_64/lib/libsci_gnu_mpi.so /opt/cray/pe/libsci/23.02.1.1/GNU/9.1/x86_64/lib/libsci_gnu.so -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/cuda/11.7/lib64 -lcudart -lcublas -lcusparse
Libs.private:
Requires:
Requires.private:

The -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/cuda/11.7/lib64 -lcudart -lcublas -lcusparse is not the right path on Perlmutter. One of the things the above patch does is ignore the -lcudart -lcublas -lcusparse entries from the magma libraries line and then add them back in using the CMake targets for them. @pghysels @luszczek @stomov

luszczek commented 9 months ago

The path on Perlmutter /opt/nvidia/hpc_sdk/Linux_x86_64/22.7/cuda/11.7/lib64 works to properly build and install MAGMA libraries and executables. Is this a Perlmutter-specific issue caused by CUDA being housed in two locations?

pghysels commented 9 months ago

cublas and cusparse are at /opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64. This path is also used during MAGMA build. It's found by find_package(CUDAToolkit) and is part of the CUDA::cublas target. But it doesn't appear in MAGMA's pkgconfig file.

balos1 commented 8 months ago

This is solved now via the sundials release/6.6.2 branch which we will officially release shortly.