openxla / xla

A machine learning compiler for GPUs, CPUs, and ML accelerators
Apache License 2.0
2.58k stars 401 forks source link

Failed to compile pjrt c plugin on GPU with nvcc and clang #16206

Open Kovax007 opened 3 weeks ago

Kovax007 commented 3 weeks ago

Hi!

I am having trouble compiling the PJRT C plugin on Arch Linux.

Here is the configuration command I am using: python ./configure.py --backend CUDA --os LINUX --host_compiler CLANG --cuda_compiler NVCC --cudnn_version=9.2.1 --cuda_compute_capabilities=8.9

And the build command: bazelisk build -c opt //xla/pjrt/c:pjrt_c_api_gpu_plugin.so

error message:

ERROR: /home/kovax/Asztal/dev/xla/xla/service/gpu/kernels/BUILD:169:19: Compiling xla/service/gpu/kernels/topk_kernel_float.cu.cc failed: (Exit 2): crosstool_wrapper_driver_is_not_gcc failed: error executing command (from target //xla/service/gpu/kernels:topk_kernel_gpu) external/local_config_cuda/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc -MD -MF bazel-out/k8-opt/bin/xla/service/gpu/kernels/_objs/topk_kernel_gpu/topk_kernel_float.cu.pic.d ... (remaining 206 arguments skipped)
/home/kovax/.cache/bazel/_bazel_kovax/7785f7e4ef7a95fb991affd3bab6cb62/execroot/xla/external/local_config_cuda/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc:225: SyntaxWarning: invalid escape sequence '\.'
re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
/usr/include/bits/stdlib.h(37): error: linkage specification is incompatible with previous "realpath" (declared at line 940 of /usr/include/stdlib.h)
realpath (const char *__restrict __name, char * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __resolved) noexcept(true)
^

/usr/include/bits/stdlib.h(72): error: linkage specification is incompatible with previous "ptsname_r" (declared at line 1134 of /usr/include/stdlib.h)
ptsname_r (int __fd, char * const __attribute__ ((__pass_object_size__ (1 > 1))) __buf, size_t __buflen) noexcept (true)
^

/usr/include/bits/stdlib.h(91): error: linkage specification is incompatible with previous "wctomb" (declared at line 1069 of /usr/include/stdlib.h)
wctomb (char * const __attribute__ ((__pass_object_size__ (1 > 1))) __s, wchar_t __wchar) noexcept (true)
^

/usr/include/bits/stdlib.h(129): error: linkage specification is incompatible with previous "mbstowcs" (declared at line 1073 of /usr/include/stdlib.h)
mbstowcs (wchar_t * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __dst, const char *__restrict __src, size_t __len) noexcept (true)
^

/usr/include/bits/stdlib.h(159): error: linkage specification is incompatible with previous "wcstombs" (declared at line 1077 of /usr/include/stdlib.h)
wcstombs (char * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __dst, const wchar_t *__restrict __src, size_t __len) noexcept (true)
^

/usr/include/bits/string_fortified.h(77): error: linkage specification is incompatible with previous "strcpy" (declared at line 141 of /usr/include/string.h)
strcpy (char * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __dest, const char *__restrict __src) noexcept (true)
^

/usr/include/bits/string_fortified.h(86): error: linkage specification is incompatible with previous "stpcpy" (declared at line 491 of /usr/include/string.h)
stpcpy (char * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __dest, const char *__restrict __src) noexcept (true)
^

/usr/include/bits/string_fortified.h(96): error: linkage specification is incompatible with previous "strncpy" (declared at line 144 of /usr/include/string.h)
strncpy (char * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __dest, const char *__restrict __src, size_t __len) noexcept (true)
^

/usr/include/bits/string_fortified.h(107): error: linkage specification is incompatible with previous "stpncpy" (declared at line 499 of /usr/include/string.h)
stpncpy (char * const __attribute__ ((__pass_object_size__ (1 > 1))) __dest, const char *__src, size_t __n) noexcept (true)
^

/usr/include/bits/string_fortified.h(136): error: linkage specification is incompatible with previous "strcat" (declared at line 149 of /usr/include/string.h)
strcat (char * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __dest, const char *__restrict __src) noexcept (true)
^

/usr/include/bits/string_fortified.h(145): error: linkage specification is incompatible with previous "strncat" (declared at line 152 of /usr/include/string.h)
strncat (char * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __dest, const char *__restrict __src, size_t __len) noexcept (true)
^

/usr/include/bits/string_fortified.h(161): error: linkage specification is incompatible with previous "strlcpy" (declared at line 506 of /usr/include/string.h)
strlcpy (char * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __dest, const char *__restrict __src, size_t __n) noexcept (true)
^

/usr/include/bits/string_fortified.h(179): error: linkage specification is incompatible with previous "strlcat" (declared at line 512 of /usr/include/string.h)
strlcat (char * __restrict const __attribute__ ((__pass_object_size__ (1 > 1))) __dest, const char *__restrict __src, size_t __n) noexcept (true)
^

/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/tuple(2962): error: type name is not allowed
static_assert(!__reference_constructs_from_temporary(_Tp, _Elt));
^

/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/tuple(2962): error: type name is not allowed
static_assert(!__reference_constructs_from_temporary(_Tp, _Elt));
^

/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/tuple(2962): error: identifier "__reference_constructs_from_temporary" is undefined
static_assert(!__reference_constructs_from_temporary(_Tp, _Elt));
^

16 errors detected in the compilation of "xla/service/gpu/kernels/topk_kernel_float.cu.cc".
Target //xla/pjrt/c:pjrt_c_api_gpu_plugin.so failed to build
Use --verbose_failures to see the command lines of failed build steps.
INFO: Elapsed time: 94.222s, Critical Path: 4.89s
INFO: 3451 processes: 2316 internal, 1135 local.
FAILED: Build did NOT complete successfully

And if I change the NVCC compiler to CLANG then I get a different error:

ERROR: /home/kovax/Asztal/dev/xla/xla/service/gpu/kernels/BUILD:169:19: Compiling xla/service/gpu/kernels/topk_kernel_float.cu.cc failed: (Exit 1): clang-18 failed: error executing command (from target //xla/service/gpu/kernels:topk_kernel_gpu) /usr/bin/clang-18 -MD -MFbazel-out/k8-opt/bin/xla/service/gpu/kernels/_objs/topk_kernel_gpu/topk_kernel_float.cu.pic.d ... (remaining 213 arguments skipped)
clang-18: warning: CUDA version is newer than the latest supported version 12.3 [-Wunknown-cuda-version]
In file included from <built-in>:1:
In file included from /usr/lib/clang/18/include/__clang_cuda_runtime_wrapper.h:41:
In file included from /usr/lib/clang/18/include/cuda_wrappers/cmath:27:
In file included from /usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/cmath:3898:
In file included from /usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/bits/specfun.h:44:
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2089:27: error: __float128 is not supported on this target
2089 |     struct numeric_limits<__float128>
|                           ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2093:33: error: __float128 is not supported on this target
2093 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2104:33: error: __float128 is not supported on this target
2104 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2118:33: error: __float128 is not supported on this target
2118 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2132:33: error: __float128 is not supported on this target
2132 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2136:33: error: __float128 is not supported on this target
2136 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2156:33: error: __float128 is not supported on this target
2156 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2160:33: error: __float128 is not supported on this target
2160 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2164:33: error: __float128 is not supported on this target
2164 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2176:33: error: __float128 is not supported on this target
2176 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2199:13: error: __float128 is not supported on this target
2199 |       _S_4p(__float128 __v) _GLIBCXX_USE_NOEXCEPT
|             ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2198:33: error: __float128 is not supported on this target
2198 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2202:33: error: __float128 is not supported on this target
2202 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2206:33: error: __float128 is not supported on this target
2206 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2210:33: error: __float128 is not supported on this target
2210 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2214:33: error: __float128 is not supported on this target
2214 |       static _GLIBCXX_CONSTEXPR __float128
|                                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2207:7: note: '_S_1pm16352' defined here
2207 |       _S_1pm16352() _GLIBCXX_USE_NOEXCEPT
|       ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2110:10: error: __float128 is not supported on this target
2110 |         return (__float128(double(3.4028236692093843e+38))
|                 ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2215:7: note: '_S_1p16256' defined here
2215 |       _S_1p16256() _GLIBCXX_USE_NOEXCEPT
|       ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2170:28: error: __float128 is not supported on this target
2170 |         return __builtin_bit_cast(__float128, __builtin_nansf128(""));
|                                   ^
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/limits:2207:7: note: '_S_1pm16352' defined here
2207 |       _S_1pm16352() _GLIBCXX_USE_NOEXCEPT
|       ^
18 errors generated when compiling for sm_60.
Target //xla/pjrt/c:pjrt_c_api_gpu_plugin.so failed to build
Use --verbose_failures to see the command lines of failed build steps.
INFO: Elapsed time: 38.254s, Critical Path: 4.74s
INFO: 1066 processes: 28 internal, 1038 local.
FAILED: Build did NOT complete successfully
Artem-B commented 3 weeks ago

And if I change the NVCC compiler to CLANG then I get a different error:

The issue with __float128 has been fixed in the recent clang. https://github.com/llvm/llvm-project/pull/83918

Kovax007 commented 2 weeks ago

I managed to make progress and compile the pjrt c plugin. To make it work, I first had to compile clang-19 and edit xla_configure.bazelrc so that the linkopt ld path points to the newly built lld linker.

I had to set the hermetic CUDA environment to 12.5.1: build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.5.1" (it does not work with 12.6.0). After that it still failed to compile, because the protocol buffers upb.c module had one define that requires C++23: upb_alignof. My edit:

/* Yields, as a size_t, the byte offset of a `type` member that follows a
 * single char inside an anonymous struct, i.e. the padding the compiler
 * inserts to align `type`. The offset is read off a null base pointer by hand
 * rather than through offsetof().
 * NOTE(review): presumably offsetof() is avoided because the compiler in use
 * rejects a struct definition inside it -- confirm before changing. */
#define upb_alignof(type) \
  ((size_t)&(((struct { char pad; type member; } *)0)->member))

/* Sets up an arena inside a caller-supplied buffer, or inside a freshly
 * allocated block when that buffer is too small.
 *
 * mem, n: initial buffer and its size in bytes. n is first rounded down to a
 *         multiple of upb_alignof(upb_arena). If the rounded size is smaller
 *         than sizeof(upb_arena) + sizeof(mem_block), the buffer is not used
 *         and a block of that overhead plus 256 bytes is requested from
 *         alloc via upb_malloc() instead.
 * alloc:  stored as the arena's block_alloc; also used for the fallback
 *         allocation described above.
 *
 * Returns the arena, which occupies the last sizeof(upb_arena) bytes of the
 * block in use, or NULL when the fallback allocation is required but alloc is
 * NULL or upb_malloc() fails. */
upb_arena *upb_arena_init(void *mem, size_t n, upb_alloc *alloc) {
  const size_t first_block_overhead = sizeof(upb_arena) + sizeof(mem_block);
  const size_t align = upb_alignof(upb_arena);
  upb_arena *a;
  bool owned = false;

  /* Round block size down to alignof(*a) since we will allocate the arena
   * itself at the end. */
  n -= n % align;

  if (n < first_block_overhead) {
    /* We need to malloc the initial block. */
    n = first_block_overhead + 256;
    owned = true;
    if (!alloc || !(mem = upb_malloc(alloc, n))) {
      return NULL;
    }
  }

  /* Carve the arena header out of the tail of the block; the remaining n
   * bytes at the front are handed to upb_arena_addblock() below. */
  a = (upb_arena *)((char *)mem + n - sizeof(*a));
  n -= sizeof(*a);

  a->alloc.func = &upb_arena_doalloc;
  /* block_alloc is always the caller's allocator. The earlier store of
   * &upb_alloc_global was overwritten unconditionally and has been dropped. */
  a->block_alloc = alloc;
  a->bytes_allocated = 0;
  a->next_block_size = 256;
  a->max_block_size = 16384;
  a->cleanup_head = NULL;
  a->block_head = NULL;

  /* owned is true only when the block came from upb_malloc() above. */
  upb_arena_addblock(a, mem, n, owned);

  return a;
}

#undef upb_alignof

It still failed, but now the error pointed to matrix.h in the external cutlass library, where m.set_slice3x3 had to be changed to m.set_slice_3x3 (a missing underscore).

After the changes above, it worked.