linux aarch64 support (NVIDIA GH200)

BlairSadewitz commented 4 months ago

I did this to get it to build; obviously I'm not suggesting you actually apply this diff. ;-)

Seems to be all that's necessary, though.

--- a/Makefile
+++ b/Makefile
@@ -143,7 +143,7 @@ endif
 # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
 ifdef LLAMA_CUBLAS
        CUBLAS_FLAGS = -DGGML_USE_CUBLAS -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-       CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
+       CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
        CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
        NVCC      = nvcc
        NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
@@ -266,12 +266,6 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
        $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_METAL

-ifneq ($(filter aarch64%,$(UNAME_M)),)
-       # Apple M1, M2, etc.
-       # Raspberry Pi 3, 4, Zero 2 (64-bit)
-       CFLAGS   +=
-       CXXFLAGS +=
-endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
        # Raspberry Pi 1, Zero
        CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
diff --git a/otherarch/ggml_v2.h b/otherarch/ggml_v2.h
index dd95ab2c..e45a31ea 100644
--- a/otherarch/ggml_v2.h
+++ b/otherarch/ggml_v2.h
@@ -212,7 +212,7 @@
 extern "C" {
 #endif

-#ifdef __ARM_NEON
+#if 0
     // we use the built-in 16-bit float type
     typedef __fp16 ggml_v2_fp16_t;

LostRuins commented 4 months ago

Thanks, will take a look

BlairSadewitz commented 3 months ago

This is the proper fix, I think.

diff --git a/otherarch/ggml_v2.h b/otherarch/ggml_v2.h
index dd95ab2c..b4cbb47e 100644
--- a/otherarch/ggml_v2.h
+++ b/otherarch/ggml_v2.h
@@ -212,9 +212,10 @@
 extern "C" {
 #endif

-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
     typedef __fp16 ggml_v2_fp16_t;
+#elif defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_v2_fp16_t;
 #else
     typedef uint16_t ggml_v2_fp16_t;
 #endif

BlairSadewitz commented 3 months ago

I don't really know C, haha, but I was going crazy trying to figure out which type would work for ALL the code. Then I thought, "Umm, hey, bright light, wtf do you think preprocessor conditionals are for?" Yay.

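So maybe there is some other way to do it, but that at least works, because the problem is gcc vs. nvcc: the two compilers need different 16-bit float types here (`__fp16` for gcc, `half` when `__CUDACC__` is defined).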

LostRuins / koboldcpp

linux aarch64 support (NVIDIA GH200) #726