ggerganov / whisper.cpp

Port of OpenAI's Whisper model in C/C++
MIT License

compilation error on aarch64 #1018

Open byte-6174 opened 1 year ago

byte-6174 commented 1 year ago

I am trying to compile whisper.cpp on a Jetson NX device (aarch64). When I run make, compilation fails with the following:

ggml.h(215): error: identifier "__fp16" is undefined
      typedef __fp16 ggml_fp16_t;
              ^

1 error detected in the compilation of "ggml-cuda.cu".
make: *** [Makefile:178: ggml-cuda.o] Error 2
make: *** Waiting for unfinished jobs....

Anything you can suggest to get past this?
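
For context, `__fp16` is an Arm C language extension that aarch64 gcc/clang understand but that the nvcc frontend parsing `ggml-cuda.cu` may not. A quick probe (hypothetical file name, just for diagnosis) can confirm the failure is nvcc-specific:

// fp16_probe.c (hypothetical): builds with aarch64 gcc, but fails under
// nvcc if its frontend does not recognize the Arm __fp16 extension
__fp16 half_value = 1.5;

int main(void) { return (int)half_value; }

On affected toolkits, `gcc -c fp16_probe.c` succeeds while `nvcc -x cu -c fp16_probe.c` reproduces the identifier-undefined error above.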

paulcombe commented 1 year ago

Referencing https://github.com/ggerganov/llama.cpp/issues/1455 and other issues related to aarch64, I made the following changes:

diff --git a/Makefile b/Makefile
index 7bb7e31..837005e 100644
--- a/Makefile
+++ b/Makefile
@@ -160,12 +160,12 @@ ifdef WHISPER_OPENBLAS
 endif

 ifdef WHISPER_CUBLAS
-       CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-       CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+       CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/aarch64-linux/include
+       CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/aarch64-linux/include
        LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
        WHISPER_OBJ += ggml-cuda.o
        NVCC        = nvcc
-       NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=native
+       NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=sm_53

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
        $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@

diff --git a/ggml.h b/ggml.h
index 51a616c..dae3b9d 100644
--- a/ggml.h
+++ b/ggml.h
@@ -212,9 +212,11 @@
 extern "C" {
 #endif

-#ifdef __ARM_NEON
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
     // we use the built-in 16-bit float type
     typedef __fp16 ggml_fp16_t;
+#elif defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
 #endif
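
The `-arch=sm_53` value targets the Jetson Nano/TX1 (compute capability 5.3); a Xavier NX would be `sm_72` and an Orin `sm_87`. If you are unsure what your board reports, a minimal probe (assuming the CUDA runtime is installed) prints it:

// sm_probe.cu (hypothetical): print device 0's compute capability
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        fprintf(stderr, "no CUDA device found\n");
        return 1;
    }
    printf("sm_%d%d\n", prop.major, prop.minor);  // e.g. sm_53 on a Jetson Nano
    return 0;
}

Build and run with `nvcc -o sm_probe sm_probe.cu && ./sm_probe`.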

However, this leads to new errors:

I whisper.cpp build info:
I UNAME_S:  Linux
I UNAME_P:  aarch64
I UNAME_M:  aarch64
I CFLAGS:   -I.              -O3 -DNDEBUG -std=c11   -fPIC -pthread -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/aarch64-linux/include
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/aarch64-linux/include
I LDFLAGS:   -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L/targets/x86_64-linux/lib
I CC:       cc (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) 7.5.0
I CXX:      g++ (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) 7.5.0

cc  -I.              -O3 -DNDEBUG -std=c11   -fPIC -pthread -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/aarch64-linux/include   -c ggml.c -o ggml.o
nvcc --forward-unknown-to-host-compiler -arch=sm_53 -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/aarch64-linux/include -Wno-pedantic -c ggml-cuda.cu -o ggml-cuda.o
g++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/aarch64-linux/include -c whisper.cpp -o whisper.o
ggml-cuda.cu(430): error: identifier "CUBLAS_TF32_TENSOR_OP_MATH" is undefined

ggml-cuda.cu(676): error: identifier "CUBLAS_COMPUTE_32F_FAST_16F" is undefined

2 errors detected in the compilation of "/tmp/tmpxft_000002ea_00000000-6_ggml-cuda.cpp1.ii".
Makefile:171: recipe for target 'ggml-cuda.o' failed
make: *** [ggml-cuda.o] Error 1

I spent a long time ham-fisting my way through the CUDA docs to see if I could supply those missing definitions, but the call at line 676 consistently failed (either at compile time or at runtime) even after hacking the definitions in. I'm at a loss from here, but I'm fairly sure part of the problem is that the Jetson Nano Devkit/Tegra X1 is on CUDA 10.2, while the Orins that people are using successfully are on CUDA 11 or later.
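
That theory fits: both identifiers only exist from cuBLAS 11 onward. A compatibility shim (a sketch, not something upstream ships) can restore the first name on CUDA 10.2, but not the second:

// cublas_compat.h (hypothetical sketch for CUDA < 11)
#include <cuda_runtime.h>
#include <cublas_v2.h>

#if CUDART_VERSION < 11000
// TF32 is an Ampere feature; the closest pre-11 math mode is tensor ops
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
// CUBLAS_COMPUTE_32F_FAST_16F has no pre-11 equivalent: before CUDA 11,
// cublasGemmEx took a cudaDataType_t compute type (e.g. CUDA_R_32F), so
// simply #define-ing a value compiles but fails at runtime -- consistent
// with the line-676 behavior described above.
#endif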

leochencipher commented 11 months ago

@paulcombe For the Jetson Nano, also make the following changes:

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 50df20e..50fc308 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1855,7 +1855,7 @@ void ggml_init_cublas() {

             // create cublas handle
             CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
-            CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
+            CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TENSOR_OP_MATH));
         }

         // configure logging to stdout
@@ -2375,7 +2375,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

         // wait for main GPU data if necessary
         if (split && id != g_main_device) {
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device],0));
         }

         if (src0_on_device && src0_is_contiguous) {
@@ -2577,7 +2577,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         CUDA_CHECK(cudaSetDevice(g_main_device));
         for (int id = 0; id < g_device_count; ++id) {
             if (id != g_main_device) {
-                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id],0));
             }
         }
     }
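
Both hunks are CUDA 10.2 compatibility fixes: `CUBLAS_TF32_TENSOR_OP_MATH` did not exist yet (see the shim sketch above), and older toolkits declare `cudaStreamWaitEvent` without a default value for its `flags` parameter, so the third argument must be spelled out. A minimal sketch of the event-ordering pattern as written for CUDA 10.2:

// stream_wait.cu (hypothetical sketch): make stream b wait on stream a
#include <cuda_runtime.h>

int main() {
    cudaStream_t a, b;
    cudaEvent_t done;
    cudaStreamCreate(&a);
    cudaStreamCreate(&b);
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

    // ... enqueue work on stream a here ...
    cudaEventRecord(done, a);
    cudaStreamWaitEvent(b, done, 0);  // trailing 0 is mandatory on CUDA 10.2
    // ... enqueue dependent work on stream b here ...

    cudaStreamDestroy(a);
    cudaStreamDestroy(b);
    cudaEventDestroy(done);
    return 0;
}
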
vieenrose commented 11 months ago

I confirm that with the modifications suggested above, applied on top of b948361, it works on my platform (details were attached as an image in the original comment). We've verified using jtop that the GPU is being used.

divyansh2681 commented 10 months ago

@paulcombe After making the above-mentioned changes, I got a new error (shown in a screenshot in the original comment).

Edit:

I commented out the following block in the Makefile:

ifneq ($(filter aarch64%,$(UNAME_M)),)
    CFLAGS   += -mcpu=native
    CXXFLAGS += -mcpu=native
endif

and it worked.
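
If you'd rather keep the CPU tuning than delete it, a possible alternative (untested assumption: the failure is `-mcpu=native` detection on the old GCC 7.5 toolchain) is to pin the core explicitly, e.g. the Cortex-A57 in a Jetson Nano/TX1:

ifneq ($(filter aarch64%,$(UNAME_M)),)
    CFLAGS   += -mcpu=cortex-a57
    CXXFLAGS += -mcpu=cortex-a57
endif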