Open homocomputeris opened 1 year ago
@homocomputeris
Kindly do this and paste result here:
make clean
WHISPER_NO_ACCELERATE=1 WHISPER_CLBLAST=1 make -j
Alternatively test this: https://github.com/ggerganov/whisper.cpp/pull/891 with WHISPER_NO_ACCELERATE=1 and without.
Sure:
$ make clean && WHISPER_NO_ACCELERATE=1 WHISPER_CLBLAST=1 make -j
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLAST
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -lclblast -lOpenCL
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLAST -c ggml.c -o ggml.o
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLAST -c ggml-opencl.c -o ggml-opencl.o
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -c whisper.cpp -o whisper.o
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/main/main.cpp examples/common.cpp examples/common-ggml.cpp ggml.o ggml-opencl.o whisper.o -o main -lclblast -lOpenCL
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/bench/bench.cpp ggml.o ggml-opencl.o whisper.o -o bench -lclblast -lOpenCL
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/quantize/quantize.cpp examples/common.cpp examples/common-ggml.cpp ggml.o ggml-opencl.o whisper.o -o quantize -lclblast -lOpenCL
ld: library not found for -lOpenCL
clang: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [bench] Error 1
make: *** Waiting for unfinished jobs....
ld: library not found for -lOpenCL
clang: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [quantize] Error 1
ld: library not found for -lOpenCL
clang: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [main] Error 1
WHISPER_NO_ACCELERATE=1 WHISPER_CLBLAST=1 make -j 39.90s user 2.39s system 179% cpu 23.550 total
Without WHISPER_NO_ACCELERATE=1 (i.e. with Accelerate left enabled):
% make clean && WHISPER_CLBLAST=1 make -j
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_ACCELERATE -DGGML_USE_CLBLAST
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -framework Accelerate -lclblast -lOpenCL
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_ACCELERATE -DGGML_USE_CLBLAST -c ggml.c -o ggml.o
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_ACCELERATE -DGGML_USE_CLBLAST -c ggml-opencl.c -o ggml-opencl.o
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -c whisper.cpp -o whisper.o
ggml.c:4406:9: error: call to undeclared function 'ggml_cl_init'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
ggml_cl_init();
^
ggml.c:4406:9: note: did you mean 'ggml_init'?
ggml.c:4357:23: note: 'ggml_init' declared here
struct ggml_context * ggml_init(struct ggml_init_params params) {
^
ggml.c:8319:17: error: call to undeclared function 'ggml_cl_sgemm_wrapper'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8319:39: error: use of undeclared identifier 'GGML_BLAS_ORDER_ROW_MAJOR'
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8319:66: error: use of undeclared identifier 'GGML_BLAS_OP_N'
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8319:82: error: use of undeclared identifier 'GGML_BLAS_OP_T'
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8509:17: error: call to undeclared function 'ggml_cl_sgemm_wrapper'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8509:39: error: use of undeclared identifier 'GGML_BLAS_ORDER_ROW_MAJOR'
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8509:66: error: use of undeclared identifier 'GGML_BLAS_OP_N'
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8509:82: error: use of undeclared identifier 'GGML_BLAS_OP_T'
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8733:17: error: call to undeclared function 'ggml_cl_sgemm_wrapper'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8733:39: error: use of undeclared identifier 'GGML_BLAS_ORDER_ROW_MAJOR'
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8733:66: error: use of undeclared identifier 'GGML_BLAS_OP_N'
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
ggml.c:8733:82: error: use of undeclared identifier 'GGML_BLAS_OP_T'
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
^
13 errors generated.
make: *** [ggml.o] Error 1
make: *** Waiting for unfinished jobs....
WHISPER_CLBLAST=1 make -j 11.45s user 0.76s system 118% cpu 10.263 total
% make clean && WHISPER_CLBLAST_NETLIB=1 WHISPER_NO_ACCELERATE=1 make
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS:
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -c ggml.c -o ggml.o
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -c whisper.cpp -o whisper.o
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/main/main.cpp examples/common.cpp examples/common-ggml.cpp ggml.o whisper.o -o main
./main -h
usage: ./main [options] file0.wav file1.wav ...
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-sow, --split-on-word [false ] split on word rather than on token
-bo N, --best-of N [2 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-olrc, --output-lrc [false ] output result in a lrc file
-owts, --output-words [false ] output script for generating karaoke video
-fp, --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
-ocsv, --output-csv [false ] output result in a CSV file
-oj, --output-json [false ] output result in a JSON file
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [true ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
-dl, --detect-language [false ] exit after automatically detecting language
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/bench/bench.cpp ggml.o whisper.o -o bench
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/quantize/quantize.cpp examples/common.cpp examples/common-ggml.cpp ggml.o whisper.o -o quantize
\
WHISPER_CLBLAST_NETLIB=1 WHISPER_NO_ACCELERATE=1 make 38.52s user 1.53s system 98% cpu 40.524 total
ld: library not found for -lOpenCL
Okay, I read up a bit on this (I am not an expert on Macs). First, OpenCL is no longer supported by Apple, as they have transitioned to Metal: https://developer.apple.com/opencl/
It seems that OpenCL is already available, but the headers are not.
So give this a try and build again:
brew install opencl-headers
OpenCL lib is not found.
ld: library not found for -lOpenCL
So things are slightly different on macOS: it seems that -framework OpenCL needs to be added instead of -lOpenCL.
WHISPER_CLBLAST_NETLIB=1 WHISPER_NO_ACCELERATE=1 make
Edited: Nothing is picked up
LDFLAGS:
LDFLAGS are empty. @homocomputeris Are you sure you built from here: https://github.com/ggerganov/whisper.cpp/pull/891 https://github.com/trholding/whisper.cpp
@ggerganov Can you help with this? Did I make some stupid mistake in the CMake/Makefile?
LDFLAGS are empty. @homocomputeris Are you sure you built from here: #891 https://github.com/trholding/whisper.cpp
in that repo I get:
user@pc ~/whisper.cpp (git)-[master] % git remote -v
origin https://github.com/trholding/whisper.cpp.git (fetch)
origin https://github.com/trholding/whisper.cpp.git (push)
user@pc ~/whisper.cpp (git)-[master] % echo $CPPFLAGS
-I/usr/local/opt/opencl-headers/include
user@pc ~/whisper.cpp (git)-[master] % make clean && WHISPER_CLBLAST_NETLIB=1 WHISPER_NO_ACCELERATE=1 make
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLASTNETLIB
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -lclblast -lOpenCL
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLASTNETLIB -c ggml.c -o ggml.o
ggml.c:147:10: fatal error: 'clblast_netlib_c.h' file not found
#include <clblast_netlib_c.h>
^~~~~~~~~~~~~~~~~~~~
1 error generated.
make: *** [ggml.o] Error 1
@homocomputeris
I have reverted the last changes in my repo to fix something. I'll work on these issues today.
ggml.c:147:10: fatal error: 'clblast_netlib_c.h' file not found
This means that CLBlast needs to be compiled from scratch with -DNETLIB=ON and installed so that the header can be found.
On 1d17cd5bb37a3212679d6055ad69ba5a8d58eb71, macOS 13.3.1:
% make clean && WHISPER_NO_ACCELERATE=1 WHISPER_CLBLAST=1 make -j
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLAST
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -lclblast -lOpenCL
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLAST -c ggml.c -o ggml.o
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLAST -c ggml-opencl.c -o ggml-opencl.o
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -c whisper.cpp -o whisper.o
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/main/main.cpp examples/common.cpp examples/common-ggml.cpp ggml.o ggml-opencl.o whisper.o -o main -lclblast -lOpenCL
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/bench/bench.cpp ggml.o ggml-opencl.o whisper.o -o bench -lclblast -lOpenCL
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/quantize/quantize.cpp examples/common.cpp examples/common-ggml.cpp ggml.o ggml-opencl.o whisper.o -o quantize -lclblast -lOpenCL
ld: library not found for -lOpenCL
clang: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [bench] Error 1
make: *** Waiting for unfinished jobs....
ld: library not found for -lOpenCL
clang: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [quantize] Error 1
ld: library not found for -lOpenCL
clang: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [main] Error 1
After replacing -lOpenCL in the Makefile with -framework OpenCL, it compiles:
% make clean && WHISPER_NO_ACCELERATE=1 WHISPER_CLBLAST=1 make -j
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
sysctl: unknown oid 'hw.optional.arm64'
I whisper.cpp build info:
I UNAME_S: Darwin
I UNAME_P: i386
I UNAME_M: x86_64
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLAST
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
I LDFLAGS: -lclblast -framework OpenCL
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLAST -c ggml.c -o ggml.o
cc -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -mf16c -mfma -mavx -mavx2 -DGGML_USE_CLBLAST -c ggml-opencl.c -o ggml-opencl.o
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread -c whisper.cpp -o whisper.o
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/main/main.cpp examples/common.cpp examples/common-ggml.cpp ggml.o ggml-opencl.o whisper.o -o main -lclblast -framework OpenCL
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/bench/bench.cpp ggml.o ggml-opencl.o whisper.o -o bench -lclblast -framework OpenCL
c++ -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread examples/quantize/quantize.cpp examples/common.cpp examples/common-ggml.cpp ggml.o ggml-opencl.o whisper.o -o quantize -lclblast -framework OpenCL
./main -h
usage: ./main [options] file0.wav file1.wav ...
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-sow, --split-on-word [false ] split on word rather than on token
-bo N, --best-of N [2 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-olrc, --output-lrc [false ] output result in a lrc file
-owts, --output-words [false ] output script for generating karaoke video
-fp, --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
-ocsv, --output-csv [false ] output result in a CSV file
-oj, --output-json [false ] output result in a JSON file
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [true ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
-dl, --detect-language [false ] exit after automatically detecting language
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
But then crashes:
% ./main NL.wav -m models/ggml-base.bin -l auto
whisper_init_from_file_no_state: loading model from 'models/ggml-base.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab = 51865
whisper_model_load: n_audio_ctx = 1500
whisper_model_load: n_audio_state = 512
whisper_model_load: n_audio_head = 8
whisper_model_load: n_audio_layer = 6
whisper_model_load: n_text_ctx = 448
whisper_model_load: n_text_state = 512
whisper_model_load: n_text_head = 8
whisper_model_load: n_text_layer = 6
whisper_model_load: n_mels = 80
whisper_model_load: ftype = 1
whisper_model_load: type = 2
whisper_model_load: mem required = 310.00 MB (+ 6.00 MB per decoder)
whisper_model_load: adding 1608 extra tokens
whisper_model_load: model ctx = 140.60 MB
Initializing CLBlast (First Run)...
Attempting to use: Platform=0, Device=0 (If invalid, program will crash)
Using Platform: Apple Device: Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz
OpenCL clCreateCommandQueue error -30 at ggml-opencl.c:229
Selecting my GPU instead of CPU fails with the same error:
% GGML_CLBLAST_DEVICE=2 ./main NL.wav -m models/ggml-tiny.bin -l auto
whisper_init_from_file_no_state: loading model from 'models/ggml-tiny.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab = 51865
whisper_model_load: n_audio_ctx = 1500
whisper_model_load: n_audio_state = 384
whisper_model_load: n_audio_head = 6
whisper_model_load: n_audio_layer = 4
whisper_model_load: n_text_ctx = 448
whisper_model_load: n_text_state = 384
whisper_model_load: n_text_head = 6
whisper_model_load: n_text_layer = 4
whisper_model_load: n_mels = 80
whisper_model_load: ftype = 1
whisper_model_load: type = 1
whisper_model_load: mem required = 201.00 MB (+ 3.00 MB per decoder)
whisper_model_load: adding 1608 extra tokens
whisper_model_load: model ctx = 73.58 MB
Initializing CLBlast (First Run)...
Attempting to use: Platform=0, Device=2 (If invalid, program will crash)
Using Platform: Apple Device: AMD Radeon Pro 5500M Compute Engine
OpenCL clCreateCommandQueue error -30 at ggml-opencl.c:229
It seems that -30 means `CL_INVALID_VALUE` (based on https://stackoverflow.com/questions/24326432/convenient-way-to-show-opencl-error-codes). Perhaps this is another bug and out of scope for the current issue. 🤷🏼
But then... removing CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE got it to run!
Hello, I built and ran this on macOS 13.2 (Intel). These are the steps I followed:
In the Makefile I replaced -lOpenCL with -framework OpenCL:
diff --git a/Makefile b/Makefile
index 0787136..25ebe7a 100644
--- a/Makefile
+++ b/Makefile
@@ -173,7 +173,7 @@ endif
ifdef WHISPER_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
-LDFLAGS += -lclblast -lOpenCL
+LDFLAGS += -lclblast -framework OpenCL
WHISPER_OBJ += ggml-opencl.o
ggml-opencl.o: ggml-opencl.c ggml-opencl.h
In ggml-opencl.c I removed the property CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE from the clCreateCommandQueue() call, because at runtime I got error -30 (CL_INVALID_VALUE):
diff --git a/ggml-opencl.c b/ggml-opencl.c
index 4389eca..73b4dd0 100644
--- a/ggml-opencl.c
+++ b/ggml-opencl.c
@@ -225,7 +225,7 @@ void ggml_cl_init(void) {
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CL_CHECK(err, "clCreateContext");
-queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
+queue = clCreateCommandQueue(context, device, 0, &err);
CL_CHECK(err, "clCreateCommandQueue");
free(platforms);
Then I built it with WHISPER_NO_ACCELERATE=1 WHISPER_CLBLAST=1 make -j. This is because the Accelerate framework and CLBlast must be mutually exclusive.
At runtime OpenCL finds three devices: 0 = Intel CPU, 1 = Intel integrated GPU, 2 = AMD Radeon GPU. By default device 0 (the CPU) is chosen, but a device can be selected by setting the environment variable GGML_CLBLAST_DEVICE, for example:
GGML_CLBLAST_DEVICE=2 ./main -f ./samples/hp0.wav
I performed some simple tests in different configurations with time ./main -f ./samples/hp0.wav: without and with the Accelerate framework (the default Makefile configuration), and with CLBlast and the OpenCL framework on all possible devices, with the following results:
build | command | user | system | cpu | total |
---|---|---|---|---|---|
WHISPER_NO_ACCELERATE=1 make -j | ./main -f ./samples/hp0.wav | 147,50s | 2,49s | 381% | 39,320 |
make -j | ./main -f ./samples/hp0.wav | 97,92s | 1,29s | 499% | 19,858 |
WHISPER_NO_ACCELERATE=1 WHISPER_CLBLAST=1 make -j | GGML_CLBLAST_DEVICE=2 ./main -f ./samples/hp0.wav | 99,73s | 3,90s | 338% | 30,620 |
WHISPER_NO_ACCELERATE=1 WHISPER_CLBLAST=1 make -j | GGML_CLBLAST_DEVICE=1 ./main -f ./samples/hp0.wav | 136,73s | 4,28s | 330% | 42,661 |
With Device 0 (OpenCL on CPU) the test did not complete in a reasonable time.
The results show that the fastest execution is the one on the CPU with the Accelerate framework enabled (the default configuration), which is 2x faster than with no Accelerate and 1.5x faster than OpenCL on the AMD Radeon GPU. Obviously CPU usage is lower (~ 34% lower) when using the GPU.
I was able to successfully build and run it on 2b6a074 by following @fralken's instructions on macOS 13.3 (Intel).
However, after e693074, it crashes when run. Here's the full output:
$ GGML_CLBLAST_DEVICE=1 ./main -f samples/jfk.wav
whisper_init_from_file_no_state: loading model from 'models/ggml-base.en.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab = 51864
whisper_model_load: n_audio_ctx = 1500
whisper_model_load: n_audio_state = 512
whisper_model_load: n_audio_head = 8
whisper_model_load: n_audio_layer = 6
whisper_model_load: n_text_ctx = 448
whisper_model_load: n_text_state = 512
whisper_model_load: n_text_head = 8
whisper_model_load: n_text_layer = 6
whisper_model_load: n_mels = 80
whisper_model_load: ftype = 1
whisper_model_load: qntvr = 0
whisper_model_load: type = 2
whisper_model_load: mem required = 310.00 MB (+ 6.00 MB per decoder)
whisper_model_load: adding 1607 extra tokens
whisper_model_load: model ctx = 140.66 MB
Initializing CLBlast (First Run)...
Attempting to use: Platform=0, Device=1 (If invalid, program will crash)
Using Platform: Apple Device: Intel(R) Iris(TM) Plus Graphics 655
<program source>:1:133: error: variable length arrays are not supported in OpenCL
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
<program source>:1:223: error: variable length arrays are not supported in OpenCL
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
<program source>:1:341: error: variable length arrays are not supported in OpenCL
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
<program source>:1:442: error: variable length arrays are not supported in OpenCL
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
<program source>:1:523: error: variable length arrays are not supported in OpenCL
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
<program source>:1:635: error: automatic variable qualified with an address space
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
<program source>:1:972: error: automatic variable qualified with an address space
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
<program source>:1:1333: error: automatic variable qualified with an address space
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
<program source>:1:1855: error: automatic variable qualified with an address space
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
<program source>:1:2420: error: automatic variable qualified with an address space
typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; constant uint QK4_0 = 32; struct block_q4_0 { float d; uint8_t qs[QK4_0 / 2]; }; constant uint QK4_1 = 32; struct block_q4_1 { float d; float m; uint8_t qs[QK4_1 / 2]; }; constant uint QK5_0 = 32; struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; constant uint QK5_1 = 32; struct block_q5_1 { half d; half m; uint32_t qh; uint8_t qs[QK5_1 / 2]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; uint8_t qs[QK8_0]; }; __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { constant uint qk = QK4_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { constant uint qk = QK4_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; const float m = x[i].m; const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { constant uint qk = QK5_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; } __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { constant uint qk = QK5_1; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = vload_half(0, (__global half*) &x[i].d); const 
float m = vload_half(0, (__global half*) &x[i].m); uint32_t qh = x[i].qh; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { constant uint qk = QK8_0; const uint i = get_global_id(0) / qk; const uint j = get_local_id(0); const float d = x[i].d; y[i*qk + j] = x[i].qs[j]*d; }
^
A fix is in progress: https://github.com/ggerganov/llama.cpp/pull/1435#issuecomment-1546928978
Cannot build with OpenCL support on Intel-based macOS. Related to https://github.com/ggerganov/whisper.cpp/pull/863
System
Make
Install clblast from brew
Build
CMake