ggerganov / whisper.cpp

Port of OpenAI's Whisper model in C/C++
MIT License

C style api on mac is slow #1257

Closed · yyccR closed this issue 11 months ago

yyccR commented 11 months ago

Hi, thanks for your great work. I wanted to run it on macOS using the C-style API, but it runs slowly.

I call the C API with the following piece of code:

#include <cstdio>
#include <string>
#include <vector>

#include "common.h"   // read_wav() helper, e.g. from the whisper.cpp examples (dr_wav based)
#include "whisper.h"

void test_whisper_ggml_infer() {
    std::string model_file = "/whisper/ggml-whisper-tiny.bin";
    std::string wav_file = "/data/audio/test_english.wav";

    // load the 16 kHz mono samples from the wav file
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    read_wav(wav_file, pcmf32, pcmf32s, false);

    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    params.print_realtime = false;
    params.print_progress = false;
    params.print_timestamps = false;
    params.print_special = false;
    params.translate = false;
    params.language = "en";
    params.n_threads = 5;
    params.offset_ms = 0;
    params.no_context = true;
    params.single_segment = false;

    struct whisper_context * ctx = whisper_init_from_file(model_file.c_str());
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return;
    }

    if (whisper_full(ctx, params, pcmf32.data(), pcmf32.size()) != 0) {
        fprintf(stderr, "failed to process audio\n");
        whisper_free(ctx);
        return;
    }

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        printf("%s", text);
    }
    printf("\n");

    whisper_free(ctx);
}

and here is my CMakeLists.txt:

cmake_minimum_required(VERSION 3.17)
project(test_ggml)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O2")

include_directories(ggml-1.4.2)
include_directories(whisper)
include_directories(drwav)

aux_source_directory(${CMAKE_SOURCE_DIR}/ggml-1.4.2 ggml_src)
set(whisper_src ${CMAKE_SOURCE_DIR}/whisper/whisper.cpp)
add_library(ggml SHARED ${ggml_src} ${whisper_src})

add_executable(test_ggml main.cpp)
target_link_libraries(test_ggml ggml)

Here is the run log; it takes about 30 s for a 22 s audio file:

whisper_init_from_file_no_state: loading model from 'xxxx/whisper/ggml-whisper-tiny.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51865
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 384
whisper_model_load: n_audio_head  = 6
whisper_model_load: n_audio_layer = 4
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 384
whisper_model_load: n_text_head   = 6
whisper_model_load: n_text_layer  = 4
whisper_model_load: n_mels        = 80
whisper_model_load: ftype         = 1
whisper_model_load: qntvr         = 0
whisper_model_load: type          = 1
whisper_model_load: mem required  =  201.00 MB (+    3.00 MB per decoder)
whisper_model_load: adding 1608 extra tokens
whisper_model_load: model ctx     =   73.62 MB
whisper_model_load: model size    =   73.54 MB
whisper_init_state: kv self size  =    2.62 MB
whisper_init_state: kv cross size =    8.79 MB
whisper_init_state: kv self size  =    2.62 MB
whisper_init_state: kv cross size =    8.79 MB
whisper_init_state: kv self size  =    2.62 MB
whisper_init_state: kv cross size =    8.79 MB
whisper_init_state: kv self size  =    2.62 MB
whisper_init_state: kv cross size =    8.79 MB
whisper_init_state: kv self size  =    2.62 MB
whisper_init_state: kv cross size =    8.79 MB

whisper_full_parallel: the audio has been split into 5 chunks at the following times:
whisper_full_parallel: split 1 - 00:00:04.310
whisper_full_parallel: split 2 - 00:00:08.620
whisper_full_parallel: split 3 - 00:00:12.940
whisper_full_parallel: split 4 - 00:00:17.250
whisper_full_parallel: the transcription quality may be degraded near these boundaries
 Please call Stella, ask her to bring these things with her from the store. 6 spoons of fresh snow peas, 5 thick slabs of blue tea. and maybe snack for her brother, Rob. We also need a small plastic snack. and a big toy frog for the kids. She will scoop these things up into three red and we will go meet her Wednesday at the Training Station.

But when I run it on the command line like this:

./main -m /whisper/ggml-whisper-tiny.bin -f /data/audio/test_english.wav

it runs very fast, taking only about 2 s.
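
To compare where the time goes between the two runs, whisper.h also provides whisper_print_timings(), which reports the load / mel / encode / decode breakdown. A small sketch of where it could be called in the snippet above:

    // optional: print whisper's internal timings, then free the context as before
    whisper_print_timings(ctx);
    whisper_free(ctx);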

Thank you.

bobqianic commented 11 months ago
whisper_full_parallel: the audio has been split into 5 chunks at the following times:
whisper_full_parallel: split 1 - 00:00:04.310
whisper_full_parallel: split 2 - 00:00:08.620
whisper_full_parallel: split 3 - 00:00:12.940
whisper_full_parallel: split 4 - 00:00:17.250

whisper_full_parallel isn't very efficient for short audio clips under 30 seconds. It creates multiple KV caches and KV crosses, which kind of defeats the purpose of having KV caches in the first place. It also ends up doing multiple encoding passes when usually just one would suffice for a 30-second clip. Additionally, it calculates the log-mel spectrogram multiple times, which is not only unnecessary but also contributes to the overall slowdown.

    struct whisper_context * ctx = whisper_init_from_file(model_file.c_str());
    if (whisper_full(ctx, params, pcmf32.data(), pcmf32.size()) != 0) {
        fprintf(stderr, "failed to process audio\n");
        return ;
    }

But you actually called whisper_full, not whisper_full_parallel. I'm not sure why it ended up using whisper_full_parallel to process your audio.
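
For reference, here is a minimal sketch of the two entry points from whisper.h, using the variable names from your snippet. For clips of roughly 30 seconds or less, the single-state whisper_full call is normally the better choice:

    // single pass: one state, one log-mel computation, one encoder run per 30 s window
    int ret = whisper_full(ctx, params, pcmf32.data(), pcmf32.size());

    // parallel variant: splits the audio into n_processors chunks, each with its own
    // state and KV cache, so it only pays off for long recordings
    // int ret = whisper_full_parallel(ctx, params, pcmf32.data(), pcmf32.size(), 4);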

yyccR commented 11 months ago

Sorry, that log was generated when I tested whisper_full_parallel.

Here is the log from running whisper_full:

whisper_init_from_file_no_state: loading model from 'xxxx/whisper/ggml-whisper-tiny.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51865
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 384
whisper_model_load: n_audio_head  = 6
whisper_model_load: n_audio_layer = 4
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 384
whisper_model_load: n_text_head   = 6
whisper_model_load: n_text_layer  = 4
whisper_model_load: n_mels        = 80
whisper_model_load: ftype         = 1
whisper_model_load: qntvr         = 0
whisper_model_load: type          = 1
whisper_model_load: mem required  =  201.00 MB (+    3.00 MB per decoder)
whisper_model_load: adding 1608 extra tokens
whisper_model_load: model ctx     =   73.62 MB
whisper_model_load: model size    =   73.54 MB
whisper_init_state: kv self size  =    2.62 MB
whisper_init_state: kv cross size =    8.79 MB
 Please call Stella, ask her to bring these things with her from the store. Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother, Bob. We also need a small plastic snake and a big toy frog for the kids. She will scoop these things up into three red bags, and we will go meet her Wednesday at the training station.

And the test audio file is here.

bobqianic commented 11 months ago

If you're actually using whisper_full and still finding it slow, the issue might be with your CMake configuration. You could try using find_package to link against OpenBLAS, which should speed up the matrix multiplications. For reference, this is roughly how whisper.cpp's own CMakeLists.txt handles it:

if (WHISPER_BLAS)
    if (WIN32)
        if(DEFINED ENV{OPENBLAS_PATH})
            set(BLAS_LIBRARIES $ENV{OPENBLAS_PATH}/lib/libopenblas.dll.a)
            message(STATUS "Libraries ${BLAS_LIBRARIES}")
            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
            include_directories($ENV{OPENBLAS_PATH}/include)
            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
        else ()
            message(WARNING "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
        endif ()
    else ()
        set(BLA_STATIC 1)
        set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
        #    set(BLA_PREFER_PKGCONFIG 1)
        set(BLA_SIZEOF_INTEGER 8)
        find_package(BLAS)

        if(BLAS_FOUND)
            message(STATUS "BLAS compatible library found")
            message(STATUS "Libraries ${BLAS_LIBRARIES}")
            find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas /usr/local/include/openblas $ENV{BLAS_HOME}/include)
            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
            include_directories(${BLAS_INCLUDE_DIRS})
            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
        else()
            message(WARNING "BLAS library was not found")
        endif()
    endif ()
endif ()
yyccR commented 11 months ago

I have tried compiling with OpenBLAS, but it is still not as fast as the command line. Maybe I can build libwhisper.dylib and link everything against that instead.

yyccR commented 11 months ago

I built libwhisper.dylib and that solved the problem.

cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON ..

-- The C compiler identification is AppleClang 12.0.5.12050022
-- The CXX compiler identification is AppleClang 12.0.5.12050022
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found Git: /usr/bin/git (found version "2.30.1 (Apple Git-130)") 
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE  
-- Accelerate framework found
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- x86 detected
-- Configuring done
-- Generating done
-- Build files have been written to: /opt/whisper.cpp/build

make
Scanning dependencies of target whisper
[  8%] Building C object CMakeFiles/whisper.dir/ggml.c.o
[ 16%] Building CXX object CMakeFiles/whisper.dir/whisper.cpp.o
[ 25%] Linking CXX shared library libwhisper.dylib
[ 25%] Built target whisper
Scanning dependencies of target common
[ 33%] Building CXX object examples/CMakeFiles/common.dir/common.cpp.o
[ 41%] Building CXX object examples/CMakeFiles/common.dir/common-ggml.cpp.o
[ 50%] Linking CXX static library libcommon.a
[ 50%] Built target common
Scanning dependencies of target main
[ 58%] Building CXX object examples/main/CMakeFiles/main.dir/main.cpp.o
[ 66%] Linking CXX executable ../../bin/main
[ 66%] Built target main
Scanning dependencies of target bench
[ 75%] Building CXX object examples/bench/CMakeFiles/bench.dir/bench.cpp.o
[ 83%] Linking CXX executable ../../bin/bench
[ 83%] Built target bench
Scanning dependencies of target quantize
[ 91%] Building CXX object examples/quantize/CMakeFiles/quantize.dir/quantize.cpp.o
[100%] Linking CXX executable ../../bin/quantize
[100%] Built target quantize