argmaxinc / WhisperKit

On-device Speech Recognition for Apple Silicon
https://takeargmax.com/blog/whisperkit
MIT License

Excessive Loading Time for Each Transcription with whisperkit-cli #222

Closed NextDoorLaoHuang-HF closed 3 days ago

NextDoorLaoHuang-HF commented 6 days ago

Environment:

Issue Description:

I compiled whisperkit-cli using the whisperkittools pipeline. I then noticed that every time the pipeline is started it recompiles whisperkit-cli, downloads (or checks for updates to) the model, and each transcription takes an extremely long time, on the order of hours (I was transcribing an audio file of a few tens of seconds). Since the pipeline essentially just launches whisperkit-cli from the command line, I tried to avoid the recompilation and model download on each startup by launching whisperkit-cli directly from Python's subprocess module with the model directory specified. However, across multiple runs this approach still results in startup times measured in hours, and I observed that ANECompilerService starts up every time and does not exit immediately after the transcription finishes (is it converting the model each time?).

Below is the startup code I used:

import os
import subprocess
import json
import re
from whisperkittools.whisperkit.test_utils import CoreMLSwiftComputeUnit

class WhisperTranscriber:
    def __init__(self, whisper_version, out_dir):
        self.whisper_version = whisper_version
        self.out_dir = out_dir
        # Paths produced by the whisperkittools build: model directory,
        # release CLI binary, and the directory for JSON transcription reports
        self.model_dir = os.path.join(out_dir, "WhisperKit", "Models", whisper_version)
        self.cli_path = os.path.join(out_dir, "WhisperKit", ".build", "release", "whisperkit-cli")
        self.results_dir = os.path.join(self.model_dir, "results")
        self.compute_unit = CoreMLSwiftComputeUnit.ANE

    def convert_video_to_audio(self, video_path, audio_path):
        # Extract 16 kHz mono 16-bit PCM audio, the format expected by WhisperKit;
        # paths are quoted so the shell command survives spaces
        command = f'ffmpeg -i "{video_path}" -vn -acodec pcm_s16le -ar 16000 -ac 1 "{audio_path}"'
        subprocess.call(command, shell=True)

    def create_srt_file(self, result, out_file):
        def clean_text(text):
            # Remove all special tags
            cleaned = re.sub(r'<\|.*?\|>', '', text)
            # Remove excess whitespace
            cleaned = ' '.join(cleaned.split())
            return cleaned

        with open(out_file, "w", encoding="utf-8") as f:
            for i, segment in enumerate(result["segments"], start=1):
                start_time = self.format_time(segment["start"])
                end_time = self.format_time(segment["end"])
                text = clean_text(segment["text"].strip())
                if text:  # Write only if text is not empty
                    f.write(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
        print(f"Subtitle file saved to: {out_file}")

    @staticmethod
    def format_time(seconds):
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        seconds = seconds % 60
        milliseconds = int((seconds - int(seconds)) * 1000)
        return f"{hours:02d}:{minutes:02d}:{int(seconds):02d},{milliseconds:03d}"

    def transcribe_audio(self, audio_path):
        # Invoke the prebuilt whisperkit-cli binary against the local model directory
        # and have it write a JSON report into the results directory
        cmd = " ".join([
            self.cli_path,
            "transcribe",
            "--audio-path", f'"{audio_path}"',
            "--model-path", f'"{self.model_dir}"',
            "--text-decoder-compute-units", self.compute_unit.value,
            "--audio-encoder-compute-units", self.compute_unit.value,
            "--report-path", f'"{self.results_dir}"', "--report",
        ])

        print(f"Executing command: {cmd}")
        subprocess.run(cmd, shell=True, check=True)

        result_path = os.path.join(
            self.results_dir,
            os.path.splitext(os.path.basename(audio_path))[0] + ".json"
        )

        if not os.path.exists(result_path):
            raise FileNotFoundError(f"Transcription result file not found: {result_path}")

        with open(result_path, "r") as f:
            result = json.load(f)

        print(result)

        output_file = os.path.join(self.out_dir, "transcription_result.txt")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(result["text"])

        print(f"Transcription result saved to: {output_file}")

        srt_file = os.path.join(self.out_dir, "subtitles.srt")
        self.create_srt_file(result, srt_file)

        return result

    def transcribe_video(self, video_path):
        audio_path = os.path.join(self.out_dir, "temp_audio.wav")

        self.convert_video_to_audio(video_path, audio_path)

        result = self.transcribe_audio(audio_path)

        # Clean up the temporary WAV once transcription has finished
        os.remove(audio_path)

        return result

if __name__ == "__main__":
    whisper_version = "openai_whisper-large-v3-v20240930_turbo_632MB"
    out_dir = "/Users/MovieCatcher/sub"

    transcriber = WhisperTranscriber(whisper_version, out_dir)

    video_path = "/Users/Downloads/xiaopeng.mp4"
    transcriber.transcribe_video(video_path)

Timing Information Output:

{
  "timings": {
    "encoding": 52.55032193660736,
    "decodingFiltering": 7.855892181396484e-05,
    "totalLogmelRuns": 2,
    "decoderLoadTime": 1.049233078956604,
    "decodingWordTimestamps": 0,
    "totalKVUpdateRuns": 70,
    "logmels": 0.15710794925689697,
    "encoderLoadTime": 1981.2565809488297,
    "prewarmLoadTime": 0,
    "decodingWindowing": 0.000102996826171875,
    "decodingLoop": 54.988917112350464,
    "audioLoading": 0.03239607810974121,
    "fullPipeline": 33.905139088630676,
    "prefill": 0,
    "firstTokenTime": 750516220.787694,
    "totalDecodingFallbacks": 0,
    "inputAudioSeconds": 41.0026875,
    "tokenizerLoadTime": 0.18291699886322021,
    "decodingPredictions": 2.066950559616089,
    "decodingKvCaching": 0.03084409236907959,
    "decodingInit": 0.009521007537841797,
    "audioProcessing": 0.0002499818801879883,
    "totalEncodingRuns": 2,
    "pipelineStart": 750516201.134715,
    "totalDecodingWindows": 2,
    "totalAudioProcessingRuns": 2,
    "modelLoading": 1982.6185909509659,
    "totalDecodingLoops": 72,
    "totalTimestampAlignmentRuns": 0,
    "decodingNonPrediction": 0.19201219081878662,
    "decodingFallback": 0,
    "decodingSampling": 0.04196739196777344
  }
}
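
Most of the wall-clock time in the report above is model loading: encoderLoadTime is roughly 1981 s (about 33 minutes) and dominates modelLoading (~1983 s), while the transcription itself (fullPipeline) is only ~34 s for ~41 s of audio, which is consistent with ANECompilerService re-specializing the encoder on each launch. Below is a minimal sketch that pulls those figures out of the report JSON; the path is hypothetical and assumes the results layout used in the script above and that the report contains the timings block shown.

import json

# Hypothetical report path; matches the results_dir/<audio stem>.json layout used above
report_path = "/Users/MovieCatcher/sub/WhisperKit/Models/openai_whisper-large-v3-v20240930_turbo_632MB/results/temp_audio.json"

with open(report_path) as f:
    timings = json.load(f)["timings"]

# Compare startup cost (model/encoder load) against the actual transcription work
for key in ("modelLoading", "encoderLoadTime", "decoderLoadTime", "fullPipeline", "inputAudioSeconds"):
    print(f"{key:>20}: {timings[key]:.2f}")

# With the values reported above this prints roughly:
#         modelLoading: 1982.62   <- dominated by encoderLoadTime
#      encoderLoadTime: 1981.26   <- where ANECompilerService appears to spend its time
#      decoderLoadTime: 1.05
#         fullPipeline: 33.91     <- the transcription itself
#    inputAudioSeconds: 41.00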
atiorh commented 6 days ago

Hi @NextDoorLaoHuang-HF,

NextDoorLaoHuang-HF commented 4 days ago

Thank you very much for your prompt response. After switching to the non-turbo model, it works smoothly and the experience is excellent.
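
For reference, the switch amounts to changing the model name passed to the script above. A minimal sketch follows; the non-turbo variant name is an assumption, so substitute whichever model directory whisperkittools actually generated.

if __name__ == "__main__":
    # Hypothetical non-turbo model folder name; use the directory that
    # whisperkittools actually produced under WhisperKit/Models
    whisper_version = "openai_whisper-large-v3-v20240930"
    out_dir = "/Users/MovieCatcher/sub"

    transcriber = WhisperTranscriber(whisper_version, out_dir)
    transcriber.transcribe_video("/Users/Downloads/xiaopeng.mp4")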

atiorh commented 3 days ago

Glad to hear! Please let us know if you run into any other issues.