argmaxinc / WhisperKit

On-device Speech Recognition for Apple Silicon
https://takeargmax.com/blog/whisperkit
MIT License

Excessive Loading Time for Each Transcription with whisperkit-cli #222

Closed NextDoorLaoHuang-HF closed 3 days ago

NextDoorLaoHuang-HF commented 6 days ago

Environment:

Issue Description:

I compiled whisperkit-cli using the whisperkittools pipeline. I then noticed that every time the pipeline is started it recompiles whisperkit-cli, downloads (or checks for updates to) the model, and each transcription takes an extremely long time, on the order of hours (I was transcribing an audio file of a few tens of seconds). Since the pipeline essentially just launches whisperkit-cli from the command line, I tried to avoid the recompilation and model download on each startup by launching whisperkit-cli directly from Python's subprocess module with the model directory specified. However, across multiple runs this approach still results in startup times measured in hours, and I observed that ANECompilerService starts up every time and does not exit immediately after the transcription finishes (is it converting the model each time?).

Below is the startup code I used:

import os
import subprocess
import json
import re
from whisperkittools.whisperkit.test_utils import CoreMLSwiftComputeUnit

class WhisperTranscriber:
    def __init__(self, whisper_version, out_dir):
        self.whisper_version = whisper_version
        self.out_dir = out_dir
        # Paths produced by the whisperkittools build: model directory,
        # release CLI binary, and the directory for JSON transcription reports
        self.model_dir = os.path.join(out_dir, "WhisperKit", "Models", whisper_version)
        self.cli_path = os.path.join(out_dir, "WhisperKit", ".build", "release", "whisperkit-cli")
        self.results_dir = os.path.join(self.model_dir, "results")
        self.compute_unit = CoreMLSwiftComputeUnit.ANE

    def convert_video_to_audio(self, video_path, audio_path):
        # Extract 16 kHz mono 16-bit PCM audio, the format expected by WhisperKit;
        # paths are quoted so the shell command survives spaces
        command = f'ffmpeg -i "{video_path}" -vn -acodec pcm_s16le -ar 16000 -ac 1 "{audio_path}"'
        subprocess.call(command, shell=True)

    def create_srt_file(self, result, out_file):
        def clean_text(text):
            # Remove all special tags
            cleaned = re.sub(r'<\|.*?\|>', '', text)
            # Remove excess whitespace
            cleaned = ' '.join(cleaned.split())
            return cleaned

        with open(out_file, "w", encoding="utf-8") as f:
            for i, segment in enumerate(result["segments"], start=1):
                start_time = self.format_time(segment["start"])
                end_time = self.format_time(segment["end"])
                text = clean_text(segment["text"].strip())
                if text:  # Write only if text is not empty
                    f.write(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
        print(f"Subtitle file saved to: {out_file}")

    @staticmethod
    def format_time(seconds):
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        seconds = seconds % 60
        milliseconds = int((seconds - int(seconds)) * 1000)
        return f"{hours:02d}:{minutes:02d}:{int(seconds):02d},{milliseconds:03d}"

    def transcribe_audio(self, audio_path):
        # Invoke the prebuilt whisperkit-cli binary against the local model directory
        # and have it write a JSON report into the results directory
        cmd = " ".join([
            self.cli_path,
            "transcribe",
            "--audio-path", f'"{audio_path}"',
            "--model-path", f'"{self.model_dir}"',
            "--text-decoder-compute-units", self.compute_unit.value,
            "--audio-encoder-compute-units", self.compute_unit.value,
            "--report-path", f'"{self.results_dir}"', "--report",
        ])

        print(f"Executing command: {cmd}")
        subprocess.run(cmd, shell=True, check=True)

        result_path = os.path.join(
            self.results_dir,
            os.path.splitext(os.path.basename(audio_path))[0] + ".json"
        )

        if not os.path.exists(result_path):
            raise FileNotFoundError(f"Transcription result file not found: {result_path}")

        with open(result_path, "r") as f:
            result = json.load(f)

        print(result)

        output_file = os.path.join(self.out_dir, "transcription_result.txt")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(result["text"])

        print(f"Transcription result saved to: {output_file}")

        srt_file = os.path.join(self.out_dir, "subtitles.srt")
        self.create_srt_file(result, srt_file)

        return result

    def transcribe_video(self, video_path):
        audio_path = os.path.join(self.out_dir, "temp_audio.wav")

        self.convert_video_to_audio(video_path, audio_path)

        result = self.transcribe_audio(audio_path)

        # Clean up the temporary WAV once transcription has finished
        os.remove(audio_path)

        return result

if __name__ == "__main__":
    whisper_version = "openai_whisper-large-v3-v20240930_turbo_632MB"
    out_dir = "/Users/MovieCatcher/sub"

    transcriber = WhisperTranscriber(whisper_version, out_dir)

    video_path = "/Users/Downloads/xiaopeng.mp4"
    transcriber.transcribe_video(video_path)

Timing Information Output:

{
  "timings": {
    "encoding": 52.55032193660736,
    "decodingFiltering": 7.855892181396484e-05,
    "totalLogmelRuns": 2,
    "decoderLoadTime": 1.049233078956604,
    "decodingWordTimestamps": 0,
    "totalKVUpdateRuns": 70,
    "logmels": 0.15710794925689697,
    "encoderLoadTime": 1981.2565809488297,
    "prewarmLoadTime": 0,
    "decodingWindowing": 0.000102996826171875,
    "decodingLoop": 54.988917112350464,
    "audioLoading": 0.03239607810974121,
    "fullPipeline": 33.905139088630676,
    "prefill": 0,
    "firstTokenTime": 750516220.787694,
    "totalDecodingFallbacks": 0,
    "inputAudioSeconds": 41.0026875,
    "tokenizerLoadTime": 0.18291699886322021,
    "decodingPredictions": 2.066950559616089,
    "decodingKvCaching": 0.03084409236907959,
    "decodingInit": 0.009521007537841797,
    "audioProcessing": 0.0002499818801879883,
    "totalEncodingRuns": 2,
    "pipelineStart": 750516201.134715,
    "totalDecodingWindows": 2,
    "totalAudioProcessingRuns": 2,
    "modelLoading": 1982.6185909509659,
    "totalDecodingLoops": 72,
    "totalTimestampAlignmentRuns": 0,
    "decodingNonPrediction": 0.19201219081878662,
    "decodingFallback": 0,
    "decodingSampling": 0.04196739196777344
  }
}
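
Most of the wall-clock time in the report above is model loading: encoderLoadTime is roughly 1981 s (about 33 minutes) and dominates modelLoading (~1983 s), while the transcription itself (fullPipeline) is only ~34 s for ~41 s of audio, which is consistent with ANECompilerService re-specializing the encoder on each launch. Below is a minimal sketch that pulls those figures out of the report JSON; the path is hypothetical and assumes the results layout used in the script above and that the report contains the timings block shown.

import json

# Hypothetical report path; matches the results_dir/<audio stem>.json layout used above
report_path = "/Users/MovieCatcher/sub/WhisperKit/Models/openai_whisper-large-v3-v20240930_turbo_632MB/results/temp_audio.json"

with open(report_path) as f:
    timings = json.load(f)["timings"]

# Compare startup cost (model/encoder load) against the actual transcription work
for key in ("modelLoading", "encoderLoadTime", "decoderLoadTime", "fullPipeline", "inputAudioSeconds"):
    print(f"{key:>20}: {timings[key]:.2f}")

# With the values reported above this prints roughly:
#         modelLoading: 1982.62   <- dominated by encoderLoadTime
#      encoderLoadTime: 1981.26   <- where ANECompilerService appears to spend its time
#      decoderLoadTime: 1.05
#         fullPipeline: 33.91     <- the transcription itself
#    inputAudioSeconds: 41.00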
atiorh commented 6 days ago

Hi @NextDoorLaoHuang-HF,

NextDoorLaoHuang-HF commented 4 days ago

Thank you very much for your prompt response. After switching to the non-turbo model, it works smoothly and the experience is excellent.
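
For reference, the switch amounts to changing the model name passed to the script above. A minimal sketch follows; the non-turbo variant name is an assumption, so substitute whichever model directory whisperkittools actually generated.

if __name__ == "__main__":
    # Hypothetical non-turbo model folder name; use the directory that
    # whisperkittools actually produced under WhisperKit/Models
    whisper_version = "openai_whisper-large-v3-v20240930"
    out_dir = "/Users/MovieCatcher/sub"

    transcriber = WhisperTranscriber(whisper_version, out_dir)
    transcriber.transcribe_video("/Users/Downloads/xiaopeng.mp4")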

atiorh commented 3 days ago

Glad to hear! Please let us know if you run into any other issues.