SYSTRAN / faster-whisper

Faster Whisper transcription with CTranslate2
MIT License

Bump version to 1.1.0 and update benchmarks #1161

Closed MahmoudAshraf97 closed 4 days ago

MahmoudAshraf97 commented 4 days ago

OpenAI Whisper Inference

import torch

# torch.set_num_threads(8)
from whisper import load_model, transcribe, load_audio

audio = load_audio("benchmark/benchmark.m4a")
model = load_model("large-v2", device="cpu")

use_cuda = True

if use_cuda:
    # Cast all parameters except the LayerNorm weights to FP16 to ensure half-precision inference
    state_dict = model.state_dict()
    for parameter_name, _ in model.named_parameters():
        if "ln" not in parameter_name:
            state_dict[parameter_name] = state_dict[parameter_name].half()
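    # assign=True keeps the converted FP16 tensors rather than copying them back into the original FP32 parameters (requires PyTorch >= 2.1)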
    model.load_state_dict(state_dict, assign=True)
    model = model.cuda()

result = transcribe(model, audio, beam_size=5, best_of=5, verbose=False)
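For comparison, the faster-whisper side of the benchmark can be run along these lines (a minimal sketch rather than the exact benchmark script; the model name, device, and compute_type shown here are assumptions):

from faster_whisper import WhisperModel

model = WhisperModel("large-v2", device="cuda", compute_type="float16")

# transcribe() returns a lazy generator; iterating it performs the actual decoding
segments, info = model.transcribe("benchmark/benchmark.m4a", beam_size=5, best_of=5)
for segment in segments:
    pass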
MahmoudAshraf97 commented 4 days ago

Measure CPU memory for whisper.cpp

#!/bin/bash

# Command to run
COMMAND="./main -m models/ggml-large-v2.bin -l auto -fa ../faster-whisper/benchmark/output.wav"
# Run the command and measure memory consumption
OUTPUT=$(/usr/bin/time -v $COMMAND 2>&1)

# Extract the peak memory usage from the output
PEAK_MEMORY=$(echo "$OUTPUT" | grep "Maximum resident set size" | awk '{print $6}')

# Convert to MB for readability
PEAK_MEMORY_MB=$(bc <<< "scale=2; $PEAK_MEMORY / 1024")

# Print the result
echo "Peak memory consumption: $PEAK_MEMORY_MB MB"

GPU memory

import subprocess
import time

import pynvml

def measure_gpu_memory(command):
    # Initialize NVML
    pynvml.nvmlInit()
    peak_memory = 0

    # Record the baseline memory usage on GPU 0 before launching the process
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    initial_memory = mem_info.used

    # Start the benchmarked process
    process = subprocess.Popen(command, shell=True)

    try:
        while process.poll() is None:  # Poll while the process is running
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            peak_memory = max(peak_memory, mem_info.used)
            time.sleep(0.5)
    finally:
        pynvml.nvmlShutdown()

    # Calculate memory usage difference (peak - initial)
    memory_difference = (peak_memory - initial_memory) / 1024 / 1024  # Convert to MB
    return memory_difference

if __name__ == "__main__":
    command = "./main -m models/ggml-large-v2.bin -l auto -fa ../faster-whisper/benchmark/output.wav"
    additional_memory = measure_gpu_memory(command)
    print(f"Additional GPU memory used: {additional_memory:.2f} MB")