pyannote / pyannote-audio

Neural building blocks for speaker diarization: speech activity detection, speaker change detection, overlapped speech detection, speaker embedding
http://pyannote.github.io
MIT License
6.38k stars 784 forks source link

improve(pipeline): do not extract embeddings in `SpeakerDiarization` pipeline when `max_speakers` is 1 #1686

Closed hbredin closed 6 months ago

hbredin commented 7 months ago

This PR skips the costly speaker embedding extraction step when the speaker diarization pipeline is run with `max_speakers=1`, i.e. when it is effectively used for voice activity detection.


from pyannote.audio import Pipeline
from pyannote.audio.sample import SAMPLE_FILE
from pyannote.audio.pipelines.utils.hook import ProgressHook

# Load the pretrained speaker diarization pipeline identified by this name.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")

# Baseline run with default settings. The hook reports per-step progress
# (sample output below); the embeddings step takes most of the time.
with ProgressHook() as hook:
    speaker_diarization = pipeline(SAMPLE_FILE, hook=hook)

# segmentation         ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:01
# speaker_counting     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
# embeddings           ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:10
# discrete_diarization ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00

# Same pipeline capped at one speaker, i.e. used as voice activity detection.
# With this change the embeddings step is no longer run (see output below).
with ProgressHook() as hook:
    voice_activity_detection = pipeline(SAMPLE_FILE, max_speakers=1, hook=hook)

# segmentation         ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:01
# speaker_counting     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
# discrete_diarization ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
# Look ma! No costly embeddings step!

It can be tested by installing the feature branch:

pip install https://github.com/pyannote/pyannote-audio/archive/feat/skip-embedding-when-max-speakers-is-1.tar.gz