This PR avoids the costly speaker embedding extraction step when the speaker diarization pipeline is used for voice activity detection with max_speakers = 1.
from pyannote.audio import Pipeline
from pyannote.audio.sample import SAMPLE_FILE
from pyannote.audio.pipelines.utils.hook import ProgressHook
# Load the pretrained speaker diarization pipeline once; both runs below reuse it.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")

# Run 1: regular speaker diarization. The hook prints one progress bar per
# pipeline step, and the embeddings step dominates the runtime.
with ProgressHook() as hook:
    speaker_diarization = pipeline(SAMPLE_FILE, hook=hook)
# segmentation ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:01
# speaker_counting ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
# embeddings ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:10
# discrete_diarization ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00

# Run 2: same pipeline used as a voice activity detector by capping the
# number of speakers at one. No embeddings step shows up in the progress output.
with ProgressHook() as hook:
    voice_activity_detection = pipeline(SAMPLE_FILE, max_speakers=1, hook=hook)
# segmentation ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:01
# speaker_counting ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
# discrete_diarization ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
# Look ma! No costly embeddings step!
This PR avoids the costly speaker embedding extraction step when the speaker diarization pipeline is used for voice activity detection with `max_speakers = 1`. It can be tested with