FSMN VAD


Supported Python versions: 3.7, 3.8, 3.9, 3.10

An enterprise-grade voice activity detector from ModelScope and FunASR.


Key Features


Installation

git clone https://github.com/lovemefan/fsmn-vad
cd fsmn-vad
python setup.py install
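As an alternative to setup.py (not a documented install path, just standard pip behaviour for a project that ships a setup.py), installing from the cloned checkout with pip should also work:

pip install .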

Usage

Offline (non-streaming)

from fsmnvad import FSMNVad
from pathlib import Path
vad = FSMNVad()
segments = vad.segments_offline(Path("/path/audio/vad_example.wav"))
print(segments)
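segments holds the detected speech regions. A minimal sketch of consuming it, assuming each entry is a [start, end] timestamp pair in milliseconds (the usual FunASR VAD convention; verify against your own output):

for start_ms, end_ms in segments:
    # start_ms / end_ms assumed to be milliseconds; values illustrative only
    print(f"speech from {start_ms} ms to {end_ms} ms ({(end_ms - start_ms) / 1000:.2f} s)")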

Online (streaming)

from fsmnvad import FSMNVadOnline
from fsmnvad import AudioReader

in_cache = []  # streaming state carried between calls
speech, sample_rate = AudioReader.read_wav_file('/path/audio/vad_example.wav')
speech_length = speech.shape[0]

vad_online = FSMNVadOnline()

# feed the audio in 1600-sample chunks (100 ms at a 16 kHz sample rate)
step = 1600
for sample_offset in range(0, speech_length, step):
    # mark the last chunk so the detector can flush its internal state
    is_final = sample_offset + step >= speech_length
    segments_result, in_cache = vad_online.segments_online(
        speech[sample_offset: sample_offset + step],
        in_cache=in_cache, is_final=is_final)
    if segments_result:
        print(segments_result)
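For reuse, the streaming loop above can be wrapped in a small helper. This is a minimal sketch built only from the calls shown in this README (AudioReader.read_wav_file and FSMNVadOnline.segments_online); the helper name, chunk size, and return handling are assumptions for illustration, not a documented API:

from pathlib import Path

from fsmnvad import AudioReader, FSMNVadOnline


def stream_segments(wav_path: Path, chunk_size: int = 1600):
    """Yield streaming VAD results chunk by chunk for a wav file (hypothetical helper)."""
    speech, sample_rate = AudioReader.read_wav_file(str(wav_path))
    vad_online = FSMNVadOnline()
    in_cache = []  # streaming state carried between calls
    total = speech.shape[0]
    for offset in range(0, total, chunk_size):
        is_final = offset + chunk_size >= total
        result, in_cache = vad_online.segments_online(
            speech[offset: offset + chunk_size],
            in_cache=in_cache, is_final=is_final)
        if result:
            yield result


for segments in stream_segments(Path("/path/audio/vad_example.wav")):
    print(segments)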

Citation

@inproceedings{zhang2018deep,
  title={Deep-FSMN for large vocabulary continuous speech recognition},
  author={Zhang, Shiliang and Lei, Ming and Yan, Zhijie and Dai, Lirong},
  booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages={5869--5873},
  year={2018},
  organization={IEEE}
}
@misc{FunASR,
  author = {Speech Lab, Alibaba Group, China},
  title = {FunASR: A Fundamental End-to-End Speech Recognition Toolkit},
  year = {2023},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/alibaba-damo-academy/FunASR/}},
}