Closed elpimous closed 7 years ago
The Voice Activity Detector is not 100% accurate, it will record silences too. You can increase the VAD level at https://github.com/respeaker/respeaker_python_library/blob/master/respeaker/vad.py#L25
The level is an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.
Hi xiongyihui, and all: small mods to your 'bing_stt_with_vad.py' did the job. Here we record voice only, even if there is a lot of silence before the speech starts, and the recording stops right after the voice ends. (It doesn't seem to adapt dynamically to environment noise — TODO.)
import webrtcvad
import collections
import contextlib
import sys
import time
import signal
import pyaudio
import wave
# Capture parameters. webrtcvad only accepts 16-bit mono PCM at
# 8/16/32/48 kHz, delivered in frames of 10, 20 or 30 ms.
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
SAMPWIDTH = 2  # bytes per sample (16-bit PCM)
CHUNK_DURATION_MS = 30 # supports 10, 20 and 30 (ms)
PADDING_DURATION_MS = 1000  # pre-speech audio kept so the utterance start isn't clipped
CHUNK_SIZE = int(RATE * CHUNK_DURATION_MS / 1000)  # samples per chunk
CHUNK_BYTES = CHUNK_SIZE * 2  # bytes per chunk (SAMPWIDTH * CHUNK_SIZE)
NUM_PADDING_CHUNKS = int(PADDING_DURATION_MS / CHUNK_DURATION_MS)
NUM_WINDOW_CHUNKS = int(240 / CHUNK_DURATION_MS)  # ~240 ms sliding window of VAD flags
vad = webrtcvad.Vad(2) # aggressiveness 0 to 3 --> 0 is the least aggressive, 3 the most
audiopath = '/home/nvidia/Documents/respeaker.wav' # path to recorded audio file
# Open the microphone stream but do not start capturing yet
# (start=False); the main loop calls start_stream() per utterance.
pa = pyaudio.PyAudio()
stream = pa.open(format=FORMAT,
                 channels=CHANNELS,
                 rate=RATE,
                 input=True,
                 start=False,
                 # input_device_index=2,
                 frames_per_buffer=CHUNK_SIZE)
got_a_sentence = False  # set once a complete utterance has been captured (or on SIGINT)
leave = False  # set by the SIGINT handler to exit the outer loop
def write_wavefile(data):
    """Persist raw PCM frames to ``audiopath`` as a WAV file.

    The WAV header is built from the module-level capture settings
    (RATE, CHANNELS, SAMPWIDTH), so the written file matches the
    format of the PyAudio stream that produced ``data``.
    """
    with contextlib.closing(wave.open(audiopath, 'wb')) as wav_out:
        # All header fields are set before the frames are written.
        wav_out.setnchannels(CHANNELS)
        wav_out.setsampwidth(SAMPWIDTH)
        wav_out.setframerate(RATE)
        wav_out.writeframes(data)
def handle_int(sig, chunk):
    """SIGINT handler: ask both recording loops to stop.

    Marks the current utterance as complete so the inner capture loop
    drains, and raises the leave flag so the outer loop terminates.
    """
    global leave, got_a_sentence
    got_a_sentence = True
    leave = True
# Stop cleanly on Ctrl-C instead of dying mid-record.
signal.signal(signal.SIGINT, handle_int)

# Main capture loop: one iteration records one utterance.
#
# A deque of NUM_PADDING_CHUNKS chunks keeps recent audio so that,
# when speech is detected, the moments just before the trigger are
# included in the recording. A sliding window of per-chunk VAD flags
# (ring_buffer_flags) smooths the detector: >50% voiced chunks starts
# the recording, >90% unvoiced chunks ends it.
while not leave:
    ring_buffer = collections.deque(maxlen=NUM_PADDING_CHUNKS)
    triggered = False
    voiced_frames = []
    ring_buffer_flags = [0] * NUM_WINDOW_CHUNKS
    ring_buffer_index = 0
    print("* recording")
    stream.start_stream()
    while not got_a_sentence and not leave:
        # NOTE(review): stream.read can raise on input overflow if the
        # loop stalls; confirm whether that matters on the target board.
        chunk = stream.read(CHUNK_SIZE)
        active = vad.is_speech(chunk, RATE)
        # Live progress trace: '1' voiced, '0' silent, '+' start, '-' stop.
        sys.stdout.write('1' if active else '0')
        ring_buffer_flags[ring_buffer_index] = 1 if active else 0
        ring_buffer_index += 1
        ring_buffer_index %= NUM_WINDOW_CHUNKS
        if not triggered:
            # Not yet recording: keep a rolling pre-speech buffer and
            # wait for the window to turn mostly voiced.
            ring_buffer.append(chunk)
            num_voiced = sum(ring_buffer_flags)
            if num_voiced > 0.5 * NUM_WINDOW_CHUNKS:
                sys.stdout.write('+')
                triggered = True
                # Prepend the buffered pre-speech audio.
                voiced_frames.extend(ring_buffer)
                ring_buffer.clear()
        else:
            # Recording: accumulate audio until the window turns
            # almost entirely unvoiced, i.e. the speaker stopped.
            voiced_frames.append(chunk)
            ring_buffer.append(chunk)
            num_unvoiced = NUM_WINDOW_CHUNKS - sum(ring_buffer_flags)
            if num_unvoiced > 0.9 * NUM_WINDOW_CHUNKS:
                sys.stdout.write('-')
                triggered = False
                got_a_sentence = True
        sys.stdout.flush()
    sys.stdout.write('\n')
    data = b''.join(voiced_frames)
    print("finished recording")
    write_wavefile(data)  # send finished audio data to the writing function
    stream.stop_stream()
    print('ready for inference')
    # Placeholder for downstream processing (e.g. STT inference).
    time.sleep(3)
    got_a_sentence = False  # re-arm for the next utterance
stream.close()
pa.terminate()  # release PortAudio resources (was missing in the original)
Hi all,
here is my code:
But it records silences too. How do I record only voice activity?
Thanks, all.