respeaker / respeaker_python_library

To build voice enabled objects/applications with Python and ReSpeaker
Apache License 2.0

How to record voice without wakeup word? #28

Closed: elpimous closed this issue 6 years ago

elpimous commented 6 years ago

Hi all,

here is my code:

import wave

from respeaker import Microphone

def task():
    mic = Microphone()
    data = mic.listen()      # listen() yields chunks of recorded audio
    data = b''.join(data)
    record(data)

def record(data):
    # write 16 kHz, 16-bit, mono PCM to a WAV file
    f = wave.open('/home/nvidia/catkin_ws/src/utils/src/wav.wav', 'wb')
    f.setframerate(16000)
    f.setsampwidth(2)
    f.setnchannels(1)
    f.writeframes(data)
    f.close()

But it records silences too. How can I record only voice activity?

Thanks all

xiongyihui commented 6 years ago

The Voice Activity Detector is not 100% accurate, so it will record some silence too. You can increase the VAD level at https://github.com/respeaker/respeaker_python_library/blob/master/respeaker/vad.py#L25

The level is an integer between 0 and 3: 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.
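For reference, that level maps directly to webrtcvad's aggressiveness mode. A minimal sketch, assuming you use webrtcvad directly instead of patching vad.py:

import webrtcvad

vad = webrtcvad.Vad(3)  # 3 = most aggressive filtering of non-speech
vad.set_mode(2)         # the mode can also be changed later

# is_speech() expects 10, 20 or 30 ms of 16-bit mono PCM
# at 8000, 16000, 32000 or 48000 Hz.
frame = b'\x00\x00' * 480  # 30 ms of silence at 16 kHz
print(vad.is_speech(frame, 16000))  # expected: False for pure silence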

elpimous commented 6 years ago

Hi xiongyihui and all: small modifications to your 'bing_stt_with_vad.py' did the job. With the script below we record voice only, even if there is a lot of silence before the speech starts, and the recording stops just after the voice ends. (It doesn't seem to adapt dynamically to environment noise; TODO.)

import webrtcvad
import collections
import contextlib
import sys
import time
import signal
import pyaudio
import wave

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
SAMPWIDTH = 2
CHUNK_DURATION_MS = 30  # supports 10, 20 and 30 (ms)
PADDING_DURATION_MS = 1000
CHUNK_SIZE = int(RATE * CHUNK_DURATION_MS / 1000)
CHUNK_BYTES = CHUNK_SIZE * 2
NUM_PADDING_CHUNKS = int(PADDING_DURATION_MS / CHUNK_DURATION_MS)
NUM_WINDOW_CHUNKS = int(240 / CHUNK_DURATION_MS)  # 240 ms sliding decision window

vad = webrtcvad.Vad(2)  # aggressiveness 0 to 3; 0 is the least aggressive
audiopath = '/home/nvidia/Documents/respeaker.wav' # path to recorded audio file

pa = pyaudio.PyAudio()
stream = pa.open(format=FORMAT,
                 channels=CHANNELS,
                 rate=RATE,
                 input=True,
                 start=False,
                 # input_device_index=2,
                 frames_per_buffer=CHUNK_SIZE)

got_a_sentence = False
leave = False

def write_wavefile(data):
    with contextlib.closing(wave.open(audiopath, 'wb')) as audio:
        audio.setframerate(RATE)
        audio.setnchannels(CHANNELS)
        audio.setsampwidth(SAMPWIDTH)
        audio.writeframes(data)

def handle_int(sig, frame):  # Ctrl-C exits the loop cleanly
    global leave, got_a_sentence

    leave = True
    got_a_sentence = True

signal.signal(signal.SIGINT, handle_int)

while not leave:
    ring_buffer = collections.deque(maxlen=NUM_PADDING_CHUNKS)  # pre-trigger padding audio
    triggered = False
    voiced_frames = []
    ring_buffer_flags = [0] * NUM_WINDOW_CHUNKS  # voiced/unvoiced flags over the window
    ring_buffer_index = 0

    print("* recording")
    stream.start_stream()
    while not got_a_sentence and not leave:
        chunk = stream.read(CHUNK_SIZE)
        active = vad.is_speech(chunk, RATE)
        sys.stdout.write('1' if active else '0')  # live trace of VAD decisions
        ring_buffer_flags[ring_buffer_index] = 1 if active else 0
        ring_buffer_index += 1
        ring_buffer_index %= NUM_WINDOW_CHUNKS
        if not triggered:
            # Not triggered yet: keep a rolling pre-buffer of audio and
            # trigger once more than 50% of the window is voiced.
            ring_buffer.append(chunk)
            num_voiced = sum(ring_buffer_flags)
            if num_voiced > 0.5 * NUM_WINDOW_CHUNKS:
                sys.stdout.write('+')
                triggered = True
                voiced_frames.extend(ring_buffer)
                ring_buffer.clear()
        else:
            # Triggered: keep recording until more than 90% of the
            # window is unvoiced, then stop.
            voiced_frames.append(chunk)
            ring_buffer.append(chunk)
            num_unvoiced = NUM_WINDOW_CHUNKS - sum(ring_buffer_flags)
            if num_unvoiced > 0.9 * NUM_WINDOW_CHUNKS:
                sys.stdout.write('-')
                triggered = False
                got_a_sentence = True

        sys.stdout.flush()

    sys.stdout.write('\n')

    data = b''.join(voiced_frames)
    print("finished recording")

    write_wavefile(data)  # write the captured voice to a WAV file

    stream.stop_stream()

    print('ready for inference')
    time.sleep(3)

    got_a_sentence = False

stream.close()
pa.terminate()  # release PortAudio resources
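To sanity-check the result, here is a minimal sketch (assuming the audiopath above) that reads the written file back and prints its duration:

import contextlib
import wave

path = '/home/nvidia/Documents/respeaker.wav'  # same audiopath as above
with contextlib.closing(wave.open(path, 'rb')) as w:
    duration = w.getnframes() / float(w.getframerate())
    print('%s: %.2f s, %d Hz, %d channel(s)'
          % (path, duration, w.getframerate(), w.getnchannels()))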