Open takan1 opened 6 months ago
Set the `use_microphone` parameter to `False` on the recorder instance running in the cloud.
Record a 16000 Hz mono 16 bit input stream from your local microphone:
# Create a PyAudio session to reach the local audio hardware.
audio_interface = pyaudio.PyAudio()
# Open a blocking input stream: 16 kHz, 16-bit signed PCM, mono.
stream = audio_interface.open(
rate=16000,
format=pyaudio.paInt16,
channels=1,
input=True,
)
Provide the recorded raw PCM audio chunks to the recorder with this method:
recorder.feed_audio(audio_chunk)
@KoljaB Thanks for the comment! Following your advice, here is my current code (mostly taken from the test folder), but I still cannot connect the local mic to the server. The error is about not being able to find the input. Is there something I'm missing?
from RealtimeSTT import AudioToTextRecorder
from colorama import Fore, Back, Style
import colorama
import os
import threading
import pyaudio

if __name__ == '__main__':
    print("Initializing RealtimeSTT test...")
    colorama.init()

    full_sentences = []
    displayed_text = ""

    def clear_console():
        """Clear the terminal on both POSIX and Windows."""
        os.system('clear' if os.name == 'posix' else 'cls')

    def text_detected(text):
        """Render finalized sentences in alternating colors plus the live partial text."""
        global displayed_text
        sentences_with_style = [
            f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
            for i, sentence in enumerate(full_sentences)
        ]
        new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
        if new_text != displayed_text:
            displayed_text = new_text
            clear_console()
            print(displayed_text, end="", flush=True)

    def process_text(text):
        """Store a completed sentence and refresh the display."""
        full_sentences.append(text)
        text_detected("")

    recorder_config = {
        'use_microphone': False,  # Audio is supplied via feed_audio() below.
        'spinner': False,
        'model': 'tiny',
        'language': 'en',
        'silero_sensitivity': 0.4,
        'webrtc_sensitivity': 2,
        'post_speech_silence_duration': 0.4,
        'min_length_of_recording': 0,
        'min_gap_between_recordings': 0,
        'enable_realtime_transcription': True,
        'realtime_processing_pause': 0.2,
        'realtime_model_type': 'tiny',
        'on_realtime_transcription_update': text_detected,
    }
    recorder = AudioToTextRecorder(**recorder_config)

    stop_event = threading.Event()

    def feed_audio():
        """Capture 16 kHz mono 16-bit PCM from the local mic and feed it to the recorder."""
        audio_interface = pyaudio.PyAudio()
        stream = audio_interface.open(
            rate=16000,
            format=pyaudio.paInt16,
            channels=1,
            input=True,
        )
        try:
            while not stop_event.is_set():
                audio_chunk = stream.read(1024, exception_on_overflow=False)
                recorder.feed_audio(audio_chunk)
        finally:
            stream.stop_stream()
            stream.close()
            audio_interface.terminate()

    # Feeding must happen in its own thread: recorder.text() below is a
    # blocking call, and without a concurrent loop calling text() nothing
    # that is fed ever gets processed.
    feeder = threading.Thread(target=feed_audio, daemon=True)
    feeder.start()

    clear_console()
    print("Say something...", end="", flush=True)

    try:
        while True:
            # Blocks until a sentence is finalized, then invokes process_text.
            recorder.text(process_text)
    except KeyboardInterrupt:
        stop_event.set()
        feeder.join()
        print("\nRecording stopped.")
@KoljaB Oh is your comment meant for batch process? If so how can I real time transcribe with this path?
You feed chunks which looks good so far but you don't call recorder.text(), so nothing would get processed. You also need to feed chunks in one thread and call recorder.text() from another (because recorder.text() is a blocking call and you can't do both continuously feeding chunks and waiting for text in the same thread).
"The error is around not being able to find the input." What logs this error, PyAudio or RealtimeSTT (which it should not with use_microphone=False)? What exactly is logged?
@KoljaB First, I would like to express my sincere thanks for providing such an excellent and effective project. I am currently facing an issue that is somewhat similar to the one described. When I attempted to run text() and recorder.feed_audio(chunk) in two separate threads, I am not certain whether the problem lies in my code, but it seems that recorder.feed_audio(chunk) in the other thread is not correctly feeding the chunk into the recorder through the feed_audio method.
Could you kindly provide an example of how to use multithreading in this context?
Once again, thank you for your valuable contributions to the community.
While you're waiting for KoljaB to reply, a more blunt approach would be to simply stream the mic from the local computer to the server and then use the realtimestt_test_stereomix.py
script (which transcribes all system audio).
Edit: You can see it in action here.
Simple example:
if __name__ == "__main__":
    import threading
    import pyaudio
    from RealtimeSTT import AudioToTextRecorder

    # Audio recording parameters
    CHUNK = 1024              # Number of frames per buffer
    FORMAT = pyaudio.paInt16  # 16-bit integer format
    CHANNELS = 1              # Mono audio
    RATE = 16000              # Sample rate in Hz (standard for speech recognition models)

    # Initialize the recorder with use_microphone=False
    recorder = AudioToTextRecorder(
        model='medium.en',
        language='en',
        use_microphone=False,  # We'll feed audio ourselves
        spinner=False,         # Disable spinner display
        beam_size=5,           # Beam size for transcription accuracy
        # Add other parameters as needed
    )

    # Event to signal threads to stop gracefully
    stop_event = threading.Event()

    def feed_audio_thread():
        """Read raw PCM chunks from the local microphone and feed them to the recorder."""
        p = pyaudio.PyAudio()
        # Open the audio stream
        stream = p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK
        )
        try:
            while not stop_event.is_set():
                # exception_on_overflow=False: a buffer overrun drops audio
                # instead of raising IOError and killing this thread.
                data = stream.read(CHUNK, exception_on_overflow=False)
                recorder.feed_audio(data)
        except Exception as e:
            print(f"feed_audio_thread encountered an error: {e}")
        finally:
            stream.stop_stream()
            stream.close()
            p.terminate()
            print("Audio stream closed.")

    def transcription_thread():
        """Repeatedly block on recorder.text() and hand each sentence to process_text."""
        def process_text(full_sentence):
            print("Transcribed text:", full_sentence)
            # Optionally, you can set a condition to stop the threads
            if "stop recording" in full_sentence.lower():
                print("Stop command detected. Stopping threads...")
                stop_event.set()
        try:
            while not stop_event.is_set():
                recorder.text(process_text)
        except Exception as e:
            print(f"transcription_thread encountered an error: {e}")
        finally:
            print("Transcription thread exiting.")

    # Feeding and transcribing must run in separate threads because
    # recorder.text() is a blocking call.
    audio_thread = threading.Thread(target=feed_audio_thread)
    audio_thread.start()
    # NOTE: use a distinct variable name — assigning the Thread object to
    # "transcription_thread" would shadow the target function.
    text_thread = threading.Thread(target=transcription_thread)
    text_thread.start()

    # Wait for both threads to finish
    audio_thread.join()
    text_thread.join()
    print("Recording and transcription have stopped.")
Simple example:
# Reconstructed from a paste that collapsed the whole script onto one line
# (the collapsed form is not valid Python). Properly formatted version:
if __name__ == "__main__":
    import threading
    import pyaudio
    from RealtimeSTT import AudioToTextRecorder

    # Audio recording parameters
    CHUNK = 1024              # Number of frames per buffer
    FORMAT = pyaudio.paInt16  # 16-bit integer format
    CHANNELS = 1              # Mono audio
    RATE = 16000              # Sample rate in Hz (standard for speech recognition models)

    # Initialize the recorder with use_microphone=False
    recorder = AudioToTextRecorder(
        model='medium.en',
        language='en',
        use_microphone=False,  # We'll feed audio ourselves
        spinner=False,         # Disable spinner display
        beam_size=5,           # Beam size for transcription accuracy
        # Add other parameters as needed
    )

    # Event to signal threads to stop gracefully
    stop_event = threading.Event()

    def feed_audio_thread():
        """Read raw PCM chunks from the local microphone and feed them to the recorder."""
        p = pyaudio.PyAudio()
        # Open the audio stream
        stream = p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK
        )
        try:
            while not stop_event.is_set():
                # exception_on_overflow=False: a buffer overrun drops audio
                # instead of raising IOError and killing this thread.
                data = stream.read(CHUNK, exception_on_overflow=False)
                recorder.feed_audio(data)
        except Exception as e:
            print(f"feed_audio_thread encountered an error: {e}")
        finally:
            stream.stop_stream()
            stream.close()
            p.terminate()
            print("Audio stream closed.")

    def transcription_thread():
        """Repeatedly block on recorder.text() and hand each sentence to process_text."""
        def process_text(full_sentence):
            print("Transcribed text:", full_sentence)
            # Optionally, you can set a condition to stop the threads
            if "stop recording" in full_sentence.lower():
                print("Stop command detected. Stopping threads...")
                stop_event.set()
        try:
            while not stop_event.is_set():
                recorder.text(process_text)
        except Exception as e:
            print(f"transcription_thread encountered an error: {e}")
        finally:
            print("Transcription thread exiting.")

    # Feeding and transcribing must run in separate threads because
    # recorder.text() is a blocking call.
    audio_thread = threading.Thread(target=feed_audio_thread)
    audio_thread.start()
    # NOTE: use a distinct variable name — assigning the Thread object to
    # "transcription_thread" would shadow the target function.
    text_thread = threading.Thread(target=transcription_thread)
    text_thread.start()

    # Wait for both threads to finish
    audio_thread.join()
    text_thread.join()
    print("Recording and transcription have stopped.")
Thank you so much for your prompt response! I will refer to the example you provided as I continue to revise my server-side code.
I truly appreciate your time and assistance. Wishing you continued success with your work, and thank you once again for your invaluable support.
Hi, I'd like to run RealtimeSTT on a cloud GPU that my local VS Code connects to via SSH. How can I use my local mic as input?