PoubelleRigolotte opened 1 month ago
By design, all audio data from every user is written to the root sink. This means you need to use a filter sink, a custom sink with your own logic, or a MultiAudioSink to route the data to individual sinks. Unfortunately, the way I wrote MultiAudioSink isn't particularly useful for this use case, but it's not hard to handle yourself.
What I suggest is making a sink that acts like a defaultdict: you give it a class or factory function, and it creates new sinks as needed (i.e. when it receives data from a user it doesn't yet have a sink for). I will probably add this at some point as well; a sketch of the idea is below.
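A minimal sketch of that defaultdict-style sink, assuming the `AudioSink`/`VoiceData` API from `discord.ext.voice_recv` used later in this thread. `RouterSink` and the `factory` parameter are illustrative names, not part of the library:

```python
from typing import Callable, Dict, Optional

import discord
from discord.ext.voice_recv import AudioSink, VoiceData


class RouterSink(AudioSink):
    """Hypothetical sink that lazily creates one child sink per speaking user."""

    def __init__(self, factory: Callable[[], AudioSink]):
        super().__init__()
        self.factory = factory  # called once per new user
        self.children: Dict[int, AudioSink] = {}

    def wants_opus(self) -> bool:
        return False  # hand decoded PCM to the children

    def write(self, user: Optional[discord.User], data: VoiceData) -> None:
        if user is None:
            return
        # defaultdict behaviour: build a sink the first time we see this user
        if user.id not in self.children:
            self.children[user.id] = self.factory()
        self.children[user.id].write(user, data)

    def cleanup(self) -> None:
        for child in self.children.values():
            child.cleanup()
        self.children.clear()
```

You would hand it a factory that builds a fresh per-user sink, e.g. a `WaveSink` over a new buffer, as in the full example below.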
If your goal is to record the whole group chat as-is (keeping silences, speaker timings, etc.), here is my code:
```python
import io
import time
import wave
from typing import Dict, Optional

import discord
import numpy as np
from discord.ext.voice_recv import AudioSink, VoiceData, WaveSink
from discord.ext.voice_recv.silence import SilenceGenerator
from pydub import AudioSegment

discord.opus._load_default()  # mandatory, for those who wonder


def add_silence_to_wav(input_data: bytes, silence_duration: float) -> bytes:
    """Prepend silence_duration seconds of silence to a WAV byte string."""
    audio = AudioSegment.from_wav(io.BytesIO(input_data))
    silence = AudioSegment.silent(duration=int(silence_duration * 1000))  # pydub uses milliseconds
    final_audio = silence + audio
    output_buffer = io.BytesIO()
    final_audio.export(output_buffer, format="wav")
    return output_buffer.getvalue()


class MultiAudioImprovedWithSilenceSink(AudioSink):
    def __init__(self):
        super().__init__()
        self.user_sinks: Dict[int, WaveSink] = {}
        self.user_buffers: Dict[int, io.BytesIO] = {}
        self.silence_generators: Dict[int, SilenceGenerator] = {}
        self.start_time = time.perf_counter_ns()
        self.first_packet_time: Dict[int, int] = {}

    def _get_or_create_sink(self, user_id: int) -> WaveSink:
        # Lazily create a per-user WaveSink backed by an in-memory buffer,
        # plus a SilenceGenerator that fills gaps between that user's packets.
        if user_id not in self.user_sinks:
            buffer = io.BytesIO()
            sink = WaveSink(buffer)
            self.user_sinks[user_id] = sink
            self.user_buffers[user_id] = buffer
            self.silence_generators[user_id] = SilenceGenerator(sink.write)
            self.silence_generators[user_id].start()
        return self.user_sinks[user_id]

    def wants_opus(self) -> bool:
        return False  # receive decoded PCM rather than raw opus packets

    def write(self, user: Optional[discord.User], data: VoiceData) -> None:
        if user is None:
            return
        sink = self._get_or_create_sink(user.id)
        silence_gen = self.silence_generators[user.id]
        # Remember when this user's first packet arrived, for later re-alignment.
        if user.id not in self.first_packet_time:
            self.first_packet_time[user.id] = time.perf_counter_ns()
        silence_gen.push(user, data.packet)
        sink.write(user, data)

    def cleanup(self) -> None:
        for silence_gen in self.silence_generators.values():
            silence_gen.stop()
        self.user_sinks.clear()
        self.user_buffers.clear()
        self.silence_generators.clear()

    def get_user_audio(self, user_id: int) -> Optional[bytes]:
        """Return the recorded WAV bytes for a user, or None if nothing was captured."""
        if user_id in self.user_buffers:
            buffer = self.user_buffers[user_id]
            buffer.seek(0)
            return buffer.read()
        return None

    def get_initial_silence_duration(self, user_id: int) -> float:
        """Seconds between recording start and the user's first packet."""
        if user_id in self.first_packet_time:
            return (self.first_packet_time[user_id] - self.start_time) / 1e9  # ns to s
        return 0.0

    def mix_audio(self, audio_data_dict: Dict[int, bytes]) -> Optional[bytes]:
        """Mix several WAV byte strings into one by averaging their samples."""
        audio_arrays = []
        sample_rate = 0
        num_channels = 0
        sample_width = 0
        for audio_data in audio_data_dict.values():
            if len(audio_data) <= 44:  # skip buffers holding only a WAV header
                continue
            with wave.open(io.BytesIO(audio_data), 'rb') as wav_file:
                params = wav_file.getparams()
                sample_rate = params.framerate
                num_channels = params.nchannels
                sample_width = params.sampwidth
                frames = wav_file.readframes(params.nframes)
                audio_arrays.append(np.frombuffer(frames, dtype=np.int16))
        if not audio_arrays:
            return None
        # Pad every track to the longest one, then average sample-by-sample.
        max_length = max(len(arr) for arr in audio_arrays)
        padded_audio_arrays = [np.pad(arr, (0, max_length - len(arr)), 'constant') for arr in audio_arrays]
        mixed_audio = np.mean(padded_audio_arrays, axis=0).astype(np.int16)
        output_buffer = io.BytesIO()
        with wave.open(output_buffer, 'wb') as output_wav:
            output_wav.setnchannels(num_channels)
            output_wav.setsampwidth(sample_width)
            output_wav.setframerate(sample_rate)
            output_wav.writeframes(mixed_audio.tobytes())
        output_buffer.seek(0)
        return output_buffer.read()
```
You can then use it like so:
```python
from datetime import datetime  # needed for the combined filename below

...
self.custom_sink = MultiAudioImprovedWithSilenceSink()
voice_client.listen(self.custom_sink)
...
voice_client.stop_listening()

all_audio_data = {}
for member in voice_channel.members:
    if member.bot:
        continue
    audio_data = self.custom_sink.get_user_audio(member.id)
    if audio_data and len(audio_data) > 44:
        # Re-align this track: prepend the silence that elapsed before the
        # user's first packet so every track shares the same time origin.
        silence_duration = self.custom_sink.get_initial_silence_duration(member.id)
        final_audio_data = add_silence_to_wav(audio_data, silence_duration)
        all_audio_data[member.id] = final_audio_data

if all_audio_data:
    combined_audio = self.custom_sink.mix_audio(all_audio_data)
    if combined_audio:
        # record_path is defined elsewhere in my bot
        combined_file_path = f"{record_path}/combined_{datetime.now().strftime('%Y-%m-%d%H-%M-%S')}.wav"
        with open(combined_file_path, 'wb') as f:
            f.write(combined_audio)
```
Note that I used a custom function `add_silence_to_wav` to add the silence at the beginning of each recording; otherwise the recordings would not be synchronized and the combined audio would sound disastrous.
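As a quick sanity check of the offset arithmetic, with hypothetical numbers:

```python
# Hypothetical example: this user's first packet arrived 2.5 s after the sink started.
start_time = 1_000_000_000          # ns, captured in __init__
first_packet_time = 3_500_000_000   # ns, captured on the user's first write()
offset = (first_packet_time - start_time) / 1e9  # 2.5 seconds
# add_silence_to_wav then prepends int(2.5 * 1000) = 2500 ms of silence,
# so this track starts at the same origin as everyone else's.
```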
Hope this helps!
Hi,
I'm running into a bug: when two people speak at the same time, the packets become glitchy and the audio is not understandable while both are talking. If they aren't speaking at the same time, the audio is clean.
Here is my code: