let speech_rev = if apply_speech {
let speech_rms = rms(speech.iter());
// self.pad(speech, pad_front, pad_back)?; // Pad since STFT will truncate at the end
let mut speech_rev = speech.clone();
self.convolve(
&mut speech_rev,
rir_noise.clone(),
&mut fft_t,
Some(orig_len),
)?;
...
let speech_rms_after = rms(speech.iter());
*speech *= speech_rms / (speech_rms_after + 1e-10);
}
The output speech is scaled to the RMS of the input speech, which may be lower than the RMS of `speech_rev`. This may lead to a loudness mismatch between the clean and noisy signals during training.
I think the code should be changed as follows:
let speech_rev = if apply_speech {
// let speech_rms = rms(speech.iter());
// self.pad(speech, pad_front, pad_back)?; // Pad since STFT will truncate at the end
let mut speech_rev = speech.clone();
self.convolve(
&mut speech_rev,
rir_noise.clone(),
&mut fft_t,
Some(orig_len),
)?;
let speech_rms = rms(speech_rev.iter());
...
let speech_rms_after = rms(speech.iter());
*speech *= speech_rms / (speech_rms_after + 1e-10);
}
This code is in libDF/src/augmentations.rs.
The output speech is scaled to the RMS of the input speech, which may be lower than the RMS of `speech_rev`. This may lead to a loudness mismatch between the clean and noisy signals during training. I think the code should be changed as follows.