Closed icecoins closed 6 months ago
Hi, I tried copying the code from RVC to implement FCPE, and voice-changer seems to work properly with it.
However, I don't know how to optimize the code, and this simple merge may contain bugs. The modified files and code are listed below (I don't know which files are essential, so I modified all of them directly):
voice-changer-master\client\demo\dist\assets\gui_settings\GUI.json
{
......
{
"name": "configArea",
"options": {
"detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx", "fcpe"],
"inputChunkNums": [1, 2, 4, 6, 8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048, 4096, 8192, 16384]
}
}
......
voice-changer-master\client\lib\src\const.ts
......
// Identifiers of the selectable F0 (pitch) detectors.
// Every key maps to a string value identical to the key itself.
export const F0Detector = {
dio: "dio",
harvest: "harvest",
crepe: "crepe",
crepe_full: "crepe_full",
crepe_tiny: "crepe_tiny",
rmvpe: "rmvpe",
rmvpe_onnx: "rmvpe_onnx",
fcpe: "fcpe",
} as const;
......
voice-changer-master\server\const.py
......
# Names of the supported pitch extractors. These are the string values that
# PitchExtractorManager.loadPitchExtractor compares its argument against.
PitchExtractorType: TypeAlias = Literal[
"harvest",
"dio",
"crepe",
"crepe_full",
"crepe_tiny",
"rmvpe",
"rmvpe_onnx",
"fcpe",
]
......
voice-changer-master\server\requirements.txt
......
torchfcpe
......
voice-changer-master\server\voice_changer\RVC\pitchExtractor\FcpePitchExtractor.py
import numpy as np
from const import PitchExtractorType
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
import torchfcpe
class FcpePitchExtractor(PitchExtractor):
    """Pitch (F0) extractor backed by the model bundled with ``torchfcpe``.

    Adapted from voice-changer's CrepePitchExtractor and the FCPE inference
    code in RVC.
    """

    def __init__(self, gpu: int):
        """Load the bundled FCPE model onto the device selected for ``gpu``.

        Args:
            gpu: Device index passed to ``DeviceManager.getDevice``.
        """
        super().__init__()
        self.pitchExtractorType: PitchExtractorType = "fcpe"
        self.device = DeviceManager.get_instance().getDevice(gpu)
        self.fcpe = torchfcpe.spawn_bundled_infer_model(self.device)

    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        """Estimate F0 for ``audio`` and write it into the tail of ``pitchf``.

        Args:
            audio: Mono waveform; presumably a 1-D torch tensor, since it is
                moved with ``.to()`` and batched with ``.unsqueeze(0)``.
            pitchf: Right-aligned F0 buffer (presumably a 1-D numpy array).
                It is updated in place and also returned.
            f0_up_key: Pitch shift in semitones applied to the estimated F0.
            sr: Sample rate of ``audio`` in Hz.
            window: Hop size in samples, used to snap the silence offset to a
                whole number of frames.
            silence_front: Seconds of leading audio to skip before analysis.

        Returns:
            ``(pitch_coarse, pitchf)``: ``pitch_coarse`` is the mel-scaled F0
            quantized to integers in the range 1..255, and ``pitchf`` is the
            updated F0 buffer in Hz.
        """
        # Skip the leading silence, rounded down to a whole number of frames.
        start_frame = int(silence_front * sr / window)
        real_silence_front = start_frame * window / sr
        silence_front_offset = int(np.round(real_silence_front * sr))
        audio = audio[silence_front_offset:]

        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)

        f0 = self.fcpe.infer(
            audio.to(self.device).unsqueeze(0).float(),
            # Use the caller's sample rate instead of a hard-coded 16000 so
            # the model sees the same rate as the silence-trimming math above.
            sr=sr,
            decoder_mode="local_argmax",
            threshold=0.006,
        )
        # reshape(-1) stays 1-D even for a single frame, where squeeze()
        # would produce a 0-d tensor and break the length lookup below.
        f0 = f0.reshape(-1)
        f0 = f0 * pow(2, f0_up_key / 12)
        f0_values = f0.detach().cpu().numpy()

        # pitchf is filled from the end, so keep the most recent frames when
        # the model returns more than fit, and do nothing when it returns none.
        frame_count = min(f0_values.shape[0], pitchf.shape[0])
        if frame_count > 0:
            pitchf[-frame_count:] = f0_values[-frame_count:]

        # Map Hz to the mel scale, then quantize into the 1..255 coarse range.
        # Unvoiced frames (0 Hz) fall below the range and are clipped to 1.
        f0_mel = 1127.0 * np.log(1.0 + pitchf / 700.0)
        f0_mel = np.clip(
            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0,
            1.0,
            255.0,
        )
        pitch_coarse = f0_mel.astype(int)
        return pitch_coarse, pitchf
voice-changer-master\server\voice_changer\RVC\pitchExtractor\PitchExtractorManager.py
......
from voice_changer.RVC.pitchExtractor.FcpePitchExtractor import FcpePitchExtractor
......
@classmethod
def loadPitchExtractor(
    cls, pitchExtractorType: PitchExtractorType, gpu: int
) -> PitchExtractor:
    """Create the pitch extractor named by ``pitchExtractorType``.

    Args:
        pitchExtractorType: Name of the extractor to build.
        gpu: Device index forwarded to the extractors that accept one.

    Returns:
        The matching extractor instance. An unrecognized name is reported on
        stdout and a ``DioPitchExtractor`` is returned instead.
    """
    # Extractors constructed without arguments.
    if pitchExtractorType == "harvest":
        return HarvestPitchExtractor()
    if pitchExtractorType == "dio":
        return DioPitchExtractor()

    # Extractors that only need the device index.
    if pitchExtractorType == "crepe":
        return CrepePitchExtractor(gpu)
    if pitchExtractorType == "fcpe":
        return FcpePitchExtractor(gpu)

    # Extractors that also need a model path taken from cls.params.
    if pitchExtractorType == "crepe_tiny":
        model_file = cls.params.crepe_onnx_tiny
        return CrepeOnnxPitchExtractor(pitchExtractorType, model_file, gpu)
    if pitchExtractorType == "crepe_full":
        model_file = cls.params.crepe_onnx_full
        return CrepeOnnxPitchExtractor(pitchExtractorType, model_file, gpu)
    if pitchExtractorType == "rmvpe":
        return RMVPEPitchExtractor(cls.params.rmvpe, gpu)
    if pitchExtractorType == "rmvpe_onnx":
        return RMVPEOnnxPitchExtractor(cls.params.rmvpe_onnx, gpu)

    # Unknown name: report it and fall back to dio.
    print("[Voice Changer] PitchExctractor not found", pitchExtractorType)
    print(" fallback to dio")
    return DioPitchExtractor()
I hope you can correct my code and implement FCPE in RVC one day. Thanks for your attention!
In a few words, describe your idea
My idea is to implement fcpe
More information
Some users say that FCPE in RVC performs better than RMVPE and that its latency can be lower.
Will voice-changer support the FCPE pitch extractor from RVC?
IMG from https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/README.md