chigkim opened this issue 1 year ago
I just separated the code for training the index into a new script (in my forked repo) and use it in a Kaggle notebook without the web UI. Maybe later I will make a pull request to the original repo.
So you can try to reuse my script: https://github.com/elcolex777/Retrieval-based-Voice-Conversion-WebUI/blob/main/train_index_print.py
I have also uploaded the Kaggle notebook that I use. It works, I think :) https://github.com/elcolex777/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI_v2_kaggle.ipynb
Inference: refer to https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer_batch_rvc.py or https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/myinfer-v2-0528.py (you may need to modify something).
Training the index: refer to https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L1023. That code path doesn't go through shell commands, so you may need to learn some Python or find someone to help you write the command-line version; a rough sketch of what that step does is shown below.
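For reference, the index-training step in infer-web.py essentially fits a faiss IVF index on the features extracted during preprocessing. The following is only a minimal sketch of that step as a standalone script; the v2/768-dimensional feature size, the `logs/<exp>/3_feature768` layout, the cluster-count formula, and the output file name are assumptions based on my reading of the WebUI code, so adjust them to your setup.

```python
# rough sketch of the index-training step, outside the WebUI
# assumptions: a v2 model (768-dim features) and the usual logs/<exp>/3_feature768 layout
import os
import sys

import faiss
import numpy as np

exp_dir = sys.argv[1]  # e.g. logs/my-experiment
feature_dir = os.path.join(exp_dir, "3_feature768")

# stack every per-utterance feature matrix into one (N, 768) array
npys = [
    np.load(os.path.join(feature_dir, name))
    for name in sorted(os.listdir(feature_dir))
]
big_npy = np.concatenate(npys, axis=0).astype(np.float32)

# pick an IVF cluster count from the dataset size, then train and fill the index
n_ivf = max(1, min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39))
index = faiss.index_factory(768, f"IVF{n_ivf},Flat")
index.train(big_npy)
index.add(big_npy)

out_path = os.path.join(exp_dir, "trained_index.index")  # file name is illustrative
faiss.write_index(index, out_path)
print("wrote", out_path)
```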
I just made the change for the new VC models (it also works for older ones, but is not fully adapted to them). I submitted the contribution to Hugging Face; here is the file:
```python
# runtime\python.exe myinfer.py 0 "C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\INPUTS_VOCAL\vocal.wav" "C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\logs\Hagrid.index" harvest "C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\INPUTS_VOCAL\test.wav" "C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\weights\HagridFR.pth" 0.6 cuda:0 True 5 44100 44100 1.0 1.0 True

import os, sys, pdb, torch

now_dir = os.getcwd()
sys.path.append(now_dir)
import argparse
import glob
import sys
import torch
from multiprocessing import cpu_count


class Config:
    def __init__(self, device, is_half):
        self.device = device
        self.is_half = is_half
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    def device_config(self) -> tuple:
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            if (
                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                or "P40" in self.gpu_name.upper()
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                print("16-series/10-series GPUs and P40 are forced to single precision")
                self.is_half = False
                for config_file in ["32k.json", "40k.json", "48k.json"]:
                    with open(f"configs/{config_file}", "r") as f:
                        strr = f.read().replace("true", "false")
                    with open(f"configs/{config_file}", "w") as f:
                        f.write(strr)
                with open("trainset_preprocess_pipeline_print.py", "r") as f:
                    strr = f.read().replace("3.7", "3.0")
                with open("trainset_preprocess_pipeline_print.py", "w") as f:
                    f.write(strr)
            else:
                self.gpu_name = None
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                with open("trainset_preprocess_pipeline_print.py", "r") as f:
                    strr = f.read().replace("3.7", "3.0")
                with open("trainset_preprocess_pipeline_print.py", "w") as f:
                    f.write(strr)
        elif torch.backends.mps.is_available():
            print("No supported NVIDIA GPU found, using MPS for inference")
            self.device = "mps"
        else:
            print("No supported NVIDIA GPU found, using CPU for inference")
            self.device = "cpu"
            self.is_half = False  # half precision is not usable on CPU
        if self.n_cpu == 0:
            self.n_cpu = cpu_count()
        if self.is_half:
            # config for 6 GB of VRAM
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # config for 5 GB of VRAM
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32
        return x_pad, x_query, x_center, x_max


# positional command-line arguments (see the README section below)
f0up_key = sys.argv[1]
input_path = sys.argv[2]
index_path = sys.argv[3]
f0method = sys.argv[4]
opt_path = sys.argv[5]
model_path = sys.argv[6]
index_rate = float(sys.argv[7])
device = sys.argv[8]
is_half = sys.argv[9].lower() == "true"  # bool() of a non-empty string is always True
filter_radius = int(sys.argv[10])
tgt_sr = int(sys.argv[11])
resample_sr = int(sys.argv[12])
rms_mix_rate = float(sys.argv[13])
version = sys.argv[14]
protect = sys.argv[15].lower() == "true"
print(sys.argv)

config = Config(device, is_half)
now_dir = os.getcwd()
sys.path.append(now_dir)

from vc_infer_pipeline import VC
from lib.infer_pack.models import SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono
from lib.audio import load_audio
from fairseq import checkpoint_utils
from scipy.io import wavfile

hubert_model = None


def load_hubert():
    # load the HuBERT model used to extract features from the input audio
    global hubert_model
    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"], suffix="")
    hubert_model = models[0]
    hubert_model = hubert_model.to(device)
    if is_half:
        hubert_model = hubert_model.half()
    else:
        hubert_model = hubert_model.float()
    hubert_model.eval()


def vc_single(sid, input_audio, f0_up_key, f0_file, f0_method, file_index, index_rate, filter_radius, tgt_sr, resample_sr, rms_mix_rate, version, protect):
    # run the full voice-conversion pipeline on a single input file
    global net_g, vc, hubert_model
    if input_audio is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    audio = load_audio(input_audio, 16000)
    times = [0, 0, 0]
    if hubert_model is None:
        load_hubert()
    if_f0 = cpt.get("f0", 1)
    # audio_opt = vc.pipeline(hubert_model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file)
    audio_opt = vc.pipeline(hubert_model, net_g, sid, audio, input_path, times, f0_up_key, f0_method, index_path, index_rate, if_f0, filter_radius, tgt_sr, resample_sr, rms_mix_rate, version, protect, f0_file=f0_file)
    print(times)
    return audio_opt


def get_vc(model_path):
    # load the trained .pth checkpoint and build the matching synthesizer
    global n_spk, tgt_sr, net_g, vc, cpt, device, is_half
    print("loading pth %s" % model_path)
    cpt = torch.load(model_path, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    if if_f0 == 1:
        net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
    else:
        net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))  # without this line the state is not cleaned up properly, oddly enough
    net_g.eval().to(device)
    if is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    # return {"visible": True, "maximum": n_spk, "__type__": "update"}


get_vc(model_path)
wav_opt = vc_single(0, input_path, f0up_key, None, f0method, index_path, index_rate, filter_radius, tgt_sr, resample_sr, rms_mix_rate, version, protect)
wavfile.write(opt_path, tgt_sr, wav_opt)
```
And here is an update to the English README.md explaining how to use the file:
## Using the script without a graphical interface (via command line)
For users who prefer a non-GUI approach, you can directly execute the script via the command line.
### Base command:
```
runtime\python.exe myinfer.py 0 "C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\INPUTS_VOCAL\vocal.wav" "C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\logs\Hagrid.index" harvest "C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\INPUTS_VOCAL\test.wav" "C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\weights\HagridFR.pth" 0.6 cuda:0 True 5 44100 44100 1.0 1.0 True
```
### Explanation of arguments:
1. **Target voice number**: `0` (in this example)
2. **Path to the input audio file**: `"C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\INPUTS_VOCAL\vocal.wav"`
3. **Path to the index file**: `"C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\logs\Hagrid.index"`
4. **Method for pitch (F0) extraction**: `harvest` (in this example)
5. **Output path for the processed audio file**: `"C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\INPUTS_VOCAL\test.wav"`
6. **Path to the model**: `"C:\ YOUR PATH FOR THE ROOT (RVC0813Nvidia)\weights\HagridFR.pth"`
7. **Index rate**: `0.6` (in this example)
8. **Device for execution (GPU/CPU)**: `cuda:0` for an NVIDIA card, for example.
9. **Half precision (True/False)**: `True` (in this example)
10. **Filter radius**: `5` (in this example)
11. **Target sampling rate**: `44100` (in this example)
12. **Resampling rate**: `44100` (in this example)
13. **RMS mixing rate**: `1.0` (in this example)
14. **Version**: `1.0` (in this example)
15. **Protection**: `True` (in this example)
Ensure you replace paths with the ones relevant to your setup and adjust other parameters as per your requirements.
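If you need to convert many files, the same command can be driven from a small wrapper script. Below is a minimal sketch; the folder layout, index and model paths, and the numeric argument values are placeholders, so substitute your own as explained above.

```python
# sketch: convert every WAV in a folder by calling myinfer.py once per file
# all paths and argument values here are placeholders - substitute your own
import subprocess
from pathlib import Path

python_exe = r"runtime\python.exe"  # or simply "python" outside the portable runtime
input_dir = Path(r"C:\RVC\INPUTS_VOCAL")
output_dir = Path(r"C:\RVC\OUTPUTS_VOCAL")
output_dir.mkdir(parents=True, exist_ok=True)

for wav in sorted(input_dir.glob("*.wav")):
    out = output_dir / wav.name
    cmd = [
        python_exe, "myinfer.py",
        "0",                              # 1. pitch shift (semitones)
        str(wav),                         # 2. input audio
        r"C:\RVC\logs\Hagrid.index",      # 3. index file
        "harvest",                        # 4. F0 method
        str(out),                         # 5. output path
        r"C:\RVC\weights\HagridFR.pth",   # 6. model
        "0.6",                            # 7. index rate
        "cuda:0",                         # 8. device
        "True",                           # 9. half precision
        "5",                              # 10. filter radius
        "44100", "44100",                 # 11-12. target / resample rate
        "1.0", "1.0",                     # 13-14. RMS mix rate, version
        "True",                           # 15. protection
    ]
    print("running:", " ".join(cmd))
    subprocess.run(cmd, check=True)
```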
If you click the button that goes through all the steps, it prints out what command it's executing.
However, it doesn't print out the command for training the index.
Likewise, it does not print out the command for inference.
Could those commands be printed both in the terminal and in the WebUI?
I haven't checked the ckpt processing tab, but it would be great if it printed out the commands as well, if it doesn't already.
It's extremely useful to be able to get those commands and run them without the WebUI.
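In the meantime, one possible workaround is to have each WebUI callback log an equivalent command line before it does the work. The helper below is only a hypothetical sketch (the inference and index-training callbacks currently call Python functions directly rather than spawning commands), with `myinfer.py` standing in for a CLI wrapper like the one shared above.

```python
# hypothetical helper: print a copy-pasteable command equivalent to a WebUI action
# (the inference and index-training callbacks call Python functions directly today,
#  so the printed command would target a CLI wrapper such as myinfer.py above)
import shlex
import sys


def print_equivalent_command(script, *args):
    """Log the command line that would reproduce this action outside the WebUI."""
    cmd = " ".join([sys.executable, script] + [shlex.quote(str(a)) for a in args])
    print(f"[equivalent command] {cmd}")


# e.g. called at the top of the single-inference callback:
# print_equivalent_command(
#     "myinfer.py", f0up_key, input_path, index_path, f0method, opt_path,
#     model_path, index_rate, device, is_half, filter_radius,
#     tgt_sr, resample_sr, rms_mix_rate, version, protect,
# )
```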