KevinWang676 / Bark-Voice-Cloning

Bark Voice Cloning and Voice Cloning for Chinese Speech
MIT License

IndexError: index 0 is out of bounds for axis 0 with size 0 #51

Open lisanhan opened 9 months ago

lisanhan commented 9 months ago

While using my own training voice data, I got the error in the title at the line below:

ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.7")
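
For completeness, the surrounding setup is roughly the sketch below; the import path follows the speech_tts_autolabel.py file shown in the traceback, and the folder names are just my local layout:

from modelscope.tools.speech_tts_autolabel import run_auto_label

# Folder with my own WAV recordings and an output folder for the labeled data
input_wav = "./test_wavs/"
output_data = "./output_training_data/"

# Auto-label the recordings for TTS training; this is the call that fails
ret, report = run_auto_label(
    input_wav=input_wav,
    work_dir=output_data,
    resource_revision="v1.0.7",
)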

More details:

46%|████▌ | 139/305 [00:02<00:02, 60.32it/s]

IndexError                                Traceback (most recent call last)
Cell In[8], line 4
      1 input_wav = "./test_wavs/"
      2 output_data = "./output_training_data/"
----> 4 ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.7")

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/modelscope/tools/speech_tts_autolabel.py:78, in run_auto_label(input_wav, work_dir, para_ids, resource_model_id, resource_revision, gender, stage, process_num, develop_mode, has_para, enable_enh)
     64 model_resource = _download_and_unzip_resource(resource_model_id,
     65                                               resource_revision)
     66 auto_labeling = AutoLabeling(
     67     os.path.abspath(input_wav),
     68     model_resource,
   (...)
     76     process_num,
     77     enable_enh=enable_enh)
---> 78 ret_code, report = auto_labeling.run()
     79 return ret_code, report

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tts_autolabel/auto_label.py:853, in AutoLabeling.run(self)
    851 if self.enable_vad:
    852     logging.info("[VAD] chunk recordings for training.")
--> 853     self.wav_cut_by_vad(self.resample_wav_dir, self.cut_wav_dir)
    854 else:
    855     self.cut_wav_dir = self.resample_wav_dir

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tts_autolabel/auto_label.py:437, in AutoLabeling.wav_cut_by_vad(self, input_wav_dir, output_wav_dir)
    435     shutil.rmtree(output_wav_dir)
    436 os.makedirs(output_wav_dir, exist_ok=True)
--> 437 vad_cut(input_wav_dir, output_wav_dir, self.resource_dir)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tts_autolabel/audiocut/vad.py:367, in vad_cut(input_wav_dir, output_wav_dir, resource_dir, superhigh_cut_threshold, high_cut_threshold, low_cut_threshold, max_dur_threshold, min_dur_threshold)
    365 audio_files = glob.glob(os.path.join(input_wav_dir, "*.wav"))
    366 for audio_file in tqdm(audio_files):
--> 367     vad_level_S(
    368         vad_pipeline_superhigh,
    369         audio_file,
    370         output_wav_dir,
    371         output_wav_dirs["S"],
    372         max_samples_threshold,
    373         min_samples_threshold,
    374     )
    376 audio_files = glob.glob(os.path.join(output_wav_dirs["S"], "*.wav"))
    377 # if len(audio_files) <= 0:
    378 #     return

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tts_autolabel/audiocut/vad.py:45, in vad_level_S(vad_pipeline, audio_file, output_wav_dir, tmp_wav_dir, max_samples_threshold, min_samples_threshold)
     42 scale_factor = sample_rate / 16000
     44 wavid_origin = os.path.basename(audio_file)[:-4]
---> 45 segments_result_origin = vad_pipeline(audio_in=waveform_16k)
     46 segments_text_origin = segments_result_origin[0]
     47 if len(segments_text_origin) == 0:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tts_autolabel/audio2phone/funasr_onnx/vad_bin.py:102, in Fsmn_vad.__call__(self, audio_in, **kwargs)
    100 end_idx = min(waveform_nums, beg_idx + self.batch_size)
    101 waveform = waveform_list[beg_idx:end_idx]
--> 102 feats, feats_len = self.extract_feat(waveform)
    103 waveform = np.array(waveform)
    104 param_dict = kwargs.get("param_dict", dict())

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tts_autolabel/audio2phone/funasr_onnx/vad_bin.py:176, in Fsmn_vad.extract_feat(self, waveform_list)
    174 for waveform in waveform_list:
    175     speech, _ = self.frontend.fbank(waveform)
--> 176     feat, feat_len = self.frontend.lfr_cmvn(speech)
    177     feats.append(feat)
    178     feats_len.append(feat_len)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tts_autolabel/audio2phone/funasr_onnx/utils/frontend.py:87, in WavFrontend.lfr_cmvn(self, feat)
     85 def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     86     if self.lfr_m != 1 or self.lfr_n != 1:
---> 87         feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
     89     if self.cmvn_file:
     90         feat = self.apply_cmvn(feat)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tts_autolabel/audio2phone/funasr_onnx/utils/frontend.py:101, in WavFrontend.apply_lfr(inputs, lfr_m, lfr_n)
     99 T = inputs.shape[0]
    100 T_lfr = int(np.ceil(T / lfr_n))
--> 101 left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
    102 inputs = np.vstack((left_padding, inputs))
    103 T = T + (lfr_m - 1) // 2

IndexError: index 0 is out of bounds for axis 0 with size 0
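
The failing line builds the low-frame-rate left padding by tiling row 0 of the fbank feature matrix, so this IndexError means the matrix for one chunk has zero rows, i.e. no frames were extracted from it. A tiny standalone reproduction of the same numpy behaviour (the variable names and the 80 mel bins are only illustrative, not values read from the real config):

import numpy as np

lfr_m = 5                                    # illustrative LFR window size
feats = np.empty((0, 80), dtype=np.float32)  # zero fbank frames, 80 mel bins

# Same operation as frontend.py line 101: indexing row 0 of an empty matrix raises
left_padding = np.tile(feats[0], ((lfr_m - 1) // 2, 1))
# IndexError: index 0 is out of bounds for axis 0 with size 0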

KevinWang676 commented 8 months ago

You may try again, making sure your input audio is about 1 minute long.
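
For example, you can quickly check the length of every clip before labeling; the folder name and the 1-second floor below are only an example, not values used by tts_autolabel:

import glob
import os
import wave

input_wav = "./test_wavs/"  # same folder passed to run_auto_label

for path in sorted(glob.glob(os.path.join(input_wav, "*.wav"))):
    with wave.open(path, "rb") as wf:
        duration = wf.getnframes() / float(wf.getframerate())
    if duration < 1.0:  # empty or very short clips tend to break the VAD/feature step
        print(f"suspicious clip ({duration:.2f}s): {path}")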