KevinWang676 / Bark-Voice-Cloning

Bark Voice Cloning and Voice Cloning for Chinese Speech
MIT License
2.8k stars 402 forks source link

auto label error #7

Open waltcow opened 1 year ago

waltcow commented 1 year ago

尝试在本地跑 Voice_Cloning_for_Chinese_Speech.ipynb

input_wav = "./test_wavs/"
output_data = "./output_training_data/"

ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.5")
2023-07-10 10:25:48,985 - modelscope - INFO - Use user-specified model revision: v1.0.5
2023-07-10:10:25:48, INFO [] Use user-specified model revision: v1.0.5
2023-07-10:10:25:56, INFO [] ---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/paragraph/prosody...  ---
2023-07-10:10:25:56, INFO [] ---  OK  ---
2023-07-10:10:25:56, INFO [] ---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/sp_interval...  ---
2023-07-10:10:25:56, INFO [] ---  OK  ---
2023-07-10:10:25:56, INFO [] ---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/wav...  ---
2023-07-10:10:25:56, INFO [] ---  OK  ---
2023-07-10:10:25:56, INFO [] ---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/log...  ---
2023-07-10:10:25:56, INFO [] ---  OK  ---
2023-07-10:10:25:56, INFO [] 2023-07-10 10:25:56
2023-07-10:10:25:56, INFO [] wav_preprocess start...
  0%|          | 0/1 [00:00<?, ?it/s]sox WARN rate: rate clipped 1 samples; decrease volume?
sox WARN dither: dither clipped 1 samples; decrease volume?
100%|██████████| 1/1 [00:00<00:00, 118.89it/s]
2023-07-10:10:25:56, INFO [] wav cut by vad start...
100%|██████████| 1/1 [00:00<00:00,  5.07it/s]
100%|██████████| 1/1 [00:00<00:00, 17.83it/s]
2023-07-10:10:26:00, INFO [] Text to label start...
festival_initialize() called more than once
100%|██████████| 1/1 [00:00<00:00,  6.40it/s]
2023-07-10:10:26:01, INFO [] pre-break recording in paragraph by vad.
2023-07-10:10:26:01, INFO [] Generate phone interval by asr align.
2023-07-10:10:26:01, INFO [] ---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/align...  ---
2023-07-10:10:26:01, INFO [] ---  OK  ---
2023-07-10:10:26:01, INFO [] prosody_dir=/home/mai/Bark-Voice-Cloning/output_training_data/paragraph/prosody
2023-07-10:10:26:01, INFO [] job_num=1 process_num=4 fbank_config=/home/mai/.cache/modelscope/hub/damo/speech_ptts_autolabel_16k/model/fsmn_16k_2/fbank.conf, data_dir=/home/mai/Bark-Voice-Cloning/output_training_data/align/gen/data, fbank_dir=/home/mai/Bark-Voice-Cloning/output_training_data/align/gen/fbank
2023-07-10:10:26:01, INFO [] run make_fbank with num=1 config_path=/home/mai/.cache/modelscope/hub/damo/speech_ptts_autolabel_16k/model/fsmn_16k_2/fbank.conf
2023-07-10:10:26:01, INFO [] data_path=/home/mai/Bark-Voice-Cloning/output_training_data/align/gen/data fbank_path=/home/mai/Bark-Voice-Cloning/output_training_data/align/gen/fbank
2023-07-10:10:26:01, INFO [] [{'id': 'test_0_0', 'wav': '/home/mai/Bark-Voice-Cloning/output_training_data/wav_cut/test_0_0.wav'}]
run_asr_align step 2
  0%|          | 0/1 [00:00<?, ?it/s]
2023-07-10:10:26:01, INFO [] DONE compute fbank and copy feats
FileNotFoundError                         Traceback (most recent call last)
Cell In[11], line 4
      1 input_wav = "./test_wavs/"
      2 output_data = "./output_training_data/"
----> 4 ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.5")

File [~/.local/lib/python3.9/site-packages/modelscope/tools/](, in run_auto_label(input_wav, work_dir, para_ids, resource_model_id, resource_revision, gender, stage, process_num, develop_mode, has_para, enable_enh)
     64 model_resource = _download_and_unzip_resource(resource_model_id,
     65                                               resource_revision)
     66 auto_labeling = AutoLabeling(
     67     os.path.abspath(input_wav),
     68     model_resource,
     76     process_num,
     77     enable_enh=enable_enh)
---> 78 ret_code, report = auto_labeling.run()
     79 return ret_code, report

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in
    785 # generate phone interval by asr align.
    786"Generate phone interval by asr align.")
--> 787 self.asr_align()
    789 # align interval leading and trailing silence with wav.
    790 self.trim_sil_wav_interval()

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in AutoLabeling.asr_align(self)
    480     run_asr_align(self.resource_dir, align_output, script_file, self.out_wav_peh_dir, job_num=self.break_job_num, process_num=self.process_num)
    481 else:
--> 482     run_asr_align(self.resource_dir, align_output, script_file, self.cut_wav_dir, job_num = self.align_job_num, process_num=self.process_num)
    483 # fbank feats files could be used by vad.
    484 self.asr_align_gen_feats_file = os.path.join(align_output, "data/feats.scp")

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in run_asr_align(resource_root, working_dir, speak_script, wave_dir, step, job_num, process_num)
    520 lm_dir = resource_root + '/lang'
    521 am_dir = resource_root + '/fsmn_16k_2'
--> 522 process(job_num, process_num, lm_dir, am_dir, working_dir, speak_script, wave_dir, engine_test_dir, engine_data_dir, sy2ph_map, step)

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in process(job_num, process_num, lm_dir, am_dir, working_dir, speak_script, wave_dir, engine_test_dir, engine_data_dir, sy2ph_map, step)
    480 if not os.path.exists(fbank_dir):
    481     os.makedirs(fbank_dir)
--> 482 generate_fbank(job_num, process_num, data_dir, fbank_config, fbank_dir)
    484 if step >= ASR_ALIGN_STEP_ALIGN:
    485     #################### step6 ####################
    486     align_dir = os.path.join(working_dir, 'align')

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in generate_fbank(job_num, process_num, data_dir, fbank_config, fbank_dir)
    189 def generate_fbank(job_num, process_num, data_dir, fbank_config, fbank_dir):
    190'job_num={job_num} process_num={process_num} fbank_config={fbank_config}, data_dir={data_dir}, fbank_dir={fbank_dir}')
--> 191     do_make_fbank(job_num, process_num, fbank_config, data_dir, fbank_dir)

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in do_make_fbank(num, process_num, config_path, data_path, fbank_path)
     80         id = fbank_list[i]['id']
     81         output_scp = os.path.join(fbank_path, f'raw_fbank_data.{id}.scp')
---> 82         with open(output_scp, 'r') as f2:
     83             feats_scp_f.write(f2.readline())

FileNotFoundError: [Errno 2] No such file or directory: '/home/mai/Bark-Voice-Cloning/output_training_data/align/gen/fbank/raw_fbank_data.test_0_0.scp'
KevinWang676 commented 1 year ago


waltcow commented 1 year ago


KevinWang676 commented 1 year ago


waltcow commented 1 year ago

折腾了一下午,感觉太难了 @KevinWang676

2023-07-20 08:38:16,233 - modelscope - INFO - Use user-specified model revision: v1.0.5
2023-07-20:08:38:16, INFO [] Use user-specified model revision: v1.0.5
--- Remove /home/mai/Bark-Voice-Cloning/output_training_data/paragraph/prosody folder!  ---
---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/paragraph/prosody...  ---
---  OK  ---
--- Remove /home/mai/Bark-Voice-Cloning/output_training_data/sp_interval folder!  ---
---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/sp_interval...  ---
---  OK  ---
---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/wav...  ---
---  OK  ---
--- Remove /home/mai/Bark-Voice-Cloning/output_training_data/log folder!  ---
---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/log...  ---
---  OK  ---
2023-07-20 08:38:23
wav_preprocess start...
---  There is this folder!  ---
  0%|          | 0/16 [00:00<?, ?it/s]sox WARN rate: rate clipped 1 samples; decrease volume?
sox WARN dither: dither clipped 1 samples; decrease volume?
100%|██████████| 16/16 [00:00<00:00, 139.72it/s]
wav cut by vad start...

 12%|█▎        | 2/16 [00:00<00:01, 10.62it/s]
IndexError                                Traceback (most recent call last)
Cell In[8], line 4
      1 input_wav = "./test_wavs/"
      2 output_data = "./output_training_data/"
----> 4 ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.5")

File [~/.local/lib/python3.9/site-packages/modelscope/tools/](, in run_auto_label(input_wav, work_dir, para_ids, resource_model_id, resource_revision, gender, stage, process_num, develop_mode, has_para, enable_enh)
     63 model_resource = _download_and_unzip_resousrce(resource_model_id,
     64                                                resource_revision)
     65 auto_labeling = AutoLabeling(
     66     os.path.abspath(input_wav),
     67     model_resource,
     75     process_num,
     76     enable_enh=enable_enh)
---> 77 ret_code, report = auto_labeling.run()
     78 return ret_code, report

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in
    762 self.wav_preprocess()
    764 ## cut wav by vad
--> 765 self.wav_cut_by_vad()
    767 # get prosody
    768 audio_path = glob.glob(os.path.join(self.cut_wav_dir, '*.wav'))

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in AutoLabeling.wav_cut_by_vad(self)
    369     shutil.rmtree(self.cut_wav_dir)
    370 os.makedirs(self.cut_wav_dir, exist_ok=True)
--> 371 vad_cut(self.resample_wav_dir, self.cut_wav_dir, self.resource_dir)

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audiocut/](, in vad_cut(input_wav_dir, output_wav_dir, resource_dir, cut_threshold, start_sil_threshold, end_sil_threshold, max_dur_threshold, min_dur_threshold)
     69 min_samples_threshold = int(min_dur_threshold * sample_rate)
     71 wavid = os.path.basename(audio_in).split('.')[0]
---> 73 segments_result = vad_pipeline(audio_in=waveform) 
     74 segments_text = segments_result[0]
     76 if len(segments_text) == 0:

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audio2phone/funasr_onnx/](, in Fsmn_vad.__call__(self, audio_in, **kwargs)
     92 end_idx = min(waveform_nums, beg_idx + self.batch_size)
     93 waveform = waveform_list[beg_idx:end_idx]
---> 94 feats, feats_len = self.extract_feat(waveform)
     95 waveform = np.array(waveform)
     96 param_dict = kwargs.get('param_dict', dict())

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audio2phone/funasr_onnx/](, in Fsmn_vad.extract_feat(self, waveform_list)
    152 for waveform in waveform_list:
    153     speech, _ = self.frontend.fbank(waveform)
--> 154     feat, feat_len = self.frontend.lfr_cmvn(speech)
    155     feats.append(feat)
    156     feats_len.append(feat_len)

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audio2phone/funasr_onnx/utils/](, in WavFrontend.lfr_cmvn(self, feat)
     87 def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     88     if self.lfr_m != 1 or self.lfr_n != 1:
---> 89         feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
     91     if self.cmvn_file:
     92         feat = self.apply_cmvn(feat)

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audio2phone/funasr_onnx/utils/](, in WavFrontend.apply_lfr(inputs, lfr_m, lfr_n)
    101 T = inputs.shape[0]
    102 T_lfr = int(np.ceil(T / lfr_n))
--> 103 left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
    104 inputs = np.vstack((left_padding, inputs))
    105 T = T + (lfr_m - 1) // 2

IndexError: index 0 is out of bounds for axis 0 with size 0
0i0i0i commented 1 year ago

折腾了一下午,感觉太难了 @KevinWang676

2023-07-20 08:38:16,233 - modelscope - INFO - Use user-specified model revision: v1.0.5
2023-07-20:08:38:16, INFO [] Use user-specified model revision: v1.0.5
--- Remove /home/mai/Bark-Voice-Cloning/output_training_data/paragraph/prosody folder!  ---
---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/paragraph/prosody...  ---
---  OK  ---
--- Remove /home/mai/Bark-Voice-Cloning/output_training_data/sp_interval folder!  ---
---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/sp_interval...  ---
---  OK  ---
---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/wav...  ---
---  OK  ---
--- Remove /home/mai/Bark-Voice-Cloning/output_training_data/log folder!  ---
---  New folder /home/mai/Bark-Voice-Cloning/output_training_data/log...  ---
---  OK  ---
2023-07-20 08:38:23
wav_preprocess start...
---  There is this folder!  ---
  0%|          | 0/16 [00:00<?, ?it/s]sox WARN rate: rate clipped 1 samples; decrease volume?
sox WARN dither: dither clipped 1 samples; decrease volume?
100%|██████████| 16/16 [00:00<00:00, 139.72it/s]
wav cut by vad start...

 12%|█▎        | 2/16 [00:00<00:01, 10.62it/s]
IndexError                                Traceback (most recent call last)
Cell In[8], line 4
      1 input_wav = "./test_wavs/"
      2 output_data = "./output_training_data/"
----> 4 ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.5")

File [~/.local/lib/python3.9/site-packages/modelscope/tools/](, in run_auto_label(input_wav, work_dir, para_ids, resource_model_id, resource_revision, gender, stage, process_num, develop_mode, has_para, enable_enh)
     63 model_resource = _download_and_unzip_resousrce(resource_model_id,
     64                                                resource_revision)
     65 auto_labeling = AutoLabeling(
     66     os.path.abspath(input_wav),
     67     model_resource,
     75     process_num,
     76     enable_enh=enable_enh)
---> 77 ret_code, report = auto_labeling.run()
     78 return ret_code, report

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in
    762 self.wav_preprocess()
    764 ## cut wav by vad
--> 765 self.wav_cut_by_vad()
    767 # get prosody
    768 audio_path = glob.glob(os.path.join(self.cut_wav_dir, '*.wav'))

File [~/.local/lib/python3.9/site-packages/tts_autolabel/](, in AutoLabeling.wav_cut_by_vad(self)
    369     shutil.rmtree(self.cut_wav_dir)
    370 os.makedirs(self.cut_wav_dir, exist_ok=True)
--> 371 vad_cut(self.resample_wav_dir, self.cut_wav_dir, self.resource_dir)

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audiocut/](, in vad_cut(input_wav_dir, output_wav_dir, resource_dir, cut_threshold, start_sil_threshold, end_sil_threshold, max_dur_threshold, min_dur_threshold)
     69 min_samples_threshold = int(min_dur_threshold * sample_rate)
     71 wavid = os.path.basename(audio_in).split('.')[0]
---> 73 segments_result = vad_pipeline(audio_in=waveform) 
     74 segments_text = segments_result[0]
     76 if len(segments_text) == 0:

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audio2phone/funasr_onnx/](, in Fsmn_vad.__call__(self, audio_in, **kwargs)
     92 end_idx = min(waveform_nums, beg_idx + self.batch_size)
     93 waveform = waveform_list[beg_idx:end_idx]
---> 94 feats, feats_len = self.extract_feat(waveform)
     95 waveform = np.array(waveform)
     96 param_dict = kwargs.get('param_dict', dict())

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audio2phone/funasr_onnx/](, in Fsmn_vad.extract_feat(self, waveform_list)
    152 for waveform in waveform_list:
    153   speech, _ = self.frontend.fbank(waveform)
--> 154   feat, feat_len = self.frontend.lfr_cmvn(speech)
    155   feats.append(feat)
    156   feats_len.append(feat_len)

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audio2phone/funasr_onnx/utils/](, in WavFrontend.lfr_cmvn(self, feat)
     87 def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     88     if self.lfr_m != 1 or self.lfr_n != 1:
---> 89         feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
     91     if self.cmvn_file:
     92         feat = self.apply_cmvn(feat)

File [~/.local/lib/python3.9/site-packages/tts_autolabel/audio2phone/funasr_onnx/utils/](, in WavFrontend.apply_lfr(inputs, lfr_m, lfr_n)
    101 T = inputs.shape[0]
    102 T_lfr = int(np.ceil(T / lfr_n))
--> 103 left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
    104 inputs = np.vstack((left_padding, inputs))
    105 T = T + (lfr_m - 1) // 2

IndexError: index 0 is out of bounds for axis 0 with size 0

楼主解决了吗?我也是尝试本地部署。但是卡在了这个auto label步骤。