Easy-to-use Speech Toolkit including Self-Supervised Learning model, SOTA/Streaming ASR with punctuation, Streaming TTS with text frontend, Speaker Verification System, End-to-End Speech Translation and Keyword Spotting. Won NAACL2022 Best Demo Award.
我使用了paddle官网教程(https://aistudio.baidu.com/aistudio/projectdetail/6572581)的代码:
import paddle
import yaml
import soundfile as sf
from yacs.config import CfgNode
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.exps.syn_utils import run_frontend
# Mixed Chinese/English sentence used as the synthesis input.
sentence = "我有一本书,名字叫《The Little Prince》. 这本书售价15.8元。"
import IPython.display as dp
# NOTE(review): `wav` and `am_config` are not defined anywhere in this
# excerpt; presumably the intermediate steps were dropped when the script
# was pasted -- see the full copy of the script further down.
dp.Audio(wav.T, rate=am_config.fs)
为什么程序运行后会出现下面这个错误?
Traceback (most recent call last):
File "E:\deepLearning\paddleLearnning\TTS\trainModel\mix\script.py", line 64, in <module>
mel = am_inference(part_phone_ids, spk_id)
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddle\nn\layer\layers.py", line 1254, in __call__
return self.forward(*inputs, **kwargs)
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddlespeech\t2s\models\fastspeech2\fastspeech2.py", line 895, in forward
normalized_mel, d_outs, p_outs, e_outs = self.acousticmodel.inference(
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddlespeech\t2s\models\fastspeech2\fastspeech2.py", line 786, in inference
_, outs, d_outs, p_outs, e_outs = self._forward(
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddlespeech\t2s\models\fastspeech2\fastspeech2.py", line 594, in _forward
hs = self._integrate_with_spk_embed(hs, spk_emb)
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddlespeech\t2s\models\fastspeech2\fastspeech2.py", line 815, in _integrate_with_spk_embed
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddle\nn\functional\norm.py", line 82, in normalize
out = _C_ops.p_norm(x, float(p), axis, epsilon, True, False)
ValueError: (InvalidArgument) Attr(axis) value should be in range [-R, R-1], R is the rank of Input(X). But received axis: 1, R: 1. Current Input(X)'s shape is=[256].
[Hint: Expected axis < x_rank, but received axis:1 >= x_rank:1.] (at ..\paddle\phi\infermeta\unary.cc:2763)
我使用了paddle官网教程(https://aistudio.baidu.com/aistudio/projectdetail/6572581)的代码: import paddle import yaml import soundfile as sf from yacs.config import CfgNode from paddlespeech.t2s.frontend.mix_frontend import MixFrontend from paddlespeech.t2s.exps.syn_utils import get_am_inference from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.exps.syn_utils import run_frontend
# End-to-end mixed Chinese/English TTS demo: text frontend -> FastSpeech2
# acoustic model -> HiFiGAN vocoder -> wav file / notebook playback.

# Mixed Chinese/English sentence used as the synthesis input.
sentence = "我有一本书,名字叫《The Little Prince》. 这本书售价15.8元。"

# ---- text frontend ----
phones_dict = "download/fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt"
frontend = MixFrontend(phone_vocab_path=phones_dict)
print("frontend done!")

# ---- load AM (acoustic model) ----
am_config_file = "download/fastspeech2_mix_ckpt_1.2.0/default.yaml"
am_ckpt = "download/fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz"
am_stat = "download/fastspeech2_mix_ckpt_1.2.0/speech_stats.npy"
speaker_dict = "download/fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt"
with open(am_config_file) as am_cfg_fp:
    am_config = CfgNode(yaml.safe_load(am_cfg_fp))
am_inference = get_am_inference(
    am="fastspeech2_mix",
    am_config=am_config,
    am_ckpt=am_ckpt,
    am_stat=am_stat,
    phones_dict=phones_dict,
    tones_dict=None,
    speaker_dict=speaker_dict,
)
print("acoustic model done!")

# ---- load Voc (vocoder) ----
voc_config_file = "download/hifigan_aishell3_ckpt_0.2.0/default.yaml"
voc_ckpt = "download/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz"
voc_stat = "download/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy"
with open(voc_config_file) as voc_cfg_fp:
    voc_config = CfgNode(yaml.safe_load(voc_cfg_fp))
voc_inference = get_voc_inference(
    voc="hifigan_aishell3",
    voc_config=voc_config,
    voc_ckpt=voc_ckpt,
    voc_stat=voc_stat,
)
print("voc done!")

# ---- get phone id ----
# merge_sentences=False: the frontend output is consumed below one entry
# at a time, each entry being synthesized separately.
frontend_dict = run_frontend(
    frontend=frontend,
    text=sentence,
    merge_sentences=False,
    get_tone_ids=False,
    lang="mix",
)
phone_ids = frontend_dict['phone_ids']

# ---- inference ----
# Synthesize every entry of phone_ids on its own and join the waveforms
# in order.
for part_idx, part_phone_ids in enumerate(phone_ids):
    # baker:174, ljspeech:175, aishell3:0~173, vctk:176~282
    # NOTE(review): the traceback in this report shows the speaker embedding
    # arriving with shape [256] (rank 1) while normalize runs on axis 1.
    # Presumably spk_id needs a leading batch dimension, e.g.
    # paddle.to_tensor([174]) -- confirm against the installed Paddle version.
    spk_id = paddle.to_tensor(174)
    mel = am_inference(part_phone_ids, spk_id)
    wav = voc_inference(mel)
    # The first part seeds the result; later parts are appended to it.
    wav_all = wav if part_idx == 0 else paddle.concat([wav_all, wav])
print("infer successfully.")

# ---- save audio ----
wav = wav_all.numpy()
sf.write("./out.wav", wav, am_config.fs)

# ---- play audio ----
import IPython.display as dp
dp.Audio(wav.T, rate=am_config.fs)
为什么程序运行后会出现下面这个错误?
Traceback (most recent call last):
File "E:\deepLearning\paddleLearnning\TTS\trainModel\mix\script.py", line 64, in <module>
mel = am_inference(part_phone_ids, spk_id)
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddle\nn\layer\layers.py", line 1254, in __call__
return self.forward(*inputs, **kwargs)
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddlespeech\t2s\models\fastspeech2\fastspeech2.py", line 895, in forward
normalized_mel, d_outs, p_outs, e_outs = self.acousticmodel.inference(
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddlespeech\t2s\models\fastspeech2\fastspeech2.py", line 786, in inference
_, outs, d_outs, p_outs, e_outs = self._forward(
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddlespeech\t2s\models\fastspeech2\fastspeech2.py", line 594, in _forward
hs = self._integrate_with_spk_embed(hs, spk_emb)
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddlespeech\t2s\models\fastspeech2\fastspeech2.py", line 815, in _integrate_with_spk_embed
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
File "D:\Anaconda\envs\wuzhuangbu\lib\site-packages\paddle\nn\functional\norm.py", line 82, in normalize
out = _C_ops.p_norm(x, float(p), axis, epsilon, True, False)
ValueError: (InvalidArgument) Attr(axis) value should be in range [-R, R-1], R is the rank of Input(X). But received axis: 1, R: 1. Current Input(X)'s shape is=[256].
[Hint: Expected axis < x_rank, but received axis:1 >= x_rank:1.] (at ..\paddle\phi\infermeta\unary.cc:2763)