RVC-Boss / GPT-SoVITS

1 min voice data can also be used to train a good TTS model! (few shot voice cloning)
MIT License
32.61k stars 3.76k forks source link

分享API支持多音字列表(自定义读音)的一种实现方式 #1273

Open AnonymousmousCoder opened 2 months ago

AnonymousmousCoder commented 2 months ago

在 https://github.com/RVC-Boss/GPT-SoVITS/issues/1175 所分享方案的基础上进一步完善,增加了对轻声的支持。

调用函数时,在传入的参数中增加多音字字典参数 polyphone_dict。

    # Request payload handed to the TTS pipeline. The only key this proposal
    # adds is "polyphone_dict"; it carries the custom-pronunciation dictionary.
    inputs={
        "gpt_path":new_gpt_path,
        "sovits_path":sovits_path,
        "text": text,
        "text_lang": 'zh',
        "ref_audio_path": ref_audio_path,
        # An empty prompt text is sent when ref_text_free is truthy.
        "prompt_text": prompt_text if not ref_text_free else "",
        "prompt_lang": 'zh',
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
        "text_split_method": 'cut3',
        "batch_size":int(batch_size),
        "speed_factor":float(speed_factor),
        "pitch":pitch, 
        "split_bucket":split_bucket,
        "return_fragment":False,
        "fragment_interval":fragment_interval,
        # Maps a word to the same word annotated with <tone as=...> tags.
        "polyphone_dict":polyphone_dict
    }
    # Take the first item produced by the pipeline run and yield it.
    yield next(tts_pipline.run(inputs))

polyphone_dict的参数格式为

 # Example custom-pronunciation dictionary.
 #   key   -- the word to look for in the input text
 #   value -- the same word, with each character whose reading is overridden
 #            wrapped as <tone as=PINYIN>char</tone>; PINYIN ends in a tone
 #            digit (5 appears to denote the neutral tone -- confirm against
 #            the phoneme set in use)
 polyphone_dict = {
        "丫鬟": "丫<tone as=huan5>鬟</tone>",
        "丫环": "丫<tone as=huan5>环</tone>",
        # Several characters of one word may be overridden at once.
        "数一数": "<tone as=shu3>数</tone>一<tone as=shu3>数</tone>",
         "王爷": "王<tone as=ye5>爷</tone>",
         }

以下为TextPreprocessor.py核心代码,polyphone_dict从TTS.py一路传过来就好。

    def extract_bert_feature(self, textlist:list, langlist:list,polyphone_dict={}):
        """
        Build phonemes, BERT features and normalized text for text segments.

        Each segment in ``textlist`` is processed with the language found at
        the same index in ``langlist``; ``polyphone_dict`` is forwarded
        unchanged to ``clean_text_inf``.

        Returns a tuple ``(phones, bert_feature, norm_text)``:
        the phonemes of all segments in one flat list, the per-segment
        features concatenated along dimension 1, and the normalized
        segment texts joined into a single string.
        """
        all_phones = []
        segment_features = []
        normalized_parts = []
        for idx, segment in enumerate(textlist):
            segment_lang = langlist[idx]
            seg_phones, seg_word2ph, seg_norm = self.clean_text_inf(
                segment, segment_lang, polyphone_dict
            )
            seg_feature = self.get_bert_inf(
                seg_phones, seg_word2ph, seg_norm, segment_lang
            )
            # Phonemes are flattened across segments rather than nested.
            all_phones.extend(seg_phones)
            normalized_parts.append(seg_norm)
            segment_features.append(seg_feature)
        merged_feature = torch.cat(segment_features, dim=1)
        return all_phones, merged_feature, ''.join(normalized_parts)

    def clean_text_inf(self,text, language,polyphone_dict={}):
        """
        Convert one text segment to a phoneme-id sequence.

        Custom-pronunciation markup is resolved first (via
        ``find_custom_tone1``); the remaining plain text goes through
        ``clean_text``. When any pronunciation overrides were found they are
        patched into the phoneme list before it is converted with
        ``cleaned_text_to_sequence``.

        Returns ``(phones, word2ph, norm_text)``.
        """
        plain_text, custom_tones = self.find_custom_tone1(text,polyphone_dict)
        raw_phones, word2ph, norm_text = clean_text(plain_text, language)
        if custom_tones:
            raw_phones = self.revise_custom_tone(raw_phones, word2ph, custom_tones)
        return cleaned_text_to_sequence(raw_phones), word2ph, norm_text
def find_custom_tone1(self, text: str, polyphone_dict={}):
        """
        Apply the custom-pronunciation dictionary and extract ``<tone>`` tags.

        Words listed in ``polyphone_dict`` are replaced by their annotated
        form, then every ``<tone as=PINYIN>chars</tone>`` tag is stripped from
        the text and recorded.

        Args:
            text: Input text, optionally already containing ``<tone>`` tags.
            polyphone_dict: Mapping of word -> annotated replacement string.
                ``None`` or an empty mapping disables the substitution step.

        Returns:
            ``(plain_text, tone_list)``. ``plain_text`` is the text without
            tags and without spaces. Each ``tone_list`` entry is
            ``[tone, initial, final, pos]`` where ``pos`` is the number of
            characters of ``plain_text`` up to and including the tagged text.
        """
        print("进入多音字处理函数", polyphone_dict)

        # Substitute dictionary words in a single pass, longest word first.
        # Chained str.replace calls would let a later key rewrite text that an
        # earlier replacement had just inserted (producing nested, broken
        # tags), and an empty key would be inserted between every character.
        if polyphone_dict:
            words = sorted((w for w in polyphone_dict if w), key=len, reverse=True)
            if words:
                word_ptn = re.compile("|".join(re.escape(w) for w in words))
                text = word_ptn.sub(lambda m: polyphone_dict[m.group(0)], text)

        tone_list = []
        pieces = []
        char_count = 0  # length of the space-free output produced so far
        offset = 0

        # Matches tone markup such as <tone as=shu4>数</tone>
        tone_ptn = re.compile(r"<tone.*?as=(\w+).*?>(.*?)</tone>")

        for match in tone_ptn.finditer(text):
            # Spaces are dropped from the returned text, so they are removed
            # before counting; otherwise ``pos`` would point past the
            # character it is meant to identify.
            pre = text[offset:match.start()].replace(" ", "")
            tone_text = match.group(2).replace(" ", "")
            tone = match.group(1)

            pieces.append(pre)
            pieces.append(tone_text)
            char_count += len(pre) + len(tone_text)
            offset = match.end()

            if not tone_text:
                # An empty tag carries no character to re-pronounce.
                continue

            print("tone_text,tone", tone_text, tone)
            init, final = self.get_initial_final(tone_text, tone)
            data = [tone, init, final, char_count]
            print(data)
            tone_list.append(data)

        # Keep whatever follows the last tone tag.
        pieces.append(text[offset:].replace(" ", ""))

        return ''.join(pieces), tone_list

    def get_initial_final(self,wd, py):
        """
        Split a user-supplied pinyin reading into (initial, final).

        The candidate initials are taken from the first character of ``wd``
        (all heteronym readings). This is the delicate part of the feature:
        the split has to agree with what the default inference path produces,
        and apart from punctuation every result must have exactly two parts.

        Resolution order:
          1. a non-empty candidate initial that ``py`` starts with;
          2. if the character has an empty initial among its candidates, the
             first letter of ``py`` upper-cased and doubled, with ``py``
             itself as the final;
          3. otherwise the first letter / remainder of ``py``;
          4. ``('', '')`` when ``py`` has a single character.
        """
        # Candidate initials for the character
        candidates = pinyin(wd, heteronym=True, neutral_tone_with_five=True, style=Style.INITIALS,strict=False)[0]

        for candidate in candidates:
            if candidate != '' and py.startswith(candidate):
                return candidate, py[len(candidate):]

        if '' in candidates:
            doubled = py[0].upper() * 2
            return doubled, py

        if len(py) > 1:
            return py[0], py[1:]
        return '', ''

    def revise_custom_tone(self,phones, word2ph, tone_data_list):
        """
        Patch user-specified pronunciations into a phoneme list.

        Args:
            phones: Phoneme list for the text; modified in place.
            word2ph: Number of phonemes per character of the text.
            tone_data_list: Entries ``[tone, initial, final, pos]`` where
                ``pos`` is the 1-based position of the character to patch.

        Returns:
            The same ``phones`` list, with every applicable override applied.
            Entries that cannot be applied safely are skipped, leaving the
            default pronunciation in place.
        """
        for tone, init, final, pos in tone_data_list:
            if init == "" and final == "":
                # Pinyin matching failed earlier; keep the default reading.
                continue

            if pos < 1 or pos > len(word2ph):
                # Position does not fall inside the text.
                continue
            if word2ph[pos - 1] != 2:
                # The override writes an (initial, final) pair; a character
                # that does not own exactly two phonemes would have its
                # neighbour overwritten instead.
                continue

            wd_pos = sum(word2ph[:pos])
            if wd_pos < 2 or wd_pos > len(phones):
                # Guards against negative-index wrap-around and overruns.
                continue

            org_init = phones[wd_pos - 2]
            org_final = phones[wd_pos - 1]
            phones[wd_pos - 2] = init
            phones[wd_pos - 1] = final
            print(f"[+]成功修改读音: {org_init}{org_final} => {tone}")

        # Return only after all entries are processed, so every override is
        # applied and callers never receive None.
        return phones
KevinZhang19870314 commented 2 months ago

我觉得你可以提pr吧,貌似开个issue没什么用吧

KamioRinn commented 2 months ago

多音字可以直接在G2P插入字典进行实现,不用这么麻烦去绕一圈。字典加载方式参考pypinyin加载自定义字典,或者https://github.com/RVC-Boss/GPT-SoVITS/pull/488

einsqing commented 4 weeks ago

多音字可以直接在G2P插入字典进行实现,不用这么麻烦去绕一圈。字典加载方式参考pypinyin加载自定义字典,或者#488

如何简单实现的?我发现v2 多音字读的不对,想改一下