RVC-Boss / GPT-SoVITS

1 min voice data can also be used to train a good TTS model! (few shot voice cloning)
MIT License
32.05k stars 3.69k forks source link

请问一下,api_v2.py和api_v3.py怎么用呀? #1201

Open lckj2009 opened 2 months ago

lckj2009 commented 2 months ago

能否给个教程

Separatee commented 2 months ago

你但凡点开看看源码就不会问了 源码里有使用说明.

xiaokang00010 commented 1 month ago

自己随手写了个Python binding,可以参考一下

import requests
import urllib.parse

import logger

class GPTSoVitsAPI():
    """
    Thin HTTP client for a GPT-SoVITS inference server.

    Two endpoint layouts are supported: the classic v1 API (JSON POST to the
    server root) and the v3 API from the fast_inference_ branch (GET on
    ``/tts`` with query parameters).

    Attributes:
        api_url (str): Url of the GPTSoVits API.
        usingV3 (bool): whether using APIv3.py from the fast_inference_ branch.
        ttsInferYamlPath (str): TTS inference YAML path. Necessary for v3 API.

    Methods:
        tts_v1(...) -> requests.Response: Run TTS inference using the classic v1 API.
        build_tts_v3_request(...) -> str: Build the full v3 ``/tts`` request URL without sending it.
        tts_v3(...) -> requests.Response: Run TTS inference using the v3 API from the fast_inference_ branch.
        tts(...) -> requests.Response: Run TTS inference based on the API version.
        changeReferenceAudio(ref_audio, ref_text, ref_language='auto') -> None: Change reference audio.
        control(command: str): Restart or exit.
    """

    def __init__(self, api_url: str, isTTSv3: bool = False, ttsInferYamlPath: str = "") -> None:
        """
        Initialize the GPTSoVitsAPI.

        Args:
            api_url (str): Url of the GPTSoVits API. Endpoint paths are appended
                as ``{api_url}/...``, so pass it without a trailing slash.
            isTTSv3 (bool): whether using APIv3.py from the fast_inference_ branch. Defaults to False.
            ttsInferYamlPath (str): TTS inference YAML path. Necessary for v3 API.
        """
        self.api_url = api_url
        self.usingV3 = isTTSv3
        self.ttsInferYamlPath = ttsInferYamlPath
        if isTTSv3:
            logger.Logger.log('Using v3 API from the fast_inference_ branch')
        else:
            logger.Logger.log('Using classic v1 API from the main branch')

    def tts_v1(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> requests.Response:
        """
        Run TTS inference using the classic v1 API.

        Sends a JSON POST to ``{api_url}/`` with ``stream=True``, so the body
        is not downloaded until the caller reads it.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            requests.Response: Response object from the API.
        """
        return requests.post(f'{self.api_url}/', json={
            "refer_wav_path": ref_audio,
            "prompt_text": ref_text,
            "prompt_language": ref_language,
            "text": text,
            "text_language": text_language
        }, stream=True)

    def _tts_v3_params(self, ref_audio: str, ref_text: str, text: str, ref_language: str, text_language: str) -> dict:
        """Return the query parameters shared by tts_v3 and build_tts_v3_request."""
        # Single source of truth so the URL builder and the actual request
        # can never drift apart. Key order is kept stable for the built URL.
        return {
            "text": text,
            "text_lang": text_language,
            "ref_audio_path": ref_audio,
            "prompt_text": ref_text,
            "prompt_lang": ref_language,
            "media_type": "aac",
            "streaming_mode": True,
            "parallel_infer": True,
            "tts_infer_yaml_path": self.ttsInferYamlPath
        }

    def build_tts_v3_request(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> str:
        """
        Build the request for the v3 API.

        Nothing is sent; the URL-encoded ``/tts`` request is only assembled.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            str: Request string for the v3 API.
        """
        # Encode outside the f-string: a multi-line expression inside a
        # single-quoted f-string is a SyntaxError before Python 3.12.
        query = urllib.parse.urlencode(
            self._tts_v3_params(ref_audio, ref_text, text, ref_language, text_language)
        )
        return f'{self.api_url}/tts?{query}'

    def tts_v3(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> requests.Response:
        """
        Run TTS inference using the v3 API from the fast_inference_ branch.

        Sends a GET to ``{api_url}/tts``. The request asks the server for
        streaming output (``streaming_mode=True``), so the response is opened
        with ``stream=True`` as in tts_v1; read it via ``iter_content`` or
        ``content``.

        Args:
            ref_audio (str): Reference audio path. Should be less than 10 seconds long.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            requests.Response: Response object from the API.
        """
        return requests.get(
            f'{self.api_url}/tts',
            params=self._tts_v3_params(ref_audio, ref_text, text, ref_language, text_language),
            stream=True,
        )

    def tts(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> requests.Response:
        """
        Run TTS inference based on the API version.

        Dispatches to tts_v3 when the instance was created with isTTSv3=True,
        otherwise to tts_v1.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            requests.Response: Response object from the API.
        """
        if self.usingV3:
            return self.tts_v3(ref_audio, ref_text, text, ref_language, text_language)
        return self.tts_v1(ref_audio, ref_text, text, ref_language, text_language)

    def changeReferenceAudio(self, ref_audio: str, ref_text: str, ref_language: str = 'auto') -> None:
        """
        Change reference audio.

        Sends a JSON POST to ``{api_url}/change_ref``.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.

        Raises:
            RuntimeError: If the server answers with an HTTP error status (4xx/5xx).
        """
        r = requests.post(f'{self.api_url}/change_ref', json={
            "refer_wav_path": ref_audio,
            "prompt_text": ref_text,
            "prompt_language": ref_language
        })
        # Treat every error status as a failure, not only 400, so that e.g. a
        # missing endpoint (404) or a server error (500) is not silently ignored.
        if not r.ok:
            raise RuntimeError(f'{__name__}: Failed to change reference audio')

    def control(self, command: str):
        """
        Restart or exit.

        Sends a JSON POST ``{"command": command}`` to ``{api_url}/control``.
        The response is discarded.

        Args:
            command (str): Control command forwarded verbatim to the server.
        """
        requests.post(f'{self.api_url}/control', json={
            "command": command})
lckj2009 commented 1 month ago

自己随手写了个Python binding,可以参考一下

import requests
import urllib.parse

import logger

class GPTSoVitsAPI():
    """
    Thin HTTP client for a GPT-SoVITS inference server.

    Two endpoint layouts are supported: the classic v1 API (JSON POST to the
    server root) and the v3 API from the fast_inference_ branch (GET on
    ``/tts`` with query parameters).

    Attributes:
        api_url (str): Url of the GPTSoVits API.
        usingV3 (bool): whether using APIv3.py from the fast_inference_ branch.
        ttsInferYamlPath (str): TTS inference YAML path. Necessary for v3 API.

    Methods:
        tts_v1(...) -> requests.Response: Run TTS inference using the classic v1 API.
        build_tts_v3_request(...) -> str: Build the full v3 ``/tts`` request URL without sending it.
        tts_v3(...) -> requests.Response: Run TTS inference using the v3 API from the fast_inference_ branch.
        tts(...) -> requests.Response: Run TTS inference based on the API version.
        changeReferenceAudio(ref_audio, ref_text, ref_language='auto') -> None: Change reference audio.
        control(command: str): Restart or exit.
    """

    def __init__(self, api_url: str, isTTSv3: bool = False, ttsInferYamlPath: str = "") -> None:
        """
        Initialize the GPTSoVitsAPI.

        Args:
            api_url (str): Url of the GPTSoVits API. Endpoint paths are appended
                as ``{api_url}/...``, so pass it without a trailing slash.
            isTTSv3 (bool): whether using APIv3.py from the fast_inference_ branch. Defaults to False.
            ttsInferYamlPath (str): TTS inference YAML path. Necessary for v3 API.
        """
        self.api_url = api_url
        self.usingV3 = isTTSv3
        self.ttsInferYamlPath = ttsInferYamlPath
        if isTTSv3:
            logger.Logger.log('Using v3 API from the fast_inference_ branch')
        else:
            logger.Logger.log('Using classic v1 API from the main branch')

    def tts_v1(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> requests.Response:
        """
        Run TTS inference using the classic v1 API.

        Sends a JSON POST to ``{api_url}/`` with ``stream=True``, so the body
        is not downloaded until the caller reads it.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            requests.Response: Response object from the API.
        """
        return requests.post(f'{self.api_url}/', json={
            "refer_wav_path": ref_audio,
            "prompt_text": ref_text,
            "prompt_language": ref_language,
            "text": text,
            "text_language": text_language
        }, stream=True)

    def _tts_v3_params(self, ref_audio: str, ref_text: str, text: str, ref_language: str, text_language: str) -> dict:
        """Return the query parameters shared by tts_v3 and build_tts_v3_request."""
        # Single source of truth so the URL builder and the actual request
        # can never drift apart. Key order is kept stable for the built URL.
        return {
            "text": text,
            "text_lang": text_language,
            "ref_audio_path": ref_audio,
            "prompt_text": ref_text,
            "prompt_lang": ref_language,
            "media_type": "aac",
            "streaming_mode": True,
            "parallel_infer": True,
            "tts_infer_yaml_path": self.ttsInferYamlPath
        }

    def build_tts_v3_request(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> str:
        """
        Build the request for the v3 API.

        Nothing is sent; the URL-encoded ``/tts`` request is only assembled.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            str: Request string for the v3 API.
        """
        # Encode outside the f-string: a multi-line expression inside a
        # single-quoted f-string is a SyntaxError before Python 3.12.
        query = urllib.parse.urlencode(
            self._tts_v3_params(ref_audio, ref_text, text, ref_language, text_language)
        )
        return f'{self.api_url}/tts?{query}'

    def tts_v3(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> requests.Response:
        """
        Run TTS inference using the v3 API from the fast_inference_ branch.

        Sends a GET to ``{api_url}/tts``. The request asks the server for
        streaming output (``streaming_mode=True``), so the response is opened
        with ``stream=True`` as in tts_v1; read it via ``iter_content`` or
        ``content``.

        Args:
            ref_audio (str): Reference audio path. Should be less than 10 seconds long.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            requests.Response: Response object from the API.
        """
        return requests.get(
            f'{self.api_url}/tts',
            params=self._tts_v3_params(ref_audio, ref_text, text, ref_language, text_language),
            stream=True,
        )

    def tts(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> requests.Response:
        """
        Run TTS inference based on the API version.

        Dispatches to tts_v3 when the instance was created with isTTSv3=True,
        otherwise to tts_v1.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            requests.Response: Response object from the API.
        """
        if self.usingV3:
            return self.tts_v3(ref_audio, ref_text, text, ref_language, text_language)
        return self.tts_v1(ref_audio, ref_text, text, ref_language, text_language)

    def changeReferenceAudio(self, ref_audio: str, ref_text: str, ref_language: str = 'auto') -> None:
        """
        Change reference audio.

        Sends a JSON POST to ``{api_url}/change_ref``.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.

        Raises:
            RuntimeError: If the server answers with an HTTP error status (4xx/5xx).
        """
        r = requests.post(f'{self.api_url}/change_ref', json={
            "refer_wav_path": ref_audio,
            "prompt_text": ref_text,
            "prompt_language": ref_language
        })
        # Treat every error status as a failure, not only 400, so that e.g. a
        # missing endpoint (404) or a server error (500) is not silently ignored.
        if not r.ok:
            raise RuntimeError(f'{__name__}: Failed to change reference audio')

    def control(self, command: str):
        """
        Restart or exit.

        Sends a JSON POST ``{"command": command}`` to ``{api_url}/control``.
        The response is discarded.

        Args:
            command (str): Control command forwarded verbatim to the server.
        """
        requests.post(f'{self.api_url}/control', json={
            "command": command})

你好,这个跟网上的 api_v2.py 好像不同,不是通过参考音频和文字进行推理的。api_v2.py 和 api_v3.py 到底是做什么用的,我还不明白。

Separatee commented 1 month ago

自己随手写了个Python binding,可以参考一下

import requests
import urllib.parse

import logger

class GPTSoVitsAPI():
    """
    Thin HTTP client for a GPT-SoVITS inference server.

    Two endpoint layouts are supported: the classic v1 API (JSON POST to the
    server root) and the v3 API from the fast_inference_ branch (GET on
    ``/tts`` with query parameters).

    Attributes:
        api_url (str): Url of the GPTSoVits API.
        usingV3 (bool): whether using APIv3.py from the fast_inference_ branch.
        ttsInferYamlPath (str): TTS inference YAML path. Necessary for v3 API.

    Methods:
        tts_v1(...) -> requests.Response: Run TTS inference using the classic v1 API.
        build_tts_v3_request(...) -> str: Build the full v3 ``/tts`` request URL without sending it.
        tts_v3(...) -> requests.Response: Run TTS inference using the v3 API from the fast_inference_ branch.
        tts(...) -> requests.Response: Run TTS inference based on the API version.
        changeReferenceAudio(ref_audio, ref_text, ref_language='auto') -> None: Change reference audio.
        control(command: str): Restart or exit.
    """

    def __init__(self, api_url: str, isTTSv3: bool = False, ttsInferYamlPath: str = "") -> None:
        """
        Initialize the GPTSoVitsAPI.

        Args:
            api_url (str): Url of the GPTSoVits API. Endpoint paths are appended
                as ``{api_url}/...``, so pass it without a trailing slash.
            isTTSv3 (bool): whether using APIv3.py from the fast_inference_ branch. Defaults to False.
            ttsInferYamlPath (str): TTS inference YAML path. Necessary for v3 API.
        """
        self.api_url = api_url
        self.usingV3 = isTTSv3
        self.ttsInferYamlPath = ttsInferYamlPath
        if isTTSv3:
            logger.Logger.log('Using v3 API from the fast_inference_ branch')
        else:
            logger.Logger.log('Using classic v1 API from the main branch')

    def tts_v1(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> requests.Response:
        """
        Run TTS inference using the classic v1 API.

        Sends a JSON POST to ``{api_url}/`` with ``stream=True``, so the body
        is not downloaded until the caller reads it.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            requests.Response: Response object from the API.
        """
        return requests.post(f'{self.api_url}/', json={
            "refer_wav_path": ref_audio,
            "prompt_text": ref_text,
            "prompt_language": ref_language,
            "text": text,
            "text_language": text_language
        }, stream=True)

    def _tts_v3_params(self, ref_audio: str, ref_text: str, text: str, ref_language: str, text_language: str) -> dict:
        """Return the query parameters shared by tts_v3 and build_tts_v3_request."""
        # Single source of truth so the URL builder and the actual request
        # can never drift apart. Key order is kept stable for the built URL.
        return {
            "text": text,
            "text_lang": text_language,
            "ref_audio_path": ref_audio,
            "prompt_text": ref_text,
            "prompt_lang": ref_language,
            "media_type": "aac",
            "streaming_mode": True,
            "parallel_infer": True,
            "tts_infer_yaml_path": self.ttsInferYamlPath
        }

    def build_tts_v3_request(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> str:
        """
        Build the request for the v3 API.

        Nothing is sent; the URL-encoded ``/tts`` request is only assembled.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            str: Request string for the v3 API.
        """
        # Encode outside the f-string: a multi-line expression inside a
        # single-quoted f-string is a SyntaxError before Python 3.12.
        query = urllib.parse.urlencode(
            self._tts_v3_params(ref_audio, ref_text, text, ref_language, text_language)
        )
        return f'{self.api_url}/tts?{query}'

    def tts_v3(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> requests.Response:
        """
        Run TTS inference using the v3 API from the fast_inference_ branch.

        Sends a GET to ``{api_url}/tts``. The request asks the server for
        streaming output (``streaming_mode=True``), so the response is opened
        with ``stream=True`` as in tts_v1; read it via ``iter_content`` or
        ``content``.

        Args:
            ref_audio (str): Reference audio path. Should be less than 10 seconds long.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            requests.Response: Response object from the API.
        """
        return requests.get(
            f'{self.api_url}/tts',
            params=self._tts_v3_params(ref_audio, ref_text, text, ref_language, text_language),
            stream=True,
        )

    def tts(self, ref_audio: str, ref_text: str, text: str, ref_language: str = 'auto', text_language: str = 'auto') -> requests.Response:
        """
        Run TTS inference based on the API version.

        Dispatches to tts_v3 when the instance was created with isTTSv3=True,
        otherwise to tts_v1.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            text (str): Text to be synthesized.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.
            text_language (str, optional): Text language. Defaults to 'auto'.

        Returns:
            requests.Response: Response object from the API.
        """
        if self.usingV3:
            return self.tts_v3(ref_audio, ref_text, text, ref_language, text_language)
        return self.tts_v1(ref_audio, ref_text, text, ref_language, text_language)

    def changeReferenceAudio(self, ref_audio: str, ref_text: str, ref_language: str = 'auto') -> None:
        """
        Change reference audio.

        Sends a JSON POST to ``{api_url}/change_ref``.

        Args:
            ref_audio (str): Reference audio path.
            ref_text (str): Reference text.
            ref_language (str, optional): Reference audio language. Defaults to 'auto'.

        Raises:
            RuntimeError: If the server answers with an HTTP error status (4xx/5xx).
        """
        r = requests.post(f'{self.api_url}/change_ref', json={
            "refer_wav_path": ref_audio,
            "prompt_text": ref_text,
            "prompt_language": ref_language
        })
        # Treat every error status as a failure, not only 400, so that e.g. a
        # missing endpoint (404) or a server error (500) is not silently ignored.
        if not r.ok:
            raise RuntimeError(f'{__name__}: Failed to change reference audio')

    def control(self, command: str):
        """
        Restart or exit.

        Sends a JSON POST ``{"command": command}`` to ``{api_url}/control``.
        The response is discarded.

        Args:
            command (str): Control command forwarded verbatim to the server.
        """
        requests.post(f'{self.api_url}/control', json={
            "command": command})

你好,这个跟网上的 api_v2.py 好像不同,不是通过参考音频和文字进行推理的。api_v2.py 和 api_v3.py 到底是做什么用的,我还不明白。

很简单,就是 api.py 的更新版本,但出于测试用途,各个版本都被保留了下来。毕竟 fast_inference_ 也不是主分支。