rhasspy / piper

A fast, local neural text to speech system
https://rhasspy.github.io/piper-samples/
MIT License

Loss of punctuation and inability to read numbers #63

Open shahizat opened 1 year ago

shahizat commented 1 year ago

Hi @synesthesiam,

This is not a critical issue, but I would appreciate your thoughts if you happen to know the answer.

I modified your python_run code so that synthesis returns raw numpy arrays (instead of WAV bytes), but as a result I lost the ability to synthesize numbers with the Kazakh model. The English model, however, works correctly.

I suspect I also lost the automatic punctuation handling. What do you think?

Here is the code:

import json
import logging
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from typing import List, Mapping, Optional, Sequence, Tuple, Union

import numpy as np
import onnxruntime
import sounddevice as sd
from espeak_phonemizer import Phonemizer

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)

_BOS = "^"
_EOS = "$"
_PAD = "_"

@dataclass
class PiperConfig:
    num_symbols: int
    num_speakers: int
    sample_rate: int
    espeak_voice: str
    length_scale: float
    noise_scale: float
    noise_w: float
    phoneme_id_map: Mapping[str, Sequence[int]]

class Piper:
    def __init__(
        self,
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ):
        if config_path is None:
            config_path = f"{model_path}.json"

        self.config = load_config(config_path)
        self.phonemizer = Phonemizer(self.config.espeak_voice)
        self.model = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=onnxruntime.SessionOptions(),
            providers=["CPUExecutionProvider"]
            if not use_cuda
            else ["CUDAExecutionProvider"],
        )

    def synthesize(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> Tuple[np.ndarray, int]:
        """Synthesize speech from text, returning (int16 audio samples, sample rate)."""
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        # Convert text to espeak phonemes, then map each phoneme (plus the
        # BOS/EOS/pad markers) to the model's phoneme ids.
        phonemes_str = self.phonemizer.phonemize(text)
        phonemes = [_BOS] + list(phonemes_str)
        phoneme_ids: List[int] = []

        for phoneme in phonemes:
            if phoneme not in self.config.phoneme_id_map:
                _LOGGER.warning("No phoneme id for %r, skipping", phoneme)
                continue

            phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
            phoneme_ids.extend(self.config.phoneme_id_map[_PAD])

        phoneme_ids.extend(self.config.phoneme_id_map[_EOS])

        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        # For multi-speaker models with no speaker specified, fall back to speaker 0:
        # if (self.config.num_speakers > 1) and (speaker_id is None):
        #     speaker_id = 0

        sid = None

        if speaker_id is not None:
            sid = np.array([speaker_id], dtype=np.int64)

        # Run inference with onnxruntime; the model output has shape (1, 1, num_samples)
        audio = self.model.run(
            None,
            {
                "input": phoneme_ids_array,
                "input_lengths": phoneme_ids_lengths,
                "scales": scales,
                "sid": sid,
            },
        )[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio)
        return audio, self.config.sample_rate

def load_config(config_path: Union[str, Path]) -> PiperConfig:
    with open(config_path, "r", encoding="utf-8") as config_file:
        config_dict = json.load(config_file)
        inference = config_dict.get("inference", {})

        return PiperConfig(
            num_symbols=config_dict["num_symbols"],
            num_speakers=config_dict["num_speakers"],
            sample_rate=config_dict["audio"]["sample_rate"],
            espeak_voice=config_dict["espeak"]["voice"],
            noise_scale=inference.get("noise_scale", 0.667),
            length_scale=inference.get("length_scale", 1.0),
            noise_w=inference.get("noise_w", 0.8),
            phoneme_id_map=config_dict["phoneme_id_map"],
        )

def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range"""
    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    audio_norm = audio_norm.astype("int16")
    return audio_norm

if __name__ == "__main__":
    model = "./kk-issai-high.onnx"
    speaker_id = 1
    voice = Piper(model)
    synthesize = partial(
        voice.synthesize,
        speaker_id=speaker_id,
        length_scale=None,
        noise_scale=None,
        noise_w=None,
    )
    text = "Шоқан Шыңғысұлы Уәлиханов (шын есімі Мұхаммед Қанафия; 1835 —1865)"
    audio_norm, sample_rate = synthesize(text)
    sd.play(audio_norm, sample_rate, blocking=True)
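
One thing that may explain the punctuation part: espeak_phonemizer drops clause-breaking punctuation from its output by default, so the phonemes for commas, periods, etc. never reach the model. A minimal check of that, assuming the Kazakh model's espeak voice is "kk" and that the installed espeak_phonemizer exposes the keep_clause_breakers flag:

from espeak_phonemizer import Phonemizer

phonemizer = Phonemizer("kk")
text = "Шоқан Шыңғысұлы Уәлиханов (шын есімі Мұхаммед Қанафия; 1835 —1865)"

# Compare phoneme output with and without punctuation (clause breakers) kept
print(phonemizer.phonemize(text))
print(phonemizer.phonemize(text, keep_clause_breakers=True))

If the second call keeps the punctuation symbols, passing keep_clause_breakers=True inside Piper.synthesize should restore the pauses, provided those symbols exist in the model's phoneme_id_map.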

Best regards, Shakhizat

synesthesiam commented 1 year ago

I can't see anything obviously wrong. What output do you get?

shahizat commented 1 year ago

Hi @synesthesiam, thanks for your reply. The Kazakh model cannot synthesize numbers; it basically skips them. However, the Russian and English models can synthesize numbers.
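
A quick way to narrow this down is to look at what the phonemizer itself produces for the numbers, since the model only ever sees phonemes. A minimal sketch, assuming "kk", "ru", and "en-us" are the espeak voices used by the respective models:

from espeak_phonemizer import Phonemizer

for voice in ("kk", "ru", "en-us"):
    phonemizer = Phonemizer(voice)
    # If a voice prints empty or truncated phonemes for "1835", the numbers
    # are being lost at the espeak stage rather than inside the model.
    print(voice, repr(phonemizer.phonemize("5")), repr(phonemizer.phonemize("1835")))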

shahizat commented 1 year ago

Hi @synesthesiam,

I've also observed that the command below, which uses your src/python_run example, also cannot read numbers with more than one digit, while your binary piper example can synthesize them.

echo '3 4 4 4 5 1835 1865 Шоқан Шыңғысұлы Уәлиханов шын есімі Мұхаммед Қанафия 1835 1865' | scripts/piper --model kk-issai-high.onnx --output_file welcome.wav --speaker 1

It reads the numbers 3 4 4 4 5, skips 1835 and 1865, and then continues with Шоқан Шыңғысұлы....
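
Since single digits come through but multi-digit numbers do not, a crude interim workaround on the python_run side is to expand such numbers before the text reaches the phonemizer. A purely illustrative sketch that just splits them into spoken digits (so "1835" is read digit by digit rather than as a year):

import re

def split_numbers(text: str) -> str:
    # Replace every number of two or more digits with its digits separated
    # by spaces, e.g. "1835" -> "1 8 3 5"
    return re.sub(r"\d{2,}", lambda m: " ".join(m.group(0)), text)

print(split_numbers("Шоқан Шыңғысұлы Уәлиханов шын есімі Мұхаммед Қанафия 1835 1865"))

A proper fix would convert the numbers to Kazakh number words instead, which is presumably what lets the binary piper example read them.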

Best regards, Shakhizat