axinc-ai / ailia-models

The collection of pre-trained, state-of-the-art AI models for ailia SDK

ADD soundchoice-g2p #1479

Closed: kyakuno closed this issue 4 months ago

kyakuno commented 5 months ago

An English grapheme-to-phoneme (G2P) model, used for preprocessing in T2S and similar pipelines. https://huggingface.co/speechbrain/soundchoice-g2p https://github.com/speechbrain/speechbrain/tree/main/recipes/LibriSpeech/G2P
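
For reference, the original model can be run directly through speechbrain before any export. A minimal sketch, assuming a recent speechbrain where the class lives in speechbrain/inference/text.py (per the Hugging Face model card):

from speechbrain.inference.text import GraphemeToPhoneme

# Download the pretrained checkpoint from Hugging Face and run G2P once.
g2p = GraphemeToPhoneme.from_hparams("speechbrain/soundchoice-g2p")
print(g2p("To be or not to be, that is the question"))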

kyakuno commented 5 months ago

The category is natural_language_processing.

kyakuno commented 5 months ago

@ooe1123 I am thinking this could be used for the English preprocessing in GPT-SoVITS. Would it be possible to ask you to take on this model?

ooe1123 commented 4 months ago

soundchoice-g2p_atn.onnx

〇 speechbrain/inference/text.py

class GraphemeToPhoneme(Pretrained, EncodeDecodePipelineMixin):
    ...
    def g2p(self, text):
        ...
        model_outputs = self.mods.model(**model_inputs)

class GraphemeToPhoneme(Pretrained, EncodeDecodePipelineMixin):
    ...
    def g2p(self, text):
        ...
        # Route forward through the tensor-only wrapper (forward_upd,
        # defined in speechbrain/lobes/models/g2p/model.py below).
        self.mods.model.forward_org = self.mods.model.forward
        self.mods.model.forward = self.mods.model.forward_upd
        model_outputs = self.mods.model(
            grapheme_encoded=model_inputs["grapheme_encoded"],
            word_emb=model_inputs["word_emb"],
        )
        if 1:
            class Exp(torch.nn.Module):
                def __init__(self, model):
                    super().__init__()
                    self.model = model
                def forward(self, grapheme_encoded, word_emb):
                    model_outputs = self.model(grapheme_encoded, word_emb)
                    return model_outputs[0], model_outputs[2], model_outputs[3]

            with torch.no_grad():
                print("------>")
                from torch.autograd import Variable
                model = Exp(self.mods.model)
                x = (model_inputs["grapheme_encoded"].data, model_inputs["word_emb"].data)
                torch.onnx.export(
                    model, x, 'soundchoice-g2p_atn.onnx',
                    input_names=["grapheme_encoded", "word_emb"],
                    output_names=["p_seq","encoder_out","w"],
                    dynamic_axes={
                        'grapheme_encoded': {1: 'len'}, 'word_emb': {1: 'len'},
                        'p_seq': {2: 'p_seq_len'}, 
                        'encoder_out': {1: 'len'}, 'w': {2: 'len'}},
                    verbose=False, opset_version=17
                )
                print("<------")
                exit()
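
One way to sanity-check the exported graph is to run it once with onnxruntime. A minimal sketch; the dummy shapes and dtypes (int64 grapheme ids, 768-dim word embeddings from a BERT-base backbone) are assumptions and should be matched against the actually traced tensors:

import numpy as np
import onnxruntime

# Feed dummy inputs and confirm the three declared outputs come back.
sess = onnxruntime.InferenceSession("soundchoice-g2p_atn.onnx")
grapheme_encoded = np.zeros((1, 10), dtype=np.int64)  # (batch, len) token ids
word_emb = np.zeros((1, 10, 768), dtype=np.float32)   # (batch, len, emb_dim)
p_seq, encoder_out, w = sess.run(
    ["p_seq", "encoder_out", "w"],
    {"grapheme_encoded": grapheme_encoded, "word_emb": word_emb},
)
print(p_seq.shape, encoder_out.shape, w.shape)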

〇 speechbrain/lobes/models/g2p/model.py

class AttentionSeq2Seq(nn.Module):
    ...
    def forward(
        self, grapheme_encoded, phn_encoded=None, word_emb=None, **kwargs
    ):
        ...

class AttentionSeq2Seq(nn.Module):
    ...
    def forward_upd(self, grapheme_encoded, word_emb):
        class AttrDict(dict):
            def __init__(self, *args, **kwargs):
                super(AttrDict, self).__init__(*args, **kwargs)
                self.__dict__ = self
            def __iter__(self):
                return iter((getattr(self, key) for key in self.keys()))

        lengths = torch.tensor([1]).to(grapheme_encoded.device)
        grapheme_encoded = AttrDict(data=grapheme_encoded, lengths=lengths)
        word_emb = AttrDict(data=word_emb, lengths=lengths)
        return self.forward_org(grapheme_encoded, word_emb=word_emb)

    def forward(
        self, grapheme_encoded, phn_encoded=None, word_emb=None, **kwargs
    ):
        ...
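
For context on the forward swap above: torch.onnx.export only traces tensors (or tuples of tensors), while the stock forward expects batch objects exposing .data and .lengths. forward_upd therefore accepts raw tensors, rewraps them in the dict-backed AttrDict so that both attribute access and tuple-style unpacking keep working inside the traced graph, and delegates to the saved forward_org. lengths is pinned to 1, i.e. every sequence is treated as unpadded, full length.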

ooe1123 commented 4 months ago

soundchoice-g2p_emb.onnx

〇 speechbrain/wordemb/transformer.py

class TransformerWordEmbeddings(nn.Module):
    ...
    def embeddings(self, sentence):
        ...
        with torch.no_grad():
            output = self.model(**self._to_device(encoded))

class TransformerWordEmbeddings(nn.Module):
    ...
    def embeddings(self, sentence):
        ...
        with torch.no_grad():
            if 1:
                class Exp(torch.nn.Module):
                    def __init__(self, model):
                        super().__init__()
                        self.model = model

                    def forward(self, input_ids, attention_mask, token_type_ids):
                        encoded = {
                            "input_ids": input_ids,
                            "attention_mask": attention_mask,
                            "token_type_ids": token_type_ids,
                        }
                        output = self.model(**encoded)
                        states = torch.stack(output.hidden_states)
                        return states

                print("------>")
                model = Exp(self.model)
                encoded = self._to_device(encoded)
                x = (encoded["input_ids"], encoded["attention_mask"], encoded["token_type_ids"])
                torch.onnx.export(
                    model, x, 'soundchoice-g2p_emb.onnx',
                    input_names=["input_ids", "attention_mask", "token_type_ids"],
                    output_names=["hidden_states"],
                    dynamic_axes={'input_ids': {1: 'len'}, 'attention_mask': {1: 'len'}, 'token_type_ids': {1: 'len'}, 'hidden_states': {2: 'len'}},
                    verbose=False, opset_version=17
                )
                print("<------")
                exit()
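
The embedding export can be smoke-tested the same way. A minimal sketch; the sequence length is arbitrary (that axis is dynamic in the export), and the expected 13-entry hidden-state stack (12 BERT-base layers plus the embedding output) is an assumption about the backbone:

import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession("soundchoice-g2p_emb.onnx")
seq = 8
feeds = {
    "input_ids": np.ones((1, seq), dtype=np.int64),
    "attention_mask": np.ones((1, seq), dtype=np.int64),
    "token_type_ids": np.zeros((1, seq), dtype=np.int64),
}
(hidden_states,) = sess.run(["hidden_states"], feeds)
print(hidden_states.shape)  # e.g. (13, 1, seq, 768) for BERT-base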

rnn_beam_searcher.onnx

〇 speechbrain/decoders/seq2seq.py

class S2SBeamSearcher(S2SBaseSearcher):
    ...
    def _attn_weight_step(
        self, inp_tokens, memory, enc_states, enc_lens, attn, log_probs
    ):
        if self.attn_weight > 0:
            log_probs, memory, attn = self.forward_step(
                inp_tokens, memory, enc_states, enc_lens
            )

class S2SBeamSearcher(S2SBaseSearcher):
    ...
    def _attn_weight_step(
        self, inp_tokens, memory, enc_states, enc_lens, attn, log_probs
    ):
        if self.attn_weight > 0:
            if 1:
                class Exp(torch.nn.Module):
                    def __init__(self, model):
                        super().__init__()
                        self.model = model

                    def forward(self, inp_tokens, hs, c, enc_states, enc_lens):
                        memory = (hs, c)
                        log_probs, memory, attn = self.model.forward_step(
                            inp_tokens, memory, enc_states, enc_lens
                        )
                        hs, c = memory
                        return (log_probs, hs, c, attn)

                with torch.no_grad():
                    print("------>")
                    from torch.autograd import Variable
                    model = Exp(self)
                    hs, c = memory
                    x = (inp_tokens, torch.zeros((4, 16, 512)).to(c.device), c, enc_states, enc_lens)
                    torch.onnx.export(
                        model, x, 'rnn_beam_searcher.onnx',
                        input_names=["inp_tokens", "in_hs", "in_c", "enc_states", "enc_lens"],
                        output_names=["log_probs", "hs", "c", "attn"],
                        dynamic_axes={'inp_tokens': [0], "in_hs": [0, 1], "in_c":[0], "enc_states":[0, 1], 'enc_lens': [0], "log_probs":[0,1], "hs":[0, 1], "c":[0], "attn":[0,1]},
                        verbose=False, opset_version=17
                    )
                    print("<------")
                    exit()
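
Note that only a single decoder step is exported: the beam-search control flow (hypothesis scoring, pruning, EOS handling) stays outside the graph and calls the step repeatedly, carrying hs and c between calls; the torch.zeros((4, 16, 512)) placeholder corresponds to a 4-layer recurrent state at beam width 16 with hidden size 512. A minimal sketch of driving one step through onnxruntime; every shape and dtype here is an assumption to be checked against the traced tensors:

import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession("rnn_beam_searcher.onnx")
beam, enc_len = 16, 50  # beam width from the export call; encoder length arbitrary
log_probs, hs, c, attn = sess.run(
    ["log_probs", "hs", "c", "attn"],
    {
        "inp_tokens": np.zeros((beam,), dtype=np.int64),
        "in_hs": np.zeros((4, beam, 512), dtype=np.float32),
        "in_c": np.zeros((beam, 512), dtype=np.float32),  # context dim assumed 512
        "enc_states": np.zeros((beam, enc_len, 512), dtype=np.float32),
        "enc_lens": np.full((beam,), enc_len, dtype=np.int64),
    },
)
# A real decode loop would feed hs/c back in and pick the next inp_tokens
# from log_probs until EOS.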