Closed: kyakuno closed this issue 4 months ago.
Category: natural_language_processing.
@ooe1123 I am thinking this could be used for the English preprocessing in GPT-SoVITS. Could I ask you to take on this model?
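For reference, a minimal driver that triggers the patched code paths below (a sketch following the speechbrain documentation; the savedir is arbitrary). Each export block below writes its ONNX file and then calls exit(), so this is run once per patch:

import onnxruntime  # noqa: F401  (only needed later, for checking the exports)
from speechbrain.inference.text import GraphemeToPhoneme

g2p = GraphemeToPhoneme.from_hparams(
    source="speechbrain/soundchoice-g2p",
    savedir="pretrained_models/soundchoice-g2p",
)
print(g2p("To be or not to be, that is the question"))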
〇 speechbrain/inference/text.py
class GraphemeToPhoneme(Pretrained, EncodeDecodePipelineMixin):
    ...
    def g2p(self, text):
        ...
        model_outputs = self.mods.model(**model_inputs)
↓
class GraphemeToPhoneme(Pretrained, EncodeDecodePipelineMixin):
    ...
    def g2p(self, text):
        ...
        # route forward() through the tensor-only wrapper forward_upd
        # (added to AttentionSeq2Seq in the model.py patch below)
        self.mods.model.forward_org = self.mods.model.forward
        self.mods.model.forward = self.mods.model.forward_upd
        model_outputs = self.mods.model(
            grapheme_encoded=model_inputs["grapheme_encoded"],
            word_emb=model_inputs["word_emb"],
        )
        if 1:
            class Exp(torch.nn.Module):
                def __init__(self, model):
                    super().__init__()
                    self.model = model

                def forward(self, grapheme_encoded, word_emb):
                    model_outputs = self.model(grapheme_encoded, word_emb)
                    # keep only the outputs named below: p_seq, encoder_out, w
                    return model_outputs[0], model_outputs[2], model_outputs[3]

            with torch.no_grad():
                print("------>")
                model = Exp(self.mods.model)
                x = (model_inputs["grapheme_encoded"].data, model_inputs["word_emb"].data)
                torch.onnx.export(
                    model, x, 'soundchoice-g2p_atn.onnx',
                    input_names=["grapheme_encoded", "word_emb"],
                    output_names=["p_seq", "encoder_out", "w"],
                    dynamic_axes={
                        'grapheme_encoded': {1: 'len'}, 'word_emb': {1: 'len'},
                        'p_seq': {2: 'p_seq_len'},
                        'encoder_out': {1: 'len'}, 'w': {2: 'len'}},
                    verbose=False, opset_version=17
                )
                print("<------")
                exit()
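The exported attention model can then be sanity-checked with onnxruntime. A minimal sketch; the int64 grapheme IDs and the 768-dim word embeddings are assumptions based on the inputs traced above:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("soundchoice-g2p_atn.onnx")
seq_len = 12  # dynamic axis 'len'
feed = {
    "grapheme_encoded": np.zeros((1, seq_len), dtype=np.int64),  # grapheme token IDs
    "word_emb": np.zeros((1, seq_len, 768), dtype=np.float32),   # per-grapheme BERT embeddings
}
p_seq, encoder_out, w = sess.run(["p_seq", "encoder_out", "w"], feed)
print(p_seq.shape, encoder_out.shape, w.shape)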
〇 speechbrain/lobes/models/g2p/model.py
class AttentionSeq2Seq(nn.Module):
    ...
    def forward(
        self, grapheme_encoded, phn_encoded=None, word_emb=None, **kwargs
    ):
        ...
↓
class AttentionSeq2Seq(nn.Module):
    ...
    def forward_upd(self, grapheme_encoded, word_emb):
        class AttrDict(dict):
            def __init__(self, *args, **kwargs):
                super(AttrDict, self).__init__(*args, **kwargs)
                self.__dict__ = self

            def __iter__(self):
                # allow "data, lengths = batch"-style unpacking
                return iter(getattr(self, key) for key in self.keys())

        lengths = torch.tensor([1]).to(grapheme_encoded.device)
        grapheme_encoded = AttrDict(data=grapheme_encoded, lengths=lengths)
        word_emb = AttrDict(data=word_emb, lengths=lengths)
        return self.forward_org(grapheme_encoded, word_emb=word_emb)

    def forward(
        self, grapheme_encoded, phn_encoded=None, word_emb=None, **kwargs
    ):
        ...
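The AttrDict shim exists because the original forward expects speechbrain batch fields that support both attribute access (.data / .lengths) and tuple unpacking; the custom __iter__ supplies the unpacking. A standalone illustration of that behavior:

import torch

class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self

    def __iter__(self):
        return iter(getattr(self, key) for key in self.keys())

g = AttrDict(data=torch.zeros(1, 5), lengths=torch.tensor([1.0]))
print(g.data.shape)   # attribute access, as the original forward uses
data, lengths = g     # tuple unpacking, enabled by __iter__
print(data.shape, lengths)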
〇 speechbrain/wordemb/transformer.py
class TransformerWordEmbeddings(nn.Module):
    ...
    def embeddings(self, sentence):
        ...
        with torch.no_grad():
            output = self.model(**self._to_device(encoded))
↓
class TransformerWordEmbeddings(nn.Module):
    ...
    def embeddings(self, sentence):
        ...
        with torch.no_grad():
            if 1:
                class Exp(torch.nn.Module):
                    def __init__(self, model):
                        super().__init__()
                        self.model = model

                    def forward(self, input_ids, attention_mask, token_type_ids):
                        encoded = {
                            "input_ids": input_ids,
                            "attention_mask": attention_mask,
                            "token_type_ids": token_type_ids,
                        }
                        output = self.model(**encoded)
                        # stack the per-layer hidden states into one tensor
                        states = torch.stack(output.hidden_states)
                        return states

                print("------>")
                model = Exp(self.model)
                encoded = self._to_device(encoded)
                x = (encoded["input_ids"], encoded["attention_mask"], encoded["token_type_ids"])
                torch.onnx.export(
                    model, x, 'soundchoice-g2p_emb.onnx',
                    input_names=["input_ids", "attention_mask", "token_type_ids"],
                    output_names=["hidden_states"],
                    dynamic_axes={
                        'input_ids': {1: 'len'}, 'attention_mask': {1: 'len'},
                        'token_type_ids': {1: 'len'}, 'hidden_states': {2: 'len'}},
                    verbose=False, opset_version=17
                )
                print("<------")
                exit()
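The embedding export can be checked the same way. The input_ids / attention_mask / token_type_ids feeds follow the usual BERT convention; the (13, 1, len, 768) output shape is an assumption for a bert-base backbone (embedding layer plus 12 encoder layers):

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("soundchoice-g2p_emb.onnx")
seq_len = 8
feed = {
    "input_ids": np.zeros((1, seq_len), dtype=np.int64),
    "attention_mask": np.ones((1, seq_len), dtype=np.int64),
    "token_type_ids": np.zeros((1, seq_len), dtype=np.int64),
}
(hidden_states,) = sess.run(["hidden_states"], feed)
print(hidden_states.shape)  # expected (13, 1, seq_len, 768) for bert-base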
〇 speechbrain/decoders/seq2seq.py
class S2SBeamSearcher(S2SBaseSearcher):
    ...
    def _attn_weight_step(
        self, inp_tokens, memory, enc_states, enc_lens, attn, log_probs
    ):
        if self.attn_weight > 0:
            log_probs, memory, attn = self.forward_step(
                inp_tokens, memory, enc_states, enc_lens
            )
↓
class S2SBeamSearcher(S2SBaseSearcher):
    ...
    def _attn_weight_step(
        self, inp_tokens, memory, enc_states, enc_lens, attn, log_probs
    ):
        if self.attn_weight > 0:
            if 1:
                class Exp(torch.nn.Module):
                    def __init__(self, model):
                        super().__init__()
                        self.model = model

                    def forward(self, inp_tokens, hs, c, enc_states, enc_lens):
                        # repack the (hidden, context) tuple the searcher expects
                        memory = (hs, c)
                        log_probs, memory, attn = self.model.forward_step(
                            inp_tokens, memory, enc_states, enc_lens
                        )
                        hs, c = memory
                        return (log_probs, hs, c, attn)

                with torch.no_grad():
                    print("------>")
                    model = Exp(self)
                    hs, c = memory
                    # hs is None on the first step, so substitute zeros of the right shape
                    x = (inp_tokens, torch.zeros((4, 16, 512)).to(c.device), c, enc_states, enc_lens)
                    torch.onnx.export(
                        model, x, 'rnn_beam_searcher.onnx',
                        input_names=["inp_tokens", "in_hs", "in_c", "enc_states", "enc_lens"],
                        output_names=["log_probs", "hs", "c", "attn"],
                        dynamic_axes={
                            'inp_tokens': [0], 'in_hs': [0, 1], 'in_c': [0],
                            'enc_states': [0, 1], 'enc_lens': [0],
                            'log_probs': [0, 1], 'hs': [0, 1], 'c': [0], 'attn': [0, 1]},
                        verbose=False, opset_version=17
                    )
                    print("<------")
                    exit()
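With the decoder step exported, a single-hypothesis decode loop can drive it from Python by carrying hs / c between steps. A sketch only: the bos/eos IDs, the 4-layer 512-dim hidden shape, and the encoder dimensions are assumptions taken from the zeros tensor above, and the real pipeline uses speechbrain's beam search rather than this greedy loop:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("rnn_beam_searcher.onnx")

BOS_ID, EOS_ID = 0, 1                                  # hypothetical token IDs
enc_states = np.zeros((1, 12, 512), dtype=np.float32)  # encoder_out from soundchoice-g2p_atn.onnx
enc_lens = np.ones((1,), dtype=np.float32)             # relative lengths
inp_tokens = np.array([BOS_ID], dtype=np.int64)
hs = np.zeros((4, 1, 512), dtype=np.float32)           # RNN hidden state (layers, batch, dim)
c = np.zeros((1, 512), dtype=np.float32)               # attention context vector

tokens = []
for _ in range(100):
    log_probs, hs, c, attn = sess.run(
        ["log_probs", "hs", "c", "attn"],
        {"inp_tokens": inp_tokens, "in_hs": hs, "in_c": c,
         "enc_states": enc_states, "enc_lens": enc_lens},
    )
    token = int(log_probs[0].argmax())  # greedy pick per step
    if token == EOS_ID:
        break
    tokens.append(token)
    inp_tokens = np.array([token], dtype=np.int64)
print(tokens)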
An English grapheme-to-phoneme model, used for preprocessing in T2S and similar pipelines.
https://huggingface.co/speechbrain/soundchoice-g2p
https://github.com/speechbrain/speechbrain/tree/main/recipes/LibriSpeech/G2P