openvpi / DiffSinger

An advanced singing voice synthesis system with high fidelity, expressiveness, controllability and flexibility based on DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
Apache License 2.0
2.69k stars 283 forks source link

running without mel2ph raises an error #39

Closed Victor-Chow closed 1 year ago

Victor-Chow commented 1 year ago
inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }
target = "/content/DiffSinger/infer_out/小酒窝.wav"
ds.DiffSingerE2EInfer.example_run(inp, target=target)
Audio(filename=target)

Output:

| load phoneme set: ['AP', 'E', 'En', 'SP', 'a', 'ai', 'an', 'ang', 'ao', 'b', 'c', 'ch', 'd', 'e', 'ei', 'en', 'eng', 'er', 'f', 'g', 'h', 'i', 'i0', 'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'ir', 'iu', 'j', 'k', 'l', 'm', 'n', 'o', 'ong', 'ou', 'p', 'q', 'r', 's', 'sh', 't', 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn', 'w', 'x', 'y', 'z', 'zh']
| load 'model' from 'checkpoints/0116_female_triplet_ds1000/model_ckpt_steps_320000.ckpt'.
| Load HifiGAN:  checkpoints/nsf_hifigan/model
Removing weight norm...
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
[<ipython-input-2-af10976b8c2b>](https://localhost:8080/#) in <module>
      6     }
      7 target = "/content/DiffSinger/infer_out/小酒窝.wav"
----> 8 ds.DiffSingerE2EInfer.example_run(inp, target=target)
      9 Audio(filename=target)

7 frames
[/content/DiffSinger/basics/base_svs_infer.py](https://localhost:8080/#) in example_run(cls, inp, target)
    156         # call the model
    157         infer_ins = cls(hparams)
--> 158         out = infer_ins.infer_once(inp)
    159 
    160         # output to file

[/content/DiffSinger/basics/base_svs_infer.py](https://localhost:8080/#) in infer_once(self, inp, return_mel)
    145     def infer_once(self, inp, return_mel=False):
    146         inp = self.preprocess_input(inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
--> 147         output = self.forward_model(inp, return_mel=return_mel)
    148         output = self.postprocess_output(output)
    149         return output

[/content/DiffSinger/inference/ds_e2e.py](https://localhost:8080/#) in forward_model(self, inp, return_mel)
    151         spk_id = sample.get('spk_ids')
    152         with torch.no_grad():
--> 153             output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
    154                                 pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
    155                                 is_slur=sample['is_slur'],mel2ph=sample['mel2ph'])

[/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in _call_impl(self, *input, **kwargs)
   1192         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194             return forward_call(*input, **kwargs)
   1195         # Do not call functions when jit is used
   1196         full_backward_hooks, non_full_backward_hooks = [], []

[/content/DiffSinger/src/diff/diffusion.py](https://localhost:8080/#) in forward(self, txt_tokens, mel2ph, spk_embed, ref_mels, f0, uv, energy, infer, **kwargs)
    236             conditioning diffusion, use fastspeech2 encoder output as the condition
    237         '''
--> 238         ret = self.fs2(txt_tokens, mel2ph, spk_embed, ref_mels, f0, uv, energy,
    239                        skip_decoder=True, infer=infer, **kwargs)
    240         cond = ret['decoder_inp'].transpose(1, 2)

[/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in _call_impl(self, *input, **kwargs)
   1192         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194             return forward_call(*input, **kwargs)
   1195         # Do not call functions when jit is used
   1196         full_backward_hooks, non_full_backward_hooks = [], []

[/content/DiffSinger/modules/naive_frontend/encoder.py](https://localhost:8080/#) in forward(self, txt_tokens, mel2ph, spk_embed_id, ref_mels, f0, uv, energy, skip_decoder, spk_embed_dur_id, spk_embed_f0_id, infer, is_slur, **kwarg)
     53                 spk_embed_dur_id=None, spk_embed_f0_id=None, infer=False, is_slur=None, **kwarg):
     54         B, T = txt_tokens.shape
---> 55         dur = mel2ph_to_dur(mel2ph, T).float()
     56         dur_embed = self.dur_embed(dur[:, :, None])
     57         encoder_out = self.encoder(txt_tokens, dur_embed)

[/content/DiffSinger/modules/fastspeech/tts_modules.py](https://localhost:8080/#) in mel2ph_to_dur(mel2ph, T_txt, max_dur)
    241 
    242 def mel2ph_to_dur(mel2ph, T_txt, max_dur=None):
--> 243     B, _ = mel2ph.shape
    244     dur = mel2ph.new_zeros(B, T_txt + 1).scatter_add(1, mel2ph, torch.ones_like(mel2ph))
    245     dur = dur[:, 1:]

AttributeError: 'NoneType' object has no attribute 'shape'
SEARCH STACK OVERFLOW
yqzhishen commented 1 year ago

This model must be run with DiffSingerCascadeInfer instead of DiffSingerE2EInfer. In addition, MIDI-less mode models cannot run without explicit phoneme durations and an f0 sequence as inputs. Please refer to main.py and samples/*.ds to infer from a file. Your input format is also deprecated in this forked repository.