p0p4k / pflowtts_pytorch

Unofficial implementation of NVIDIA P-Flow TTS paper
https://neurips.cc/virtual/2023/poster/69899
MIT License

LJSpeech pretrained ckpt doesn't work #47

Open SAnsAN-9119 opened 4 weeks ago

SAnsAN-9119 commented 4 weeks ago

I am trying to run synthesis.ipynb with the pretrained checkpoint from the GDrive link, but I get:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[28], line 8
      5     return model
      6 count_params = lambda x: f"{sum(p.numel() for p in x.parameters()):,}"
----> 8 model = load_model(PFLOW_CHECKPOINT)
      9 print(f"Model loaded! Parameter count: {count_params(model)}")

Cell In[28], line 3, in load_model(checkpoint_path)
      1 def load_model(checkpoint_path):
      2     print(checkpoint_path)
----> 3     model = pflowTTS.load_from_checkpoint(checkpoint_path, map_location=device)
      4     model.eval()
      5     return model

File ~/anaconda3/envs/pflowtts/lib/python3.10/site-packages/lightning/pytorch/utilities/model_helpers.py:125, in _restricted_classmethod_impl.__get__.<locals>.wrapper(*args, **kwargs)
    120 if instance is not None and not is_scripting:
    121     raise TypeError(
    122         f"The classmethod `{cls.__name__}.{self.method.__name__}` cannot be called on an instance."
    123         " Please call it on the class type and make sure the return value is used."
    124     )
--> 125 return self.method(cls, *args, **kwargs)

File ~/anaconda3/envs/pflowtts/lib/python3.10/site-packages/lightning/pytorch/core/module.py:1582, in LightningModule.load_from_checkpoint(cls, checkpoint_path, map_location, hparams_file, strict, **kwargs)
   1493 @_restricted_classmethod
   1494 def load_from_checkpoint(
   1495     cls,
   (...)
   1500     **kwargs: Any,
   1501 ) -> Self:
   1502     r"""Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint it stores the arguments
   1503     passed to ``__init__``  in the checkpoint under ``"hyper_parameters"``.
   1504 
   (...)
   1580 
   1581     """
-> 1582     loaded = _load_from_checkpoint(
   1583         cls,
   1584         checkpoint_path,
   1585         map_location,
   1586         hparams_file,
   1587         strict,
   1588         **kwargs,
   1589     )
   1590     return cast(Self, loaded)

File ~/anaconda3/envs/pflowtts/lib/python3.10/site-packages/lightning/pytorch/core/saving.py:91, in _load_from_checkpoint(cls, checkpoint_path, map_location, hparams_file, strict, **kwargs)
     89     return _load_state(cls, checkpoint, **kwargs)
     90 if issubclass(cls, pl.LightningModule):
---> 91     model = _load_state(cls, checkpoint, strict=strict, **kwargs)
     92     state_dict = checkpoint["state_dict"]
     93     if not state_dict:

File ~/anaconda3/envs/pflowtts/lib/python3.10/site-packages/lightning/pytorch/core/saving.py:187, in _load_state(cls, checkpoint, strict, **cls_kwargs_new)
    184     obj.on_load_checkpoint(checkpoint)
    186 # load the state_dict on the model automatically
--> 187 keys = obj.load_state_dict(checkpoint["state_dict"], strict=strict)
    189 if not strict:
    190     if keys.missing_keys:

File ~/anaconda3/envs/pflowtts/lib/python3.10/site-packages/torch/nn/modules/module.py:2215, in Module.load_state_dict(self, state_dict, strict, assign)
   2210         error_msgs.insert(
   2211             0, 'Missing key(s) in state_dict: {}. '.format(
   2212                 ', '.join(f'"{k}"' for k in missing_keys)))
   2214 if len(error_msgs) > 0:
-> 2215     raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
   2216                        self.__class__.__name__, "\n\t".join(error_msgs)))
   2217 return _IncompatibleKeys(missing_keys, unexpected_keys)

RuntimeError: Error(s) in loading state_dict for pflowTTS:
    Unexpected key(s) in state_dict: "encoder.speech_prompt_encoder.attn_layers.0.conv_q.weight", "encoder.speech_prompt_encoder.attn_layers.0.conv_q.bias", "encoder.speech_prompt_encoder.attn_layers.0.conv_k.weight", "encoder.speech_prompt_encoder.attn_layers.0.conv_k.bias", "encoder.speech_prompt_encoder.attn_layers.0.conv_v.weight", "encoder.speech_prompt_encoder.attn_layers.0.conv_v.bias", "encoder.speech_prompt_encoder.attn_layers.0.conv_o.weight", "encoder.speech_prompt_encoder.attn_layers.0.conv_o.bias", "encoder.speech_prompt_encoder.attn_layers.1.conv_q.weight", "encoder.speech_prompt_encoder.attn_layers.1.conv_q.bias", "encoder.speech_prompt_encoder.attn_layers.1.conv_k.weight", "encoder.speech_prompt_encoder.attn_layers.1.conv_k.bias", "encoder.speech_prompt_encoder.attn_layers.1.conv_v.weight", "encoder.speech_prompt_encoder.attn_layers.1.conv_v.bias", "encoder.speech_prompt_encoder.attn_layers.1.conv_o.weight", "encoder.speech_prompt_encoder.attn_layers.1.conv_o.bias", "encoder.speech_prompt_encoder.attn_layers.2.conv_q.weight", "encoder.speech_prompt_encoder.attn_layers.2.conv_q.bias", "encoder.speech_prompt_encoder.attn_layers.2.conv_k.weight", "encoder.speech_prompt_encoder.attn_layers.2.conv_k.bias", "encoder.speech_prompt_encoder.attn_layers.2.conv_v.weight", "encoder.speech_prompt_encoder.attn_layers.2.conv_v.bias", "encoder.speech_prompt_encoder.attn_layers.2.conv_o.weight", "encoder.speech_prompt_encoder.attn_layers.2.conv_o.bias", "encoder.speech_prompt_encoder.attn_layers.3.conv_q.weight", "encoder.speech_prompt_encoder.attn_layers.3.conv_q.bias", "encoder.speech_prompt_encoder.attn_layers.3.conv_k.weight", "encoder.speech_prompt_encoder.attn_layers.3.conv_k.bias", "encoder.speech_prompt_encoder.attn_layers.3.conv_v.weight", "encoder.speech_prompt_encoder.attn_layers.3.conv_v.bias", "encoder.speech_prompt_encoder.attn_layers.3.conv_o.weight", "encoder.speech_prompt_encoder.attn_layers.3.conv_o.bias", "encoder.speech_prompt_encoder.attn_layers.4.conv_q.weight", "encoder.speech_prompt_encoder.attn_layers.4.conv_q.bias", "encoder.speech_prompt_encoder.attn_layers.4.conv_k.weight", "encoder.speech_prompt_encoder.attn_layers.4.conv_k.bias", "encoder.speech_prompt_encoder.attn_layers.4.conv_v.weight", "encoder.speech_prompt_encoder.attn_layers.4.conv_v.bias", "encoder.speech_prompt_encoder.attn_layers.4.conv_o.weight", "encoder.speech_prompt_encoder.attn_layers.4.conv_o.bias", "encoder.speech_prompt_encoder.attn_layers.5.conv_q.weight", "encoder.speech_prompt_encoder.attn_layers.5.conv_q.bias", "encoder.speech_prompt_encoder.attn_layers.5.conv_k.weight", "encoder.speech_prompt_encoder.attn_layers.5.conv_k.bias", "encoder.speech_prompt_encoder.attn_layers.5.conv_v.weight", "encoder.speech_prompt_encoder.attn_layers.5.conv_v.bias", "encoder.speech_prompt_encoder.attn_layers.5.conv_o.weight", "encoder.speech_prompt_encoder.attn_layers.5.conv_o.bias", "encoder.speech_prompt_encoder.norm_layers_1.0.gamma", "encoder.speech_prompt_encoder.norm_layers_1.0.beta", "encoder.speech_prompt_encoder.norm_layers_1.1.gamma", "encoder.speech_prompt_encoder.norm_layers_1.1.beta", "encoder.speech_prompt_encoder.norm_layers_1.2.gamma", "encoder.speech_prompt_encoder.norm_layers_1.2.beta", "encoder.speech_prompt_encoder.norm_layers_1.3.gamma", "encoder.speech_prompt_encoder.norm_layers_1.3.beta", "encoder.speech_prompt_encoder.norm_layers_1.4.gamma", "encoder.speech_prompt_encoder.norm_layers_1.4.beta", "encoder.speech_prompt_encoder.norm_layers_1.5.gamma", 
"encoder.speech_prompt_encoder.norm_layers_1.5.beta", "encoder.speech_prompt_encoder.ffn_layers.0.conv_1.weight", "encoder.speech_prompt_encoder.ffn_layers.0.conv_1.bias", "encoder.speech_prompt_encoder.ffn_layers.0.conv_2.weight", "encoder.speech_prompt_encoder.ffn_layers.0.conv_2.bias", "encoder.speech_prompt_encoder.ffn_layers.1.conv_1.weight", "encoder.speech_prompt_encoder.ffn_layers.1.conv_1.bias", "encoder.speech_prompt_encoder.ffn_layers.1.conv_2.weight", "encoder.speech_prompt_encoder.ffn_layers.1.conv_2.bias", "encoder.speech_prompt_encoder.ffn_layers.2.conv_1.weight", "encoder.speech_prompt_encoder.ffn_layers.2.conv_1.bias", "encoder.speech_prompt_encoder.ffn_layers.2.conv_2.weight", "encoder.speech_prompt_encoder.ffn_layers.2.conv_2.bias", "encoder.speech_prompt_encoder.ffn_layers.3.conv_1.weight", "encoder.speech_prompt_encoder.ffn_layers.3.conv_1.bias", "encoder.speech_prompt_encoder.ffn_layers.3.conv_2.weight", "encoder.speech_prompt_encoder.ffn_layers.3.conv_2.bias", "encoder.speech_prompt_encoder.ffn_layers.4.conv_1.weight", "encoder.speech_prompt_encoder.ffn_layers.4.conv_1.bias", "encoder.speech_prompt_encoder.ffn_layers.4.conv_2.weight", "encoder.speech_prompt_encoder.ffn_layers.4.conv_2.bias", "encoder.speech_prompt_encoder.ffn_layers.5.conv_1.weight", "encoder.speech_prompt_encoder.ffn_layers.5.conv_1.bias", "encoder.speech_prompt_encoder.ffn_layers.5.conv_2.weight", "encoder.speech_prompt_encoder.ffn_layers.5.conv_2.bias", "encoder.speech_prompt_encoder.norm_layers_2.0.gamma", "encoder.speech_prompt_encoder.norm_layers_2.0.beta", "encoder.speech_prompt_encoder.norm_layers_2.1.gamma", "encoder.speech_prompt_encoder.norm_layers_2.1.beta", "encoder.speech_prompt_encoder.norm_layers_2.2.gamma", "encoder.speech_prompt_encoder.norm_layers_2.2.beta", "encoder.speech_prompt_encoder.norm_layers_2.3.gamma", "encoder.speech_prompt_encoder.norm_layers_2.3.beta", "encoder.speech_prompt_encoder.norm_layers_2.4.gamma", "encoder.speech_prompt_encoder.norm_layers_2.4.beta", "encoder.speech_prompt_encoder.norm_layers_2.5.gamma", "encoder.speech_prompt_encoder.norm_layers_2.5.beta", "encoder.decoder.self_attn_layers.0.conv_q.weight", "encoder.decoder.self_attn_layers.0.conv_q.bias", "encoder.decoder.self_attn_layers.0.conv_k.weight", "encoder.decoder.self_attn_layers.0.conv_k.bias", "encoder.decoder.self_attn_layers.0.conv_v.weight", "encoder.decoder.self_attn_layers.0.conv_v.bias", "encoder.decoder.self_attn_layers.0.conv_o.weight", "encoder.decoder.self_attn_layers.0.conv_o.bias", "encoder.decoder.self_attn_layers.1.conv_q.weight", "encoder.decoder.self_attn_layers.1.conv_q.bias", "encoder.decoder.self_attn_layers.1.conv_k.weight", "encoder.decoder.self_attn_layers.1.conv_k.bias", "encoder.decoder.self_attn_layers.1.conv_v.weight", "encoder.decoder.self_attn_layers.1.conv_v.bias", "encoder.decoder.self_attn_layers.1.conv_o.weight", "encoder.decoder.self_attn_layers.1.conv_o.bias", "encoder.decoder.self_attn_layers.2.conv_q.weight", "encoder.decoder.self_attn_layers.2.conv_q.bias", "encoder.decoder.self_attn_layers.2.conv_k.weight", "encoder.decoder.self_attn_layers.2.conv_k.bias", "encoder.decoder.self_attn_layers.2.conv_v.weight", "encoder.decoder.self_attn_layers.2.conv_v.bias", "encoder.decoder.self_attn_layers.2.conv_o.weight", "encoder.decoder.self_attn_layers.2.conv_o.bias", "encoder.decoder.self_attn_layers.3.conv_q.weight", "encoder.decoder.self_attn_layers.3.conv_q.bias", "encoder.decoder.self_attn_layers.3.conv_k.weight", "encoder.decoder.self_attn_layers.3.conv_k.bias", 
"encoder.decoder.self_attn_layers.3.conv_v.weight", "encoder.decoder.self_attn_layers.3.conv_v.bias", "encoder.decoder.self_attn_layers.3.conv_o.weight", "encoder.decoder.self_attn_layers.3.conv_o.bias", "encoder.decoder.self_attn_layers.4.conv_q.weight", "encoder.decoder.self_attn_layers.4.conv_q.bias", "encoder.decoder.self_attn_layers.4.conv_k.weight", "encoder.decoder.self_attn_layers.4.conv_k.bias", "encoder.decoder.self_attn_layers.4.conv_v.weight", "encoder.decoder.self_attn_layers.4.conv_v.bias", "encoder.decoder.self_attn_layers.4.conv_o.weight", "encoder.decoder.self_attn_layers.4.conv_o.bias", "encoder.decoder.self_attn_layers.5.conv_q.weight", "encoder.decoder.self_attn_layers.5.conv_q.bias", "encoder.decoder.self_attn_layers.5.conv_k.weight", "encoder.decoder.self_attn_layers.5.conv_k.bias", "encoder.decoder.self_attn_layers.5.conv_v.weight", "encoder.decoder.self_attn_layers.5.conv_v.bias", "encoder.decoder.self_attn_layers.5.conv_o.weight", "encoder.decoder.self_attn_layers.5.conv_o.bias", "encoder.decoder.norm_layers_0.0.gamma", "encoder.decoder.norm_layers_0.0.beta", "encoder.decoder.norm_layers_0.1.gamma", "encoder.decoder.norm_layers_0.1.beta", "encoder.decoder.norm_layers_0.2.gamma", "encoder.decoder.norm_layers_0.2.beta", "encoder.decoder.norm_layers_0.3.gamma", "encoder.decoder.norm_layers_0.3.beta", "encoder.decoder.norm_layers_0.4.gamma", "encoder.decoder.norm_layers_0.4.beta", "encoder.decoder.norm_layers_0.5.gamma", "encoder.decoder.norm_layers_0.5.beta", "encoder.decoder.encdec_attn_layers.0.conv_q.weight", "encoder.decoder.encdec_attn_layers.0.conv_q.bias", "encoder.decoder.encdec_attn_layers.0.conv_k.weight", "encoder.decoder.encdec_attn_layers.0.conv_k.bias", "encoder.decoder.encdec_attn_layers.0.conv_v.weight", "encoder.decoder.encdec_attn_layers.0.conv_v.bias", "encoder.decoder.encdec_attn_layers.0.conv_o.weight", "encoder.decoder.encdec_attn_layers.0.conv_o.bias", "encoder.decoder.encdec_attn_layers.1.conv_q.weight", "encoder.decoder.encdec_attn_layers.1.conv_q.bias", "encoder.decoder.encdec_attn_layers.1.conv_k.weight", "encoder.decoder.encdec_attn_layers.1.conv_k.bias", "encoder.decoder.encdec_attn_layers.1.conv_v.weight", "encoder.decoder.encdec_attn_layers.1.conv_v.bias", "encoder.decoder.encdec_attn_layers.1.conv_o.weight", "encoder.decoder.encdec_attn_layers.1.conv_o.bias", "encoder.decoder.encdec_attn_layers.2.conv_q.weight", "encoder.decoder.encdec_attn_layers.2.conv_q.bias", "encoder.decoder.encdec_attn_layers.2.conv_k.weight", "encoder.decoder.encdec_attn_layers.2.conv_k.bias", "encoder.decoder.encdec_attn_layers.2.conv_v.weight", "encoder.decoder.encdec_attn_layers.2.conv_v.bias", "encoder.decoder.encdec_attn_layers.2.conv_o.weight", "encoder.decoder.encdec_attn_layers.2.conv_o.bias", "encoder.decoder.encdec_attn_layers.3.conv_q.weight", "encoder.decoder.encdec_attn_layers.3.conv_q.bias", "encoder.decoder.encdec_attn_layers.3.conv_k.weight", "encoder.decoder.encdec_attn_layers.3.conv_k.bias", "encoder.decoder.encdec_attn_layers.3.conv_v.weight", "encoder.decoder.encdec_attn_layers.3.conv_v.bias", "encoder.decoder.encdec_attn_layers.3.conv_o.weight", "encoder.decoder.encdec_attn_layers.3.conv_o.bias", "encoder.decoder.encdec_attn_layers.4.conv_q.weight", "encoder.decoder.encdec_attn_layers.4.conv_q.bias", "encoder.decoder.encdec_attn_layers.4.conv_k.weight", "encoder.decoder.encdec_attn_layers.4.conv_k.bias", "encoder.decoder.encdec_attn_layers.4.conv_v.weight", "encoder.decoder.encdec_attn_layers.4.conv_v.bias", 
"encoder.decoder.encdec_attn_layers.4.conv_o.weight", "encoder.decoder.encdec_attn_layers.4.conv_o.bias", "encoder.decoder.encdec_attn_layers.5.conv_q.weight", "encoder.decoder.encdec_attn_layers.5.conv_q.bias", "encoder.decoder.encdec_attn_layers.5.conv_k.weight", "encoder.decoder.encdec_attn_layers.5.conv_k.bias", "encoder.decoder.encdec_attn_layers.5.conv_v.weight", "encoder.decoder.encdec_attn_layers.5.conv_v.bias", "encoder.decoder.encdec_attn_layers.5.conv_o.weight", "encoder.decoder.encdec_attn_layers.5.conv_o.bias", "encoder.decoder.norm_layers_1.0.gamma", "encoder.decoder.norm_layers_1.0.beta", "encoder.decoder.norm_layers_1.1.gamma", "encoder.decoder.norm_layers_1.1.beta", "encoder.decoder.norm_layers_1.2.gamma", "encoder.decoder.norm_layers_1.2.beta", "encoder.decoder.norm_layers_1.3.gamma", "encoder.decoder.norm_layers_1.3.beta", "encoder.decoder.norm_layers_1.4.gamma", "encoder.decoder.norm_layers_1.4.beta", "encoder.decoder.norm_layers_1.5.gamma", "encoder.decoder.norm_layers_1.5.beta", "encoder.decoder.ffn_layers.0.conv_1.weight", "encoder.decoder.ffn_layers.0.conv_1.bias", "encoder.decoder.ffn_layers.0.conv_2.weight", "encoder.decoder.ffn_layers.0.conv_2.bias", "encoder.decoder.ffn_layers.1.conv_1.weight", "encoder.decoder.ffn_layers.1.conv_1.bias", "encoder.decoder.ffn_layers.1.conv_2.weight", "encoder.decoder.ffn_layers.1.conv_2.bias", "encoder.decoder.ffn_layers.2.conv_1.weight", "encoder.decoder.ffn_layers.2.conv_1.bias", "encoder.decoder.ffn_layers.2.conv_2.weight", "encoder.decoder.ffn_layers.2.conv_2.bias", "encoder.decoder.ffn_layers.3.conv_1.weight", "encoder.decoder.ffn_layers.3.conv_1.bias", "encoder.decoder.ffn_layers.3.conv_2.weight", "encoder.decoder.ffn_layers.3.conv_2.bias", "encoder.decoder.ffn_layers.4.conv_1.weight", "encoder.decoder.ffn_layers.4.conv_1.bias", "encoder.decoder.ffn_layers.4.conv_2.weight", "encoder.decoder.ffn_layers.4.conv_2.bias", "encoder.decoder.ffn_layers.5.conv_1.weight", "encoder.decoder.ffn_layers.5.conv_1.bias", "encoder.decoder.ffn_layers.5.conv_2.weight", "encoder.decoder.ffn_layers.5.conv_2.bias", "encoder.decoder.norm_layers_2.0.gamma", "encoder.decoder.norm_layers_2.0.beta", "encoder.decoder.norm_layers_2.1.gamma", "encoder.decoder.norm_layers_2.1.beta", "encoder.decoder.norm_layers_2.2.gamma", "encoder.decoder.norm_layers_2.2.beta", "encoder.decoder.norm_layers_2.3.gamma", "encoder.decoder.norm_layers_2.3.beta", "encoder.decoder.norm_layers_2.4.gamma", "encoder.decoder.norm_layers_2.4.beta", "encoder.decoder.norm_layers_2.5.gamma", "encoder.decoder.norm_layers_2.5.beta".
patriotyk commented 3 weeks ago

It is because the model has been changed since that checkpoint was trained. You need to run the pretrained model on an older commit. I'm not sure, but I think this one should work: b2f7d130470bce6a85ea1f4e2cb454cdc8ae9f55 (e.g. `git checkout b2f7d130470bce6a85ea1f4e2cb454cdc8ae9f55`).
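
If you only want to verify the overlap on the current code, Lightning's `strict=False` loads whatever keys match and warns about the rest. Note this is a diagnostic, not a workaround: the mismatched modules keep their random initialization, so the model won't synthesize properly. A minimal sketch, assuming the stored hyperparameters still instantiate the current class:

```python
# strict=False skips the mismatched keys and reports them as warnings;
# the affected encoder/decoder weights stay randomly initialized.
model = pflowTTS.load_from_checkpoint(
    PFLOW_CHECKPOINT, map_location="cpu", strict=False
)
```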

SAnsAN-9119 commented 3 weeks ago

Yes, thanks, I was able to run synthesis this way. Does this mean there have been significant changes to the model architecture in the latest versions of this repository? Are there newer pretrained models compatible with the current version of the repository?

patriotyk commented 3 weeks ago

No, there aren't any pretrained checkpoints for the new changes.