Colab RuntimeError: cuDNN error: CUDNN_STATUS_VERSION_MISMATCH

josuebatista commented 10 months ago

I was able to run the notebook in Google Colab with no issues until 2023-09-15. When I attempted to run it again on 2023-09-28, I started getting the error message shown below. I am running Google Colab Pro with a Tesla V100-SXM2-16GB GPU. Thanks in advance for your assistance.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
[<ipython-input-10-8cafa8c83657>](https://localhost:8080/#) in <cell line: 2>()
      1 # Initialize NeMo MSDD diarization model
----> 2 msdd_model = NeuralDiarizer(cfg=create_config(temp_path)).to("cuda")
      3 msdd_model.diarize()
      4 
      5 del msdd_model

12 frames
[/usr/local/lib/python3.10/dist-packages/nemo/collections/asr/models/msdd_models.py](https://localhost:8080/#) in __init__(self, cfg)
    989         )
    990 
--> 991         self._init_msdd_model(cfg)
    992         self.diar_window_length = cfg.diarizer.msdd_model.parameters.diar_window_length
    993         self.msdd_model.cfg = self.transfer_diar_params_to_model_params(self.msdd_model, cfg)

[/usr/local/lib/python3.10/dist-packages/nemo/collections/asr/models/msdd_models.py](https://localhost:8080/#) in _init_msdd_model(self, cfg)
   1091                 logging.warning(f"requested {model_path} model name not available in pretrained models, instead")
   1092             logging.info("Loading pretrained {} model from NGC".format(model_path))
-> 1093             self.msdd_model = EncDecDiarLabelModel.from_pretrained(model_name=model_path, map_location=cfg.device)
   1094         # Load speaker embedding model state_dict which is loaded from the MSDD checkpoint.
   1095         if self.use_speaker_model_from_ckpt:

[/usr/local/lib/python3.10/dist-packages/nemo/core/classes/common.py](https://localhost:8080/#) in from_pretrained(cls, model_name, refresh_cache, override_config_path, map_location, strict, return_config, trainer, save_restore_connector)
    850             )
    851 
--> 852         instance = class_.restore_from(
    853             restore_path=nemo_model_file_in_cache,
    854             override_config_path=override_config_path,

[/usr/local/lib/python3.10/dist-packages/nemo/core/classes/modelPT.py](https://localhost:8080/#) in restore_from(cls, restore_path, override_config_path, map_location, strict, return_config, save_restore_connector, trainer)
    433 
    434         cls.update_save_restore_connector(save_restore_connector)
--> 435         instance = cls._save_restore_connector.restore_from(
    436             cls, restore_path, override_config_path, map_location, strict, return_config, trainer
    437         )

[/usr/local/lib/python3.10/dist-packages/nemo/core/connectors/save_restore_connector.py](https://localhost:8080/#) in restore_from(self, calling_cls, restore_path, override_config_path, map_location, strict, return_config, trainer)
    239         # Get path where the command is executed - the artifacts will be "retrieved" there
    240         # (original .nemo behavior)
--> 241         loaded_params = self.load_config_and_state_dict(
    242             calling_cls, restore_path, override_config_path, map_location, strict, return_config, trainer,
    243         )

[/usr/local/lib/python3.10/dist-packages/nemo/core/connectors/save_restore_connector.py](https://localhost:8080/#) in load_config_and_state_dict(self, calling_cls, restore_path, override_config_path, map_location, strict, return_config, trainer)
    163                 calling_cls._set_model_restore_state(is_being_restored=True, folder=tmpdir)
    164                 instance = calling_cls.from_config_dict(config=conf, trainer=trainer)
--> 165                 instance = instance.to(map_location)
    166                 # add load_state_dict override
    167                 if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1:

[/usr/local/lib/python3.10/dist-packages/lightning_fabric/utilities/device_dtype_mixin.py](https://localhost:8080/#) in to(self, *args, **kwargs)
     52         device, dtype = torch._C._nn._parse_to(*args, **kwargs)[:2]
     53         self.__update_properties(device=device, dtype=dtype)
---> 54         return super().to(*args, **kwargs)
     55 
     56     def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self:  # type: ignore[valid-type]

[/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in to(self, *args, **kwargs)
   1143             return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
   1144 
-> 1145         return self._apply(convert)
   1146 
   1147     def register_full_backward_pre_hook(

[/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in _apply(self, fn)
    795     def _apply(self, fn):
    796         for module in self.children():
--> 797             module._apply(fn)
    798 
    799         def compute_should_use_set_data(tensor, tensor_applied):

[/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in _apply(self, fn)
    795     def _apply(self, fn):
    796         for module in self.children():
--> 797             module._apply(fn)
    798 
    799         def compute_should_use_set_data(tensor, tensor_applied):

[/usr/local/lib/python3.10/dist-packages/torch/nn/modules/rnn.py](https://localhost:8080/#) in _apply(self, fn)
    200         # Note: be v. careful before removing this, as 3rd party device types
    201         # likely rely on this behavior to properly .to() modules like LSTM.
--> 202         self._init_flat_weights()
    203 
    204         return ret

[/usr/local/lib/python3.10/dist-packages/torch/nn/modules/rnn.py](https://localhost:8080/#) in _init_flat_weights(self)
    137         self._flat_weight_refs = [weakref.ref(w) if w is not None else None
    138                                   for w in self._flat_weights]
--> 139         self.flatten_parameters()
    140 
    141     def __setattr__(self, attr, value):

[/usr/local/lib/python3.10/dist-packages/torch/nn/modules/rnn.py](https://localhost:8080/#) in flatten_parameters(self)
    188                     if self.proj_size > 0:
    189                         num_weights += 1
--> 190                     torch._cudnn_rnn_flatten_weight(
    191                         self._flat_weights, num_weights,
    192                         self.input_size, rnn.get_cudnn_mode(self.mode),

RuntimeError: cuDNN error: CUDNN_STATUS_VERSION_MISMATCH

fartoot commented 9 months ago

Same problem!

pablo-fe commented 9 months ago

Same here

mrgalindo commented 9 months ago

same here!

angieKang commented 9 months ago

same problem!

MahmoudAshraf97 commented 9 months ago

The notebook has been updated and the problem is solved.

MahmoudAshraf97 / whisper-diarization

Colab RuntimeError: cuDNN error: CUDNN_STATUS_VERSION_MISMATCH #93