NVIDIA / NeMo

A scalable generative AI framework built for researchers and developers working on Large Language Models, Multimodal, and Speech AI (Automatic Speech Recognition and Text-to-Speech)
https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html
Apache License 2.0

Error when training ASR with Transducers on GPU #4863

Closed (stalevna closed this issue 2 years ago)

stalevna commented 2 years ago


When training ASR with Transducers and leaving

freq_masks: 2
time_masks: 10

in place instead of zeroing them out, the following error occurs:

TypeError Traceback (most recent call last)
1 # Train the model
----> 2 trainer.fit(model)

42 frames

/usr/local/lib/python3.7/dist-packages/numba/cuda/dispatcher.py in __getitem__(self, args)
567 if len(args) not in [2, 3, 4]:
568 raise ValueError('must specify at least the griddim and blockdim')
--> 569 return self.configure(*args)
570
571 def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):

TypeError: unhashable type: 'list'

It happens both in Google Colab and when training on a local machine. To reproduce it, I skipped

config.model.spec_augment.freq_masks = 0
config.model.spec_augment.time_masks = 0

in the Colab tutorial ASR_with_Transducers.

If I zero out spec_augment, everything works. Could you check whether this is a bug or something I am doing wrong?
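For completeness, the full change relative to the notebook is just the two overrides below. This is a minimal sketch, not a self-contained script: config, model, and trainer are the objects the ASR_with_Transducers notebook already builds.

# Repro sketch: everything else follows the ASR_with_Transducers notebook as-is.
# The notebook normally zeroes these out; keeping the defaults triggers the error.
config.model.spec_augment.freq_masks = 2
config.model.spec_augment.time_masks = 10

# model and trainer are constructed exactly as in the notebook
trainer.fit(model)  # fails with TypeError: unhashable type: 'list' during the SpecAugment kernel launch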

titu1994 commented 2 years ago

Can you expand the entire stack frame and paste the entire error here?

It's a Numba error when calculating the loss; it probably has little to do with spec augment.

stalevna commented 2 years ago

Can you expand the entire stack frame and paste the entire error here?

It's a Numba error when calculating the loss; it probably has little to do with spec augment.

TypeError Traceback (most recent call last)
1 # Train the model
----> 2 trainer.fit(model)

42 frames

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path) 769 self.strategy.model = model 770 self._call_and_handle_interrupt( --> 771 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path 772 ) 773

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, kwargs) 721 return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, *kwargs) 722 else: --> 723 return trainer_fn(args, kwargs) 724 # TODO: treat KeyboardInterrupt as BaseException (delete the code below) in v1.7 725 except KeyboardInterrupt as exception:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path) 809 ckpt_path, model_provided=True, model_connected=self.lightning_module is not None 810 ) --> 811 results = self._run(model, ckpt_path=self.ckpt_path) 812 813 assert self.state.stopped

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path) 1234 self._checkpoint_connector.resume_end() 1235 -> 1236 results = self._run_stage() 1237 1238 log.detail(f"{self.class.name}: trainer tearing down")

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run_stage(self) 1321 if self.predicting: 1322 return self._run_predict() -> 1323 return self._run_train() 1324 1325 def _pre_training_routine(self):

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run_train(self) 1351 self.fit_loop.trainer = self 1352 with torch.autograd.set_detect_anomaly(self._detect_anomaly): -> 1353 self.fit_loop.run() 1354 1355 def _run_evaluate(self) -> _EVALUATE_OUTPUT:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, kwargs) 202 try: 203 self.on_advance_start(*args, *kwargs) --> 204 self.advance(args, kwargs) 205 self.on_advance_end() 206 self._restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/fit_loop.py in advance(self) 264 ) 265 with self.trainer.profiler.profile("run_training_epoch"): --> 266 self._outputs = self.epoch_loop.run(self._data_fetcher) 267 268 def on_advance_end(self) -> None:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, kwargs) 202 try: 203 self.on_advance_start(*args, *kwargs) --> 204 self.advance(args, kwargs) 205 self.on_advance_end() 206 self._restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py in advance(self, data_fetcher) 206 207 with self.trainer.profiler.profile("run_training_batch"): --> 208 batch_output = self.batch_loop.run(batch, batch_idx) 209 210 self.batch_progress.increment_processed()

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, kwargs) 202 try: 203 self.on_advance_start(*args, *kwargs) --> 204 self.advance(args, kwargs) 205 self.on_advance_end() 206 self._restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/batch/training_batch_loop.py in advance(self, batch, batch_idx) 86 if self.trainer.lightning_module.automatic_optimization: 87 optimizers = _get_active_optimizers(self.trainer.optimizers, self.trainer.optimizer_frequencies, batch_idx) ---> 88 outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx) 89 else: 90 outputs = self.manual_loop.run(split_batch, batch_idx)

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, kwargs) 202 try: 203 self.on_advance_start(*args, *kwargs) --> 204 self.advance(args, kwargs) 205 self.on_advance_end() 206 self._restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in advance(self, batch, *args, **kwargs) 205 self._batch_idx, 206 self._optimizers[self.optim_progress.optimizer_position], --> 207 self.optimizer_idx, 208 ) 209 if result.loss is not None:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in _run_optimization(self, split_batch, batch_idx, optimizer, opt_idx) 254 # gradient update with accumulated gradients 255 else: --> 256 self._optimizer_step(optimizer, opt_idx, batch_idx, closure) 257 258 result = closure.consume_result()

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in _optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure) 376 on_tpu=isinstance(self.trainer.accelerator, TPUAccelerator), 377 using_native_amp=(self.trainer.amp_backend == AMPType.NATIVE), --> 378 using_lbfgs=is_lbfgs, 379 ) 380

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_lightning_module_hook(self, hook_name, pl_module, *args, *kwargs) 1593 1594 with self.profiler.profile(f"[LightningModule]{pl_module.class.name}.{hook_name}"): -> 1595 output = fn(args, **kwargs) 1596 1597 # restore current_fx when nested context

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/lightning.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs) 1644 1645 """ -> 1646 optimizer.step(closure=optimizer_closure) 1647 1648 def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/optimizer.py in step(self, closure, kwargs) 166 167 assert self._strategy is not None --> 168 step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, kwargs) 169 170 self._on_after_step()

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/strategies/strategy.py in optimizer_step(self, optimizer, opt_idx, closure, model, kwargs) 191 """ 192 model = model or self.lightning_module --> 193 return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, kwargs) 194 195 def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/precision/precision_plugin.py in optimizer_step(self, model, optimizer, optimizer_idx, closure, kwargs) 153 if isinstance(model, pl.LightningModule): 154 closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure) --> 155 return optimizer.step(closure=closure, kwargs) 156 157 def _track_grad_norm(self, trainer: "pl.Trainer") -> None:

/usr/local/lib/python3.7/dist-packages/torch/optim/lr_scheduler.py in wrapper(*args, *kwargs) 63 instance._step_count += 1 64 wrapped = func.get(instance, cls) ---> 65 return wrapped(args, **kwargs) 66 67 # Note that the returned function here is no longer a bound method,

/usr/local/lib/python3.7/dist-packages/torch/optim/optimizer.py in wrapper(*args, *kwargs) 111 profile_name = "Optimizer.step#{}.step".format(obj.class.name) 112 with torch.autograd.profiler.record_function(profile_name): --> 113 return func(args, **kwargs) 114 return wrapper 115

/usr/local/lib/python3.7/dist-packages/nemo/core/optim/novograd.py in step(self, closure) 81 loss = None 82 if closure is not None: ---> 83 loss = closure() 84 85 for group in self.param_groups:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/precision/precision_plugin.py in _wrap_closure(self, model, optimizer, optimizer_idx, closure) 138 consistent with the PrecisionPlugin subclasses that cannot pass optimizer.step(closure) directly. 139 """ --> 140 closure_result = closure() 141 self._after_closure(model, optimizer, optimizer_idx) 142 return closure_result

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in call(self, *args, kwargs) 146 147 def call(self, *args: Any, *kwargs: Any) -> Optional[Tensor]: --> 148 self._result = self.closure(args, kwargs) 149 return self._result.loss 150

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in closure(self, *args, *kwargs) 132 133 def closure(self, args: Any, **kwargs: Any) -> ClosureResult: --> 134 step_output = self._step_fn() 135 136 if step_output.closure_loss is None:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in _training_step(self, split_batch, batch_idx, opt_idx) 425 426 # manually capture logged metrics --> 427 training_step_output = self.trainer._call_strategy_hook("training_step", *step_kwargs.values()) 428 self.trainer.strategy.post_training_step() 429

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_strategy_hook(self, hook_name, *args, *kwargs) 1763 1764 with self.profiler.profile(f"[Strategy]{self.strategy.class.name}.{hook_name}"): -> 1765 output = fn(args, **kwargs) 1766 1767 # restore current_fx when nested context

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/strategies/strategy.py in training_step(self, *args, *kwargs) 331 """ 332 with self.precision_plugin.train_step_context(): --> 333 return self.model.training_step(args, **kwargs) 334 335 def post_training_step(self):

/usr/local/lib/python3.7/dist-packages/nemo/utils/model_utils.py in wrap_training_step(wrapped, instance, args, kwargs) 362 @wrapt.decorator 363 def wrap_training_step(wrapped, instance: 'pl.LightningModule', args, kwargs): --> 364 output_dict = wrapped(*args, **kwargs) 365 366 if isinstance(output_dict, dict) and output_dict is not None and 'log' in output_dict:

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/models/rnnt_models.py in training_step(self, batch, batch_nb) 672 encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len) 673 else: --> 674 encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) 675 del signal 676

/usr/local/lib/python3.7/dist-packages/nemo/core/classes/common.py in call(self, wrapped, instance, args, kwargs) 1082 1083 # Call the method - this can be forward, or any other callable method -> 1084 outputs = wrapped(*args, **kwargs) 1085 1086 instance._attach_and_validate_output_types(

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/models/rnnt_models.py in forward(self, input_signal, input_signal_length, processed_signal, processed_signal_length) 655 # Spec augment is not applied during evaluation/testing 656 if self.spec_augmentation is not None and self.training: --> 657 processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) 658 659 encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length)

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, *kwargs) 1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks 1129 or _global_forward_hooks or _global_forward_pre_hooks): -> 1130 return forward_call(input, **kwargs) 1131 # Do not call functions when jit is used 1132 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/nemo/core/classes/common.py in call(self, wrapped, instance, args, kwargs) 1082 1083 # Call the method - this can be forward, or any other callable method -> 1084 outputs = wrapped(*args, **kwargs) 1085 1086 instance._attach_and_validate_output_types(

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/modules/audio_preprocessing.py in forward(self, input_spec, length) 508 # tensor must be on GPU and length must be provided 509 if self.spec_augment_numba is not None and spec_augment_launch_heuristics(augmented_spec, length): --> 510 augmented_spec = self.spec_augment_numba(input_spec=augmented_spec, length=length) 511 else: 512 augmented_spec = self.spec_augment(input_spec=augmented_spec, length=length)

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, *kwargs) 1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks 1129 or _global_forward_hooks or _global_forward_pre_hooks): -> 1130 return forward_call(input, **kwargs) 1131 # Do not call functions when jit is used 1132 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/nemo/core/classes/common.py in call(self, wrapped, instance, args, kwargs) 1082 1083 # Call the method - this can be forward, or any other callable method -> 1084 outputs = wrapped(*args, **kwargs) 1085 1086 instance._attach_and_validate_output_types(

/usr/local/lib/python3.7/dist-packages/torch/autograd/grad_mode.py in decorate_context(*args, kwargs) 25 def decorate_context(*args, *kwargs): 26 with self.clone(): ---> 27 return func(args, kwargs) 28 return cast(F, decorate_context) 29

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py in forward(self, input_spec, length)
300 freq_masks=self.freq_masks,
301 time_masks=self.time_masks,
--> 302 mask_value=self.mask_value,
303 )
304

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py in launch_spec_augment_kernel(x, x_len, freq_starts, freq_lengths, time_starts, time_lengths, freq_masks, time_masks, mask_value)
167
168 # Launch CUDA kernel
--> 169 spec_augment_kernel[blocks_per_grid, threads_per_block, stream, 0](
170 x, x_len, freq_starts, freq_lengths, time_starts, time_lengths, mask_value
171 )

/usr/local/lib/python3.7/dist-packages/numba/cuda/dispatcher.py in __getitem__(self, args)
567 if len(args) not in [2, 3, 4]:
568 raise ValueError('must specify at least the griddim and blockdim')
--> 569 return self.configure(*args)
570
571 def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):

TypeError: unhashable type: 'list'

titu1994 commented 2 years ago

Can you print out the system details: PyTorch, NeMo, and Numba versions?

Seems to be the Numba kernel call for spec augment, but there shouldn't be a reason for it to crash like this.
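Something like the cell below would print everything relevant in one place (a minimal sketch; it assumes each package exposes __version__, which recent torch, pytorch_lightning, nemo, and numba releases do):

# Collect the versions relevant to this issue in a single cell.
import torch
import numba
import pytorch_lightning as pl
import nemo

print("PyTorch          :", torch.__version__)
print("PyTorch Lightning:", pl.__version__)
print("NeMo             :", nemo.__version__)
print("Numba            :", numba.__version__)
print("CUDA available   :", torch.cuda.is_available())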


/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py in forward(self, input_spec, length)
300 freq_masks=self.freq_masks,
301 time_masks=self.time_masks,
--> 302 mask_value=self.mask_value,
303 )
304

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py in launch_spec_augment_kernel(x, x_len, freq_starts, freq_lengths, time_starts, time_lengths, freq_masks, time_masks, mask_value)
167
168 # Launch CUDA kernel
--> 169 spec_augment_kernel[blocks_per_grid, threads_per_block, stream, 0](
170 x, x_len, freq_starts, freq_lengths, time_starts, time_lengths, mask_value
171 )
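For context, the failing line uses Numba's standard CUDA launch-configuration syntax, kernel[griddim, blockdim, stream, sharedmem](...), and an "unhashable type: 'list'" TypeError there means something in that configuration ended up being hashed as a Python list inside configure(). Below is a minimal, self-contained illustration of the same launch syntax with a hypothetical toy kernel; it is not the NeMo kernel, and it needs a CUDA-capable GPU with a CUDA-enabled Numba install:

import numpy as np
from numba import cuda

@cuda.jit
def scale_kernel(x, factor):
    # One thread per element; guard against the trailing partial block.
    i = cuda.grid(1)
    if i < x.shape[0]:
        x[i] *= factor

x = cuda.to_device(np.arange(8, dtype=np.float32))
threads_per_block = 4
blocks_per_grid = (x.shape[0] + threads_per_block - 1) // threads_per_block
stream = cuda.default_stream()

# Same four-element configuration as the failing call in the traceback:
# kernel[blocks_per_grid, threads_per_block, stream, dynamic_shared_memory_bytes](...)
scale_kernel[blocks_per_grid, threads_per_block, stream, 0](x, 2.0)
print(x.copy_to_host())
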
stalevna commented 2 years ago

Can you print out the system details: PyTorch, NeMo, and Numba versions?

Seems to be the Numba kernel call for spec augment, but there shouldn't be a reason for it to crash like this.


/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py in forward(self, input_spec, length)
300 freq_masks=self.freq_masks,
301 time_masks=self.time_masks,
--> 302 mask_value=self.mask_value,
303 )
304

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py in launch_spec_augment_kernel(x, x_len, freq_starts, freq_lengths, time_starts, time_lengths, freq_masks, time_masks, mask_value)
167
168 # Launch CUDA kernel
--> 169 spec_augment_kernel[blocks_per_grid, threads_per_block, stream, 0](
170 x, x_len, freq_starts, freq_lengths, time_starts, time_lengths, mask_value
171 )

This is the pip freeze I ran in Google Colab:

absl-py==1.2.0 aeppl==0.0.33 aesara==2.7.9 aiohttp==3.8.1 aiosignal==1.2.0 alabaster==0.7.12 albumentations==1.2.1 altair==4.2.0 appdirs==1.4.4 arviz==0.12.1 astor==0.8.1 astropy==4.3.1 astunparse==1.6.3 async-timeout==4.0.2 asynctest==0.13.0 atari-py==0.2.9 atomicwrites==1.4.1 attrs==22.1.0 audioread==3.0.0 autograd==1.4 Babel==2.10.3 backcall==0.2.0 beautifulsoup4==4.6.3 bleach==5.0.1 blis==0.7.8 bokeh==2.3.3 branca==0.5.0 bs4==0.0.1 CacheControl==0.12.11 cached-property==1.5.2 cachetools==4.2.4 catalogue==2.0.8 certifi==2022.6.15 cffi==1.15.1 cftime==1.6.1 chardet==3.0.4 charset-normalizer==2.1.1 click==7.1.2 clikit==0.6.2 cloudpickle==1.5.0 cmake==3.22.6 cmdstanpy==1.0.7 colorcet==3.0.0 colorlover==0.3.0 community==1.0.0b1 cons==0.4.5 contextlib2==0.5.5 convertdate==2.4.0 crashtest==0.3.1 crcmod==1.7 cufflinks==0.17.3 cupy-cuda111==9.4.0 cvxopt==1.3.0 cvxpy==1.2.1 cycler==0.11.0 cymem==2.0.6 Cython==0.29.32 daft==0.0.4 dask==2022.2.0 datascience==0.17.5 debugpy==1.0.0 decorator==4.4.2 defusedxml==0.7.1 descartes==1.1.0 dill==0.3.5.1 distributed==2022.2.0 dlib==19.24.0 dm-tree==0.1.7 docutils==0.17.1 dopamine-rl==1.0.5 earthengine-api==0.1.321 easydict==1.9 ecos==2.0.10 editdistance==0.5.3 en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl entrypoints==0.4 ephem==4.1.3 et-xmlfile==1.1.0 etils==0.7.1 etuples==0.3.5 fa2==0.3.5 fastai==2.7.9 fastcore==1.5.22 fastdownload==0.0.7 fastdtw==0.3.4 fastjsonschema==2.16.1 fastprogress==1.0.3 fastrlock==0.8 feather-format==0.4.1 filelock==3.8.0 firebase-admin==4.4.0 fix-yahoo-finance==0.0.22 Flask==1.1.4 flatbuffers==2.0.7 folium==0.12.1.post1 frozenlist==1.3.1 fsspec==2022.7.1 future==0.16.0 gast==0.5.3 GDAL==2.2.2 gdown==4.4.0 gensim==3.6.0 geographiclib==1.52 geopy==1.17.0 gin-config==0.5.0 glob2==0.7 google==2.0.3 google-api-core==1.31.6 google-api-python-client==1.12.11 google-auth==1.35.0 google-auth-httplib2==0.0.4 google-auth-oauthlib==0.4.6 google-cloud-bigquery==1.21.0 google-cloud-bigquery-storage==1.1.2 google-cloud-core==1.0.3 google-cloud-datastore==1.8.0 google-cloud-firestore==1.7.0 google-cloud-language==1.2.0 google-cloud-storage==1.18.1 google-cloud-translate==1.5.0 google-colab @ file:///colabtools/dist/google-colab-1.0.0.tar.gz google-pasta==0.2.0 google-resumable-media==0.4.1 googleapis-common-protos==1.56.4 googledrivedownloader==0.4 graphviz==0.10.1 greenlet==1.1.3 grpcio==1.47.0 gspread==3.4.2 gspread-dataframe==3.0.8 gym==0.25.2 gym-notices==0.0.8 h5py==3.1.0 HeapDict==1.0.1 hijri-converter==2.2.4 holidays==0.15 holoviews==1.14.9 html5lib==1.0.1 httpimport==0.5.18 httplib2==0.17.4 httplib2shim==0.0.3 httpstan==4.6.1 humanize==0.5.1 hyperopt==0.1.2 idna==2.10 imageio==2.9.0 imagesize==1.4.1 imbalanced-learn==0.8.1 imblearn==0.0 imgaug==0.4.0 importlib-metadata==4.12.0 importlib-resources==5.9.0 imutils==0.5.4 inflect==2.1.0 intel-openmp==2022.1.0 intervaltree==2.1.0 ipykernel==5.3.4 ipython==7.9.0 ipython-genutils==0.2.0 ipython-sql==0.3.9 ipywidgets==7.7.1 itsdangerous==1.1.0 jax==0.3.14 jaxlib @ https://storage.googleapis.com/jax-releases/cuda11/jaxlib-0.3.14+cuda11.cudnn805-cp37-none-manylinux2014_x86_64.whl jieba==0.42.1 Jinja2==2.11.3 joblib==1.1.0 jpeg4py==0.1.4 jsonschema==4.3.3 jupyter-client==6.1.12 jupyter-console==6.1.0 jupyter-core==4.11.1 jupyterlab-widgets==3.0.2 kaggle==1.5.12 kapre==0.3.7 keras==2.8.0 Keras-Preprocessing==1.1.2 keras-vis==0.4.1 kiwisolver==1.4.4 korean-lunar-calendar==0.2.1 langcodes==3.3.0 libclang==14.0.6 
librosa==0.8.1 lightgbm==2.2.3 llvmlite==0.39.0 lmdb==0.99 locket==1.0.0 logical-unification==0.4.5 LunarCalendar==0.0.9 lxml==4.9.1 Markdown==3.4.1 MarkupSafe==2.0.1 marshmallow==3.17.1 matplotlib==3.2.2 matplotlib-venn==0.11.7 miniKanren==1.0.3 missingno==0.5.1 mistune==0.8.4 mizani==0.7.3 mkl==2019.0 mlxtend==0.14.0 more-itertools==8.14.0 moviepy==0.2.3.5 mpmath==1.2.1 msgpack==1.0.4 multidict==6.0.2 multipledispatch==0.6.0 multitasking==0.0.11 murmurhash==1.0.8 music21==5.5.0 natsort==5.5.0 nbconvert==5.6.1 nbformat==5.4.0 netCDF4==1.6.0 networkx==2.6.3 nibabel==3.0.2 nltk==3.7 notebook==5.3.1 numba==0.56.0 numexpr==2.8.3 numpy==1.21.6 oauth2client==4.1.3 oauthlib==3.2.0 okgrade==0.4.3 opencv-contrib-python==4.6.0.66 opencv-python==4.6.0.66 opencv-python-headless==4.6.0.66 openpyxl==3.0.10 opt-einsum==3.3.0 osqp==0.6.2.post0 packaging==21.3 palettable==3.3.0 pandas==1.3.5 pandas-datareader==0.9.0 pandas-gbq==0.13.3 pandas-profiling==1.4.1 pandocfilters==1.5.0 panel==0.12.1 param==1.12.2 parso==0.8.3 partd==1.3.0 pastel==0.2.1 pathlib==1.0.1 pathy==0.6.2 patsy==0.5.2 pep517==0.13.0 pexpect==4.8.0 pickleshare==0.7.5 Pillow==7.1.2 pip-tools==6.2.0 plotly==5.5.0 plotnine==0.8.0 pluggy==0.7.1 pooch==1.6.0 portpicker==1.3.9 prefetch-generator==1.0.1 preshed==3.0.7 prettytable==3.4.0 progressbar2==3.38.0 promise==2.3 prompt-toolkit==2.0.10 prophet==1.1 protobuf==3.17.3 psutil==5.4.8 psycopg2==2.9.3 ptyprocess==0.7.0 py==1.11.0 pyarrow==6.0.1 pyasn1==0.4.8 pyasn1-modules==0.2.8 pycocotools==2.0.4 pycparser==2.21 pyct==0.4.8 pydantic==1.9.2 pydata-google-auth==1.4.0 pydot==1.3.0 pydot-ng==2.0.0 pydotplus==2.0.2 PyDrive==1.3.1 pyemd==0.5.1 pyerfa==2.0.0.1 Pygments==2.6.1 pygobject==3.26.1 pylev==1.4.0 pymc==4.1.4 PyMeeus==0.5.11 pymongo==4.2.0 pymystem3==0.2.0 PyOpenGL==3.1.6 pyparsing==3.0.9 pyrsistent==0.18.1 pysimdjson==3.2.0 pysndfile==1.3.8 PySocks==1.7.1 pystan==3.3.0 pytest==3.6.4 python-apt==0.0.0 python-chess==0.23.11 python-dateutil==2.8.2 python-louvain==0.16 python-slugify==6.1.2 python-utils==3.3.3 pytz==2022.2.1 pyviz-comms==2.2.1 PyWavelets==1.3.0 PyYAML==6.0 pyzmq==23.2.1 qdldl==0.1.5.post2 qudida==0.0.4 regex==2022.6.2 requests==2.23.0 requests-oauthlib==1.3.1 resampy==0.4.0 rpy2==3.4.5 rsa==4.9 scikit-image==0.18.3 scikit-learn==1.0.2 scipy==1.7.3 screen-resolution-extra==0.0.0 scs==3.2.0 seaborn==0.11.2 Send2Trash==1.8.0 setuptools-git==1.2 Shapely==1.8.4 six==1.15.0 sklearn-pandas==1.8.0 smart-open==5.2.1 snowballstemmer==2.2.0 sortedcontainers==2.4.0 SoundFile==0.10.3.post1 spacy==3.4.1 spacy-legacy==3.0.10 spacy-loggers==1.0.3 Sphinx==1.8.6 sphinxcontrib-serializinghtml==1.1.5 sphinxcontrib-websupport==1.2.4 SQLAlchemy==1.4.40 sqlparse==0.4.2 srsly==2.4.4 statsmodels==0.12.2 sympy==1.7.1 tables==3.7.0 tabulate==0.8.10 tblib==1.7.0 tenacity==8.0.1 tensorboard==2.8.0 tensorboard-data-server==0.6.1 tensorboard-plugin-wit==1.8.1 tensorflow==2.8.2+zzzcolab20220719082949 tensorflow-datasets==4.6.0 tensorflow-estimator==2.8.0 tensorflow-gcs-config==2.8.0 tensorflow-hub==0.12.0 tensorflow-io-gcs-filesystem==0.26.0 tensorflow-metadata==1.10.0 tensorflow-probability==0.16.0 termcolor==1.1.0 terminado==0.13.3 testpath==0.6.0 text-unidecode==1.3 textblob==0.15.3 thinc==8.1.0 threadpoolctl==3.1.0 tifffile==2021.11.2 toml==0.10.2 tomli==2.0.1 toolz==0.12.0 torch @ https://download.pytorch.org/whl/cu113/torch-1.12.1%2Bcu113-cp37-cp37m-linux_x86_64.whl torchaudio @ https://download.pytorch.org/whl/cu113/torchaudio-0.12.1%2Bcu113-cp37-cp37m-linux_x86_64.whl torchsummary==1.5.1 
torchtext==0.13.1 torchvision @ https://download.pytorch.org/whl/cu113/torchvision-0.13.1%2Bcu113-cp37-cp37m-linux_x86_64.whl tornado==5.1.1 tqdm==4.64.0 traitlets==5.1.1 tweepy==3.10.0 typeguard==2.7.1 typer==0.4.2 typing-extensions==4.1.1 tzlocal==1.5.1 ujson==5.4.0 uritemplate==3.0.1 urllib3==1.24.3 vega-datasets==0.9.0 wasabi==0.10.1 wcwidth==0.2.5 webargs==8.2.0 webencodings==0.5.1 Werkzeug==1.0.1 widgetsnbextension==3.6.1 wordcloud==1.8.2.2 wrapt==1.14.1 xarray==0.20.2 xarray-einstats==0.2.2 xgboost==0.90 xkit==0.0.0 xlrd==1.1.0 xlwt==1.3.0 yarl==1.8.1 yellowbrick==1.5 zict==2.2.0 zipp==3.8.1


titu1994 commented 2 years ago

Hmm, I've never tried it with Numba 0.56. I will try it locally later this month; I'm on vacation till the 19th.

stalevna commented 2 years ago

Hmm, I've never tried it with Numba 0.56. I will try it locally later this month; I'm on vacation till the 19th.

Thank you so much for helping and looking into it even though you are on vacation. I will try to run it with earlier Numba versions and report back :)

stalevna commented 2 years ago

Checked with numba==0.53.1 and everything worked.
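As a temporary workaround in Colab I am pinning Numba before building the model (a sketch; the runtime has to be restarted after the install so the pinned version is the one that actually gets imported):

# Workaround sketch: pin Numba to the version that worked here.
# In a notebook cell this is normally written as "!pip install numba==0.53.1".
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "numba==0.53.1"])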

titu1994 commented 2 years ago

Interesting. Also, Numba just released 0.56.2, which is said to include CUDA function caching bugfixes. It's doubtful that's your case, but could you try it?

stalevna commented 2 years ago

Interesting. Also, Numba just released 0.56.2, which is said to include CUDA function caching bugfixes. It's doubtful that's your case, but could you try it?

Seems like it wasn't fixed by the 0.56.2 release:


TypeError                                 Traceback (most recent call last)
<ipython-input-30-1bf927b35c34> in <module>
1 # Train the model
----> 2 trainer.fit(model)

42 frames

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path) 769 self.strategy.model = model 770 self._call_and_handle_interrupt( --> 771 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path 772 ) 773

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, kwargs) 721 return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, *kwargs) 722 else: --> 723 return trainer_fn(args, kwargs) 724 # TODO: treat KeyboardInterrupt as BaseException (delete the code below) in v1.7 725 except KeyboardInterrupt as exception:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path) 809 ckpt_path, model_provided=True, model_connected=self.lightning_module is not None 810 ) --> 811 results = self._run(model, ckpt_path=self.ckpt_path) 812 813 assert self.state.stopped

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path) 1234 self._checkpoint_connector.resume_end() 1235 -> 1236 results = self._run_stage() 1237 1238 log.detail(f"{self.class.name}: trainer tearing down")

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run_stage(self) 1321 if self.predicting: 1322 return self._run_predict() -> 1323 return self._run_train() 1324 1325 def _pre_training_routine(self):

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run_train(self) 1351 self.fit_loop.trainer = self 1352 with torch.autograd.set_detect_anomaly(self._detect_anomaly): -> 1353 self.fit_loop.run() 1354 1355 def _run_evaluate(self) -> _EVALUATE_OUTPUT:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, kwargs) 202 try: 203 self.on_advance_start(*args, *kwargs) --> 204 self.advance(args, kwargs) 205 self.on_advance_end() 206 self._restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/fit_loop.py in advance(self) 264 ) 265 with self.trainer.profiler.profile("run_training_epoch"): --> 266 self._outputs = self.epoch_loop.run(self._data_fetcher) 267 268 def on_advance_end(self) -> None:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, kwargs) 202 try: 203 self.on_advance_start(*args, *kwargs) --> 204 self.advance(args, kwargs) 205 self.on_advance_end() 206 self._restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py in advance(self, data_fetcher) 206 207 with self.trainer.profiler.profile("run_training_batch"): --> 208 batch_output = self.batch_loop.run(batch, batch_idx) 209 210 self.batch_progress.increment_processed()

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, kwargs) 202 try: 203 self.on_advance_start(*args, *kwargs) --> 204 self.advance(args, kwargs) 205 self.on_advance_end() 206 self._restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/batch/training_batch_loop.py in advance(self, batch, batch_idx) 86 if self.trainer.lightning_module.automatic_optimization: 87 optimizers = _get_active_optimizers(self.trainer.optimizers, self.trainer.optimizer_frequencies, batch_idx) ---> 88 outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx) 89 else: 90 outputs = self.manual_loop.run(split_batch, batch_idx)

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, kwargs) 202 try: 203 self.on_advance_start(*args, *kwargs) --> 204 self.advance(args, kwargs) 205 self.on_advance_end() 206 self._restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in advance(self, batch, *args, **kwargs) 205 self._batch_idx, 206 self._optimizers[self.optim_progress.optimizer_position], --> 207 self.optimizer_idx, 208 ) 209 if result.loss is not None:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in _run_optimization(self, split_batch, batch_idx, optimizer, opt_idx) 254 # gradient update with accumulated gradients 255 else: --> 256 self._optimizer_step(optimizer, opt_idx, batch_idx, closure) 257 258 result = closure.consume_result()

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in _optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure) 376 on_tpu=isinstance(self.trainer.accelerator, TPUAccelerator), 377 using_native_amp=(self.trainer.amp_backend == AMPType.NATIVE), --> 378 using_lbfgs=is_lbfgs, 379 ) 380

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_lightning_module_hook(self, hook_name, pl_module, *args, *kwargs) 1593 1594 with self.profiler.profile(f"[LightningModule]{pl_module.class.name}.{hook_name}"): -> 1595 output = fn(args, **kwargs) 1596 1597 # restore current_fx when nested context

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/lightning.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs) 1644 1645 """ -> 1646 optimizer.step(closure=optimizer_closure) 1647 1648 def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/optimizer.py in step(self, closure, kwargs) 166 167 assert self._strategy is not None --> 168 step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, kwargs) 169 170 self._on_after_step()

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/strategies/strategy.py in optimizer_step(self, optimizer, opt_idx, closure, model, kwargs) 191 """ 192 model = model or self.lightning_module --> 193 return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, kwargs) 194 195 def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/precision/precision_plugin.py in optimizer_step(self, model, optimizer, optimizer_idx, closure, kwargs) 153 if isinstance(model, pl.LightningModule): 154 closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure) --> 155 return optimizer.step(closure=closure, kwargs) 156 157 def _track_grad_norm(self, trainer: "pl.Trainer") -> None:

/usr/local/lib/python3.7/dist-packages/torch/optim/lr_scheduler.py in wrapper(*args, *kwargs) 63 instance._step_count += 1 64 wrapped = func.get(instance, cls) ---> 65 return wrapped(args, **kwargs) 66 67 # Note that the returned function here is no longer a bound method,

/usr/local/lib/python3.7/dist-packages/torch/optim/optimizer.py in wrapper(*args, *kwargs) 111 profile_name = "Optimizer.step#{}.step".format(obj.class.name) 112 with torch.autograd.profiler.record_function(profile_name): --> 113 return func(args, **kwargs) 114 return wrapper 115

/usr/local/lib/python3.7/dist-packages/nemo/core/optim/novograd.py in step(self, closure) 81 loss = None 82 if closure is not None: ---> 83 loss = closure() 84 85 for group in self.param_groups:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/precision/precision_plugin.py in _wrap_closure(self, model, optimizer, optimizer_idx, closure) 138 consistent with the PrecisionPlugin subclasses that cannot pass optimizer.step(closure) directly. 139 """ --> 140 closure_result = closure() 141 self._after_closure(model, optimizer, optimizer_idx) 142 return closure_result

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in call(self, *args, kwargs) 146 147 def call(self, *args: Any, *kwargs: Any) -> Optional[Tensor]: --> 148 self._result = self.closure(args, kwargs) 149 return self._result.loss 150

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in closure(self, *args, *kwargs) 132 133 def closure(self, args: Any, **kwargs: Any) -> ClosureResult: --> 134 step_output = self._step_fn() 135 136 if step_output.closure_loss is None:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in _training_step(self, split_batch, batch_idx, opt_idx) 425 426 # manually capture logged metrics --> 427 training_step_output = self.trainer._call_strategy_hook("training_step", *step_kwargs.values()) 428 self.trainer.strategy.post_training_step() 429

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_strategy_hook(self, hook_name, *args, *kwargs) 1763 1764 with self.profiler.profile(f"[Strategy]{self.strategy.class.name}.{hook_name}"): -> 1765 output = fn(args, **kwargs) 1766 1767 # restore current_fx when nested context

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/strategies/strategy.py in training_step(self, *args, *kwargs) 331 """ 332 with self.precision_plugin.train_step_context(): --> 333 return self.model.training_step(args, **kwargs) 334 335 def post_training_step(self):

/usr/local/lib/python3.7/dist-packages/nemo/utils/model_utils.py in wrap_training_step(wrapped, instance, args, kwargs) 362 @wrapt.decorator 363 def wrap_training_step(wrapped, instance: 'pl.LightningModule', args, kwargs): --> 364 output_dict = wrapped(*args, **kwargs) 365 366 if isinstance(output_dict, dict) and output_dict is not None and 'log' in output_dict:

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/models/rnnt_models.py in training_step(self, batch, batch_nb) 672 encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len) 673 else: --> 674 encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) 675 del signal 676

/usr/local/lib/python3.7/dist-packages/nemo/core/classes/common.py in call(self, wrapped, instance, args, kwargs) 1082 1083 # Call the method - this can be forward, or any other callable method -> 1084 outputs = wrapped(*args, **kwargs) 1085 1086 instance._attach_and_validate_output_types(

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/models/rnnt_models.py in forward(self, input_signal, input_signal_length, processed_signal, processed_signal_length) 655 # Spec augment is not applied during evaluation/testing 656 if self.spec_augmentation is not None and self.training: --> 657 processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) 658 659 encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length)

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, *kwargs) 1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks 1129 or _global_forward_hooks or _global_forward_pre_hooks): -> 1130 return forward_call(input, **kwargs) 1131 # Do not call functions when jit is used 1132 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/nemo/core/classes/common.py in call(self, wrapped, instance, args, kwargs) 1082 1083 # Call the method - this can be forward, or any other callable method -> 1084 outputs = wrapped(*args, **kwargs) 1085 1086 instance._attach_and_validate_output_types(

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/modules/audio_preprocessing.py in forward(self, input_spec, length) 508 # tensor must be on GPU and length must be provided 509 if self.spec_augment_numba is not None and spec_augment_launch_heuristics(augmented_spec, length): --> 510 augmented_spec = self.spec_augment_numba(input_spec=augmented_spec, length=length) 511 else: 512 augmented_spec = self.spec_augment(input_spec=augmented_spec, length=length)

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, *kwargs) 1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks 1129 or _global_forward_hooks or _global_forward_pre_hooks): -> 1130 return forward_call(input, **kwargs) 1131 # Do not call functions when jit is used 1132 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/nemo/core/classes/common.py in call(self, wrapped, instance, args, kwargs) 1082 1083 # Call the method - this can be forward, or any other callable method -> 1084 outputs = wrapped(*args, **kwargs) 1085 1086 instance._attach_and_validate_output_types(

/usr/local/lib/python3.7/dist-packages/torch/autograd/grad_mode.py in decorate_context(*args, kwargs) 25 def decorate_context(*args, *kwargs): 26 with self.clone(): ---> 27 return func(args, kwargs) 28 return cast(F, decorate_context) 29

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py in forward(self, input_spec, length)
300 freq_masks=self.freq_masks,
301 time_masks=self.time_masks,
--> 302 mask_value=self.mask_value,
303 )
304

/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py in launch_spec_augment_kernel(x, x_len, freq_starts, freq_lengths, time_starts, time_lengths, freq_masks, time_masks, mask_value)
167
168 # Launch CUDA kernel
--> 169 spec_augment_kernel[blocks_per_grid, threads_per_block, stream, 0](
170 x, x_len, freq_starts, freq_lengths, time_starts, time_lengths, mask_value
171 )

/usr/local/lib/python3.7/dist-packages/numba/cuda/dispatcher.py in __getitem__(self, args)
567 if len(args) not in [2, 3, 4]:
568 raise ValueError('must specify at least the griddim and blockdim')
--> 569 return self.configure(*args)
570
571 def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):

TypeError: unhashable type: 'list'

titu1994 commented 2 years ago

OK, I'll take a look once I'm back from vacation.

titu1994 commented 2 years ago

This should now be fixed on main and in the next release (r1.12.0).
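Until r1.12.0 is out, the fix can be picked up by installing NeMo from the main branch (a sketch following the install pattern in the NeMo README; the nemo_toolkit[asr] extras selector is an assumption and may need adjusting for your setup):

# Sketch: install NeMo from main to pick up the fix ahead of the r1.12.0 release.
# Notebook form: !pip install "git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]"
import subprocess
import sys

subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]",
])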