facebookresearch / demucs

Code for the paper Hybrid Spectrogram and Waveform Source Separation
MIT License

Running out of memory when trying to run training using dora #460

Open · TimurKhayrullin opened this issue 1 year ago

TimurKhayrullin commented 1 year ago

❓ Questions

Hi there, I'm currently trying to train htdemucs in a Colab environment with around 40 GB of VRAM. I made a simple variant of the default experiment and I'm running it with `!dora run -d`. I keep hitting the following CUDA out-of-memory error:

```
Executor: Starting 1 worker processes for DDP.
/usr/local/lib/python3.9/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default. See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(
[04-01 19:11:04][demucs.train][INFO] - For logs, checkpoints and samples check /content/gdrive/MyDrive/demucs/outputs/xps/72c7316c
[04-01 19:11:04][demucs.train][DEBUG] - {'dummy': None, 'dset': {'musdb': 'redacted', 'musdb_samplerate': 44100, 'use_musdb': True, 'wav': None, 'wav2': None, 'segment': 11, 'shift': 1, 'train_valid': False, 'full_cv': True, 'samplerate': 44100, 'channels': 2, 'normalize': True, 'metadata': '/content/gdrive/MyDrive/demucs/metadata', 'sources': ['drums', 'bass', 'other', 'vocals'], 'valid_samples': None, 'backend': None}, 'test': {'save': False, 'best': True, 'workers': 2, 'every': 20, 'split': True, 'shifts': 1, 'overlap': 0.25, 'sdr': True, 'metric': 'loss', 'nonhq': None}, 'epochs': 360, 'batch_size': 64, 'max_batches': None, 'optim': {'lr': 0.0003, 'momentum': 0.9, 'beta2': 0.999, 'loss': 'l1', 'optim': 'adam', 'weight_decay': 0, 'clip_grad': 0}, 'seed': 42, 'debug': False, 'valid_apply': True, 'flag': None, 'save_every': None, 'weights': [1.0, 1.0, 1.0, 1.0], 'augment': {'shift_same': False, 'repitch': {'proba': 0.2, 'max_tempo': 12}, 'remix': {'proba': 1, 'group_size': 4}, 'scale': {'proba': 1, 'min': 0.25, 'max': 1.25}, 'flip': True}, 'continue_from': None, 'continue_pretrained': None, 'pretrained_repo': None, 'continue_best': True, 'continue_opt': False, 'misc': {'num_workers': 1, 'num_prints': 4, 'show': False, 'verbose': True}, 'ema': {'epoch': [], 'batch': []}, 'use_train_segment': True, 'model_segment': None, 'model': 'htdemucs', 'demucs': {'channels': 64, 'growth': 2, 'depth': 6, 'rewrite': True, 'lstm_layers': 0, 'kernel_size': 8, 'stride': 4, 'context': 1, 'gelu': True, 'glu': True, 'norm_groups': 4, 'norm_starts': 4, 'dconv_depth': 2, 'dconv_mode': 1, 'dconv_comp': 4, 'dconv_attn': 4, 'dconv_lstm': 4, 'dconv_init': 0.0001, 'resample': True, 'normalize': False, 'rescale': 0.1}, 'hdemucs': {'channels': 48, 'channels_time': None, 'growth': 2, 'nfft': 4096, 'wiener_iters': 0, 'end_iters': 0, 'wiener_residual': False, 'cac': True, 'depth': 6, 'rewrite': True, 'hybrid': True, 'hybrid_old': False, 'multi_freqs': [], 'multi_freqs_depth': 3, 'freq_emb': 0.2, 'emb_scale': 10, 'emb_smooth': True, 'kernel_size': 8, 'stride': 4, 'time_stride': 2, 'context': 1, 'context_enc': 0, 'norm_starts': 4, 'norm_groups': 4, 'dconv_mode': 1, 'dconv_depth': 2, 'dconv_comp': 4, 'dconv_attn': 4, 'dconv_lstm': 4, 'dconv_init': 0.001, 'rescale': 0.1}, 'torch_hdemucs': {'channels': 48, 'growth': 2, 'nfft': 4096, 'depth': 6, 'freq_emb': 0.2, 'emb_scale': 10, 'emb_smooth': True, 'kernel_size': 8, 'stride': 4, 'time_stride': 2, 'context': 1, 'context_enc': 0, 'norm_starts': 4, 'norm_groups': 4, 'dconv_depth': 2, 'dconv_comp': 4, 'dconv_attn': 4, 'dconv_lstm': 4, 'dconv_init': 0.001}, 'htdemucs': {'channels': 48, 'channels_time': None, 'growth': 2, 'nfft': 4096, 'wiener_iters': 0, 'end_iters': 0, 'wiener_residual': False, 'cac': True, 'depth': 4, 'rewrite': True, 'multi_freqs': [], 'multi_freqs_depth': 3, 'freq_emb': 0.2, 'emb_scale': 10, 'emb_smooth': True, 'kernel_size': 8, 'stride': 4, 'time_stride': 2, 'context': 1, 'context_enc': 0, 'norm_starts': 4, 'norm_groups': 4, 'dconv_mode': 1, 'dconv_depth': 2, 'dconv_comp': 8, 'dconv_init': 0.001,
'bottom_channels': 0, 't_layers': 5, 't_hidden_scale': 4.0, 't_heads': 8, 't_dropout': 0.0, 't_layer_scale': True, 't_gelu': True, 't_emb': 'sin', 't_max_positions': 10000, 't_max_period': 10000.0, 't_weight_pos_embed': 1.0, 't_cape_mean_normalize': True, 't_cape_augment': True, 't_cape_glob_loc_scale': [5000.0, 1.0, 1.4], 't_sin_random_shift': 0, 't_norm_in': True, 't_norm_in_group': False, 't_group_norm': False, 't_norm_first': True, 't_norm_out': True, 't_weight_decay': 0.0, 't_lr': None, 't_sparse_self_attn': False, 't_sparse_cross_attn': False, 't_mask_type': 'diag', 't_mask_random_seed': 42, 't_sparse_attn_window': 400, 't_global_window': 100, 't_sparsity': 0.95, 't_auto_sparsity': False, 't_cross_first': False, 'rescale': 0.1}, 'svd': {'penalty': 0, 'min_size': 0.1, 'dim': 1, 'niters': 2, 'powm': False, 'proba': 1, 'conv_only': False, 'convtr': False, 'bs': 1}, 'quant': {'diffq': None, 'qat': None, 'min_size': 0.2, 'group_size': 8}, 'dora': {'dir': 'outputs', 'exclude': ['misc.', 'slurm.', 'test.reval', 'flag', 'dset.backend']}, 'slurm': {'time': 4320, 'constraint': 'volta32gb', 'setup': ['module load cudnn/v8.4.1.50-cuda.11.6 NCCL/2.11.4-6-cuda.11.6 cuda/11.6']}} [04-01 19:11:04][demucs.train][DEBUG] - {'dummy': None, 'dset': {'musdb': 'redacted', 'musdb_samplerate': 44100, 'use_musdb': True, 'wav': None, 'wav2': None, 'segment': 11, 'shift': 1, 'train_valid': False, 'full_cv': True, 'samplerate': 44100, 'channels': 2, 'normalize': True, 'metadata': './metadata', 'sources': ['drums', 'bass', 'other', 'vocals'], 'valid_samples': None, 'backend': None}, 'test': {'save': False, 'best': True, 'workers': 2, 'every': 20, 'split': True, 'shifts': 1, 'overlap': 0.25, 'sdr': True, 'metric': 'loss', 'nonhq': None}, 'epochs': 360, 'batch_size': 64, 'max_batches': None, 'optim': {'lr': 0.0003, 'momentum': 0.9, 'beta2': 0.999, 'loss': 'l1', 'optim': 'adam', 'weight_decay': 0, 'clip_grad': 0}, 'seed': 42, 'debug': False, 'valid_apply': True, 'flag': None, 'save_every': None, 'weights': [1.0, 1.0, 1.0, 1.0], 'augment': {'shift_same': False, 'repitch': {'proba': 0.2, 'max_tempo': 12}, 'remix': {'proba': 1, 'group_size': 4}, 'scale': {'proba': 1, 'min': 0.25, 'max': 1.25}, 'flip': True}, 'continue_from': None, 'continue_pretrained': None, 'pretrained_repo': None, 'continue_best': True, 'continue_opt': False, 'misc': {'num_workers': 1, 'num_prints': 4, 'show': False, 'verbose': True}, 'ema': {'epoch': [], 'batch': []}, 'use_train_segment': True, 'model_segment': None, 'model': 'htdemucs', 'demucs': {'channels': 64, 'growth': 2, 'depth': 6, 'rewrite': True, 'lstm_layers': 0, 'kernel_size': 8, 'stride': 4, 'context': 1, 'gelu': True, 'glu': True, 'norm_groups': 4, 'norm_starts': 4, 'dconv_depth': 2, 'dconv_mode': 1, 'dconv_comp': 4, 'dconv_attn': 4, 'dconv_lstm': 4, 'dconv_init': 0.0001, 'resample': True, 'normalize': False, 'rescale': 0.1}, 'hdemucs': {'channels': 48, 'channels_time': None, 'growth': 2, 'nfft': 4096, 'wiener_iters': 0, 'end_iters': 0, 'wiener_residual': False, 'cac': True, 'depth': 6, 'rewrite': True, 'hybrid': True, 'hybrid_old': False, 'multi_freqs': [], 'multi_freqs_depth': 3, 'freq_emb': 0.2, 'emb_scale': 10, 'emb_smooth': True, 'kernel_size': 8, 'stride': 4, 'time_stride': 2, 'context': 1, 'context_enc': 0, 'norm_starts': 4, 'norm_groups': 4, 'dconv_mode': 1, 'dconv_depth': 2, 'dconv_comp': 4, 'dconv_attn': 4, 'dconv_lstm': 4, 'dconv_init': 0.001, 'rescale': 0.1}, 'torch_hdemucs': {'channels': 48, 'growth': 2, 'nfft': 4096, 'depth': 6, 'freq_emb': 0.2, 'emb_scale': 10, 
'emb_smooth': True, 'kernel_size': 8, 'stride': 4, 'time_stride': 2, 'context': 1, 'context_enc': 0, 'norm_starts': 4, 'norm_groups': 4, 'dconv_depth': 2, 'dconv_comp': 4, 'dconv_attn': 4, 'dconv_lstm': 4, 'dconv_init': 0.001}, 'htdemucs': {'channels': 48, 'channels_time': None, 'growth': 2, 'nfft': 4096, 'wiener_iters': 0, 'end_iters': 0, 'wiener_residual': False, 'cac': True, 'depth': 4, 'rewrite': True, 'multi_freqs': [], 'multi_freqs_depth': 3, 'freq_emb': 0.2, 'emb_scale': 10, 'emb_smooth': True, 'kernel_size': 8, 'stride': 4, 'time_stride': 2, 'context': 1, 'context_enc': 0, 'norm_starts': 4, 'norm_groups': 4, 'dconv_mode': 1, 'dconv_depth': 2, 'dconv_comp': 8, 'dconv_init': 0.001, 'bottom_channels': 0, 't_layers': 5, 't_hidden_scale': 4.0, 't_heads': 8, 't_dropout': 0.0, 't_layer_scale': True, 't_gelu': True, 't_emb': 'sin', 't_max_positions': 10000, 't_max_period': 10000.0, 't_weight_pos_embed': 1.0, 't_cape_mean_normalize': True, 't_cape_augment': True, 't_cape_glob_loc_scale': [5000.0, 1.0, 1.4], 't_sin_random_shift': 0, 't_norm_in': True, 't_norm_in_group': False, 't_group_norm': False, 't_norm_first': True, 't_norm_out': True, 't_weight_decay': 0.0, 't_lr': None, 't_sparse_self_attn': False, 't_sparse_cross_attn': False, 't_mask_type': 'diag', 't_mask_random_seed': 42, 't_sparse_attn_window': 400, 't_global_window': 100, 't_sparsity': 0.95, 't_auto_sparsity': False, 't_cross_first': False, 'rescale': 0.1}, 'svd': {'penalty': 0, 'min_size': 0.1, 'dim': 1, 'niters': 2, 'powm': False, 'proba': 1, 'conv_only': False, 'convtr': False, 'bs': 1}, 'quant': {'diffq': None, 'qat': None, 'min_size': 0.2, 'group_size': 8}, 'dora': {'dir': 'outputs', 'exclude': ['misc.', 'slurm.', 'test.reval', 'flag', 'dset.backend']}, 'slurm': {'time': 4320, 'constraint': 'volta32gb', 'setup': ['module load cudnn/v8.4.1.50-cuda.11.6 NCCL/2.11.4-6-cuda.11.6 cuda/11.6']}}
[04-01 19:11:04][dora.distrib][INFO] - world_size is 1, skipping init.
[04-01 19:11:06][demucs.train][INFO] - train/valid set size: 18522 13
[04-01 19:11:06][demucs.solver][INFO] - ----------------------------------------------------------------------
[04-01 19:11:06][demucs.solver][INFO] - Training...
Error executing job with overrides: ['variant=timurxp']
Traceback (most recent call last):
  File "/content/gdrive/MyDrive/demucs/demucs/train.py", line 243, in main
    solver.train()
  File "/content/gdrive/MyDrive/demucs/demucs/solver.py", line 195, in train
    metrics['train'] = self._run_one_epoch(epoch)
  File "/content/gdrive/MyDrive/demucs/demucs/solver.py", line 318, in _run_one_epoch
    estimate = self.dmodel(mix)
  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/gdrive/MyDrive/demucs/demucs/htdemucs.py", line 576, in forward
    x = encode(x, inject)
  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/gdrive/MyDrive/demucs/demucs/hdemucs.py", line 149, in forward
    y = self.dconv(y)
  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/gdrive/MyDrive/demucs/demucs/demucs.py", line 153, in forward
    x = x + layer(x)
  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/container.py", line 204, in forward
    input = module(input)
  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/activation.py", line 643, in forward
    return F.glu(input, self.dim)
  File "/usr/local/lib/python3.9/dist-packages/torch/nn/functional.py", line 1493, in glu
    return torch._C._nn.glu(input, dim)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.19 GiB (GPU 0; 39.56 GiB total capacity; 36.81 GiB already allocated; 488.56 MiB free; 37.37 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
Executor: Worker 0 died, killing all workers
```
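
(For context on the last line of that error: the `max_split_size_mb` hint refers to PyTorch's `PYTORCH_CUDA_ALLOC_CONF` environment variable. A minimal sketch of setting it for one run is below; 128 is an illustrative value, not a recommendation from this thread, and it only mitigates allocator fragmentation rather than fixing a workload that genuinely exceeds the GPU.)

```bash
# Sketch: pass the allocator option inline for a single run.
# "variant=timurxp" is the override visible in the log above.
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 dora run -d variant=timurxp
```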

I looked online and found that the issue might be with my batch size, but I'm not sure how to change this setting. Any help or advice is greatly appreciated.

KimberleyJensen commented 1 year ago

Lower `batch_size` (currently 64). You can also try lowering `segment` (currently 11), but you probably don't want to go below 6.
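
For reference, a minimal sketch of passing those overrides on the Dora command line, assuming the Hydra-style `key=value` syntax that demucs training uses (the key names match the config dump above; 16 and 6 are example values, not tuned recommendations):

```bash
# Lower the batch size and training segment length via overrides.
# In the config dump, batch_size is top-level and segment lives under dset.
dora run -d variant=timurxp batch_size=16 dset.segment=6
```

In Colab, prefix the command with `!` as in your original invocation. If it still runs out of memory, `batch_size` is usually the first knob to halve again before touching `segment`.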