Lightning-AI / pytorch-lightning

Pretrain, finetune and deploy AI models on multiple GPUs, TPUs with zero code changes.
https://lightning.ai
Apache License 2.0

deepspeed strategy can't save checkpoint, TypeError: cannot pickle `torch._C._distributed_c10d.ProcessGroup` object #17369

Open dmitrymailk opened 1 year ago

dmitrymailk commented 1 year ago

Bug description

I'm trying to use https://github.com/ashleve/lightning-hydra-template with the deepspeed strategy. Here is my fork: https://github.com/dmitrymailk/ru_lm/tree/61ab735110b3c80a3cb3d58b3d7c5c05d4cf56af

And I get this error: TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object

I don't think this is a pytorch-lightning problem itself, because the error is raised in deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:

'''Copyright The Microsoft DeepSpeed Team'''

import torch
from deepspeed.utils import logger, log_dist
from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \
    CheckpointEngine

class TorchCheckpointEngine(CheckpointEngine):
    def __init__(self, config_params=None):
        super().__init__(config_params)

    def create(self, tag):
        log_dist(f"[Torch] Checkpoint {tag} is about to be saved!", ranks=[0])

    def save(self, state_dict, path: str):
        logger.info(f"[Torch] Saving {path}...")
        torch.save(state_dict, path)  # <-- this is the line that raises the error
        logger.info(f"[Torch] Saved {path}.")
        return None

    def load(self, path: str, map_location=None):
        logger.info(f"[Torch] Loading checkpoint from {path}...")
        partition = torch.load(path, map_location=map_location)
        logger.info(f"[Torch] Loaded checkpoint from {path}.")
        return partition

    def commit(self, tag):
        logger.info(f"[Torch] Checkpoint {tag} is ready now!")
        return True
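
For what it's worth, the failure is reproducible outside of Lightning and DeepSpeed entirely: a ProcessGroup simply refuses to pickle. A minimal standalone sketch (my own illustration, assuming a single-process gloo setup on CPU):

import pickle

import torch.distributed as dist

# Spin up a trivial single-process group just to obtain a ProcessGroup object.
dist.init_process_group(
    "gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1
)
pg = dist.new_group([0])

# Raises: TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object
pickle.dumps(pg)

So something inside the state dict handed to torch.save must be holding a reference to a ProcessGroup.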

The `state_dict` passed to `torch.save` is (tensor values elided):

{'module': OrderedDict([
     ('_forward_module.net.model.0.weight', tensor([...], device='cuda:2')),
     ('_forward_module.net.model.0.bias', tensor([...], device='cuda:2')),
     ('_forward_module.net.model.1.weight', tensor([...], device='cuda:2')),
     ('_forward_module.net.model.1.bias', tensor([...], device='cuda:2')),
     ('_forward_module.net.model.1.running_mean', tensor([...], device='cuda:2')),
     ('_forward_module.net.model.1.running_var', tensor([...], device='cuda:2')),
     ('_forward_module.net.model.1.num_batches_tracked', tensor(430, device='cuda:2')),
     ('_forward_module.net.model.3.weight', tensor([...], device='cuda:2')),
     ('_forward_module.net.model.3.bias', tensor([...], device='cuda:2')),
     ...]),
 'buffer_names': ['_forward_module.net.model.1.running_mean', '_forward_module.net.model.1.running_var', '_forward_module.net.model.1.num_batches_tracked', '_forward_module.net.model.4.running_mean', '_forward_module.net.model.4.running_var', '_forward_module.net.model.4.num_batches_tracked', '_forward_module.net.model.7.running_mean', '_forward_module.net.model.7.running_var', '_forward_module.net.model.7.num_batches_tracked'],
 'optimizer': None,
 'param_shapes': [OrderedDict([...])],
 'lr_scheduler': None,
 'data_sampler': None,
 'random_ltd': None,
 'sparse_tensor_module_names': {},
 'skipped_steps': 0,
 'global_steps': 430,
 'global_samples': 430,
 'dp_world_size': 1,
 'mp_world_size': 1,
 'ds_config': {'zero_allow_untested_optimizer': True, 'zero_optimization': {...}, 'activation_checkpointing': {...}, 'aio': {...}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 1, 'gradient_clipping': 0.0},
 ...}
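
To locate the offending entry in a dict like this, a small recursive helper (just a debugging sketch, not part of the template) can attempt to pickle every leaf and print the paths that fail:

import pickle

def find_unpicklable(obj, path="state_dict"):
    """Walk nested dicts/lists/tuples and report every leaf that the
    pickler rejects, together with its path inside the structure."""
    if isinstance(obj, dict):
        for key, value in obj.items():
            find_unpicklable(value, f"{path}[{key!r}]")
    elif isinstance(obj, (list, tuple)):
        for index, value in enumerate(obj):
            find_unpicklable(value, f"{path}[{index}]")
    else:
        try:
            pickle.dumps(obj)
        except Exception as err:
            print(f"{path}: {type(obj).__name__} -> {err}")

Calling find_unpicklable(state_dict) right before the failing torch.save prints the path of whatever is carrying the ProcessGroup.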

What version are you seeing the problem on?

2.0+

How to reproduce the bug

git clone https://github.com/dmitrymailk/ru_lm
cd ru_lm
git checkout 61ab735110b3c80a3cb3d58b3d7c5c05d4cf56af
pip install -r requirements.txt
python src/train.py

You must change `devices` in `configs/trainer/deepspeed.yaml` to match your machine.

Error messages and logs


[2023-04-14 00:57:35,767][src.utils.utils][INFO] - Enforcing tags! <cfg.extras.enforce_tags=True>
[2023-04-14 00:57:35,773][src.utils.utils][INFO] - Printing config tree with Rich! <cfg.extras.print_config=True>
[2023-04-14 00:57:35,773][src.utils.rich_utils][WARNING] - Field 'logger' not found in config. Skipping 'logger' config printing...
CONFIG
├── data
│   └── _target_: src.data.mnist_datamodule.MNISTDataModule                                                                    
│       data_dir: /cephfs/home/kosenko/deepspeed/ru_lm/data/                                                                   
│       batch_size: 128                                                                                                        
│       train_val_test_split:                                                                                                  
│       - 55000                                                                                                                
│       - 5000                                                                                                                 
│       - 10000                                                                                                                
│       num_workers: 0                                                                                                         
│       pin_memory: false                                                                                                      
│                                                                                                                              
├── model
│   └── _target_: src.models.mnist_module.MNISTLitModule                                                                       
│       optimizer:                                                                                                             
│         _target_: torch.optim.Adam                                                                                           
│         _partial_: true                                                                                                      
│         lr: 0.001                                                                                                            
│         weight_decay: 0.0                                                                                                    
│       scheduler:                                                                                                             
│         _target_: torch.optim.lr_scheduler.ReduceLROnPlateau                                                                 
│         _partial_: true                                                                                                      
│         mode: min                                                                                                            
│         factor: 0.1                                                                                                          
│         patience: 10                                                                                                         
│       net:                                                                                                                   
│         _target_: src.models.components.simple_dense_net.SimpleDenseNet                                                      
│         input_size: 784                                                                                                      
│         lin1_size: 64                                                                                                        
│         lin2_size: 128                                                                                                       
│         lin3_size: 64                                                                                                        
│         output_size: 10                                                                                                      
│                                                                                                                              
├── callbacks
│   └── model_checkpoint:                                                                                                      
│         _target_: lightning.pytorch.callbacks.ModelCheckpoint                                                                
│         dirpath: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35/checkpoints                        
│         filename: epoch_{epoch:03d}                                                                                          
│         monitor: val/acc                                                                                                     
│         verbose: false                                                                                                       
│         save_last: true                                                                                                      
│         save_top_k: 1                                                                                                        
│         mode: max                                                                                                            
│         auto_insert_metric_name: false                                                                                       
│         save_weights_only: false                                                                                             
│         every_n_train_steps: null                                                                                            
│         train_time_interval: null                                                                                            
│         every_n_epochs: null                                                                                                 
│         save_on_train_epoch_end: null                                                                                        
│       early_stopping:                                                                                                        
│         _target_: lightning.pytorch.callbacks.EarlyStopping                                                                  
│         monitor: val/acc                                                                                                     
│         min_delta: 0.0                                                                                                       
│         patience: 100                                                                                                        
│         verbose: false                                                                                                       
│         mode: max                                                                                                            
│         strict: true                                                                                                         
│         check_finite: true                                                                                                   
│         stopping_threshold: null                                                                                             
│         divergence_threshold: null                                                                                           
│         check_on_train_epoch_end: null                                                                                       
│       model_summary:                                                                                                         
│         _target_: lightning.pytorch.callbacks.RichModelSummary                                                               
│         max_depth: -1                                                                                                        
│       rich_progress_bar:                                                                                                     
│         _target_: lightning.pytorch.callbacks.RichProgressBar                                                                
│                                                                                                                              
├── trainer
│   └── _target_: lightning.pytorch.trainer.Trainer                                                                            
│       default_root_dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35                             
│       min_epochs: 1                                                                                                          
│       max_epochs: 2                                                                                                          
│       check_val_every_n_epoch: 1                                                                                             
│       deterministic: false                                                                                                   
│       accelerator: gpu                                                                                                       
│       devices:                                                                                                               
│       - 2                                                                                                                    
│       strategy: deepspeed                                                                                                    
│                                                                                                                              
├── paths
│   └── root_dir: /cephfs/home/kosenko/deepspeed/ru_lm                                                                         
│       data_dir: /cephfs/home/kosenko/deepspeed/ru_lm/data/                                                                   
│       log_dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/                                                                    
│       output_dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35                                   
│       work_dir: /cephfs/home/kosenko/deepspeed/ru_lm                                                                         
│                                                                                                                              
├── extras
│   └── ignore_warnings: false                                                                                                 
│       enforce_tags: true                                                                                                     
│       print_config: true                                                                                                     
│                                                                                                                              
├── task_name
│   └── train                                                                                                                  
├── tags
│   └── ['dev']                                                                                                                
├── train
│   └── True                                                                                                                   
├── test
│   └── False                                                                                                                  
├── compile
│   └── False                                                                                                                  
├── ckpt_path
│   └── None                                                                                                                   
└── seed
    └── None                                                                                                                   
[2023-04-14 00:57:35,822][__main__][INFO] - Instantiating datamodule <src.data.mnist_datamodule.MNISTDataModule>
[2023-04-14 00:57:35,825][__main__][INFO] - Instantiating model <src.models.mnist_module.MNISTLitModule>
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:197: UserWarning: Attribute 'net' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['net'])`.
  rank_zero_warn(
[2023-04-14 00:57:35,886][__main__][INFO] - Instantiating callbacks...
[2023-04-14 00:57:35,886][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.ModelCheckpoint>
[2023-04-14 00:57:35,890][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.EarlyStopping>
[2023-04-14 00:57:35,891][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.RichModelSummary>
[2023-04-14 00:57:35,891][src.utils.instantiators][INFO] - Instantiating callback <lightning.pytorch.callbacks.RichProgressBar>
[2023-04-14 00:57:35,892][__main__][INFO] - Instantiating loggers...
[2023-04-14 00:57:35,892][src.utils.instantiators][WARNING] - No logger configs found! Skipping...
[2023-04-14 00:57:35,893][__main__][INFO] - Instantiating trainer <lightning.pytorch.trainer.Trainer>
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[2023-04-14 00:57:36,181][__main__][INFO] - Starting training!
initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1
[2023-04-14 00:57:36,548][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0
[2023-04-14 00:57:36,548][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.
[2023-04-14 00:57:36,548] [WARNING] [deepspeed.py:637:_auto_select_batch_size] Tried to infer the batch size for internal deepspeed logging from the `train_dataloader()`. To ensure DeepSpeed logging remains correct, please manually pass the plugin with the batch size, `Trainer(strategy=DeepSpeedStrategy(logging_batch_size_per_gpu=batch_size))`.
You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
[2023-04-14 00:57:38,095][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:2 to store for rank: 0
[2023-04-14 00:57:38,095][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:2 with 1 nodes.
Using /cephfs/home/kosenko/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Emitting ninja build file /cephfs/home/kosenko/.cache/torch_extensions/py310_cu118/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.10430693626403809 seconds
Rank: 0 partition count [1] and sizes[(67978, False)] 
Using /cephfs/home/kosenko/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0004889965057373047 seconds
┏━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
┃    ┃ Name         ┃ Type               ┃ Params ┃
┡━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
│ 0  │ net          │ SimpleDenseNet     │ 68.0 K │
│ 1  │ net.model    │ Sequential         │ 68.0 K │
│ 2  │ net.model.0  │ Linear             │ 50.2 K │
│ 3  │ net.model.1  │ BatchNorm1d        │    128 │
│ 4  │ net.model.2  │ ReLU               │      0 │
│ 5  │ net.model.3  │ Linear             │  8.3 K │
│ 6  │ net.model.4  │ BatchNorm1d        │    256 │
│ 7  │ net.model.5  │ ReLU               │      0 │
│ 8  │ net.model.6  │ Linear             │  8.3 K │
│ 9  │ net.model.7  │ BatchNorm1d        │    128 │
│ 10 │ net.model.8  │ ReLU               │      0 │
│ 11 │ net.model.9  │ Linear             │    650 │
│ 12 │ criterion    │ CrossEntropyLoss   │      0 │
│ 13 │ train_acc    │ MulticlassAccuracy │      0 │
│ 14 │ val_acc      │ MulticlassAccuracy │      0 │
│ 15 │ test_acc     │ MulticlassAccuracy │      0 │
│ 16 │ train_loss   │ MeanMetric         │      0 │
│ 17 │ val_loss     │ MeanMetric         │      0 │
│ 18 │ test_loss    │ MeanMetric         │      0 │
│ 19 │ val_acc_best │ MaxMetric          │      0 │
└────┴──────────────┴────────────────────┴────────┘
Trainable params: 68.0 K                                                                                                       
Non-trainable params: 0                                                                                                        
Total params: 68.0 K                                                                                                           
Total estimated model params size (MB): 0                                                                                      
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:430: 
PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing 
the value of the `num_workers` argument` (try 96 which is the number of cpus on this machine) in the `DataLoader` init to 
improve performance.
  rank_zero_warn(
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:432: 
PossibleUserWarning: It is recommended to use `self.log('val/acc_best', ..., sync_dist=True)` when logging on epoch level in 
distributed setting to accumulate the metric across devices.
  warning_cache.warn(
/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:430: 
PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider 
increasing the value of the `num_workers` argument` (try 96 which is the number of cpus on this machine) in the `DataLoader` 
init to improve performance.
  rank_zero_warn(
/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being 
deprecated, use kwargs instead. Refer to 
https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
  warnings.warn(
Epoch 0/1  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 430/430 0:00:13 • 0:00:00 32.62it/s val/loss: 0.114 val/acc: 0.967          
                                                                                       val/acc_best: 0.967 train/loss: 0.327   
                                                                                       train/acc: 0.92                         
[2023-04-14 00:57:54,771][src.utils.utils][ERROR] - 
Traceback (most recent call last):
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/utils/utils.py", line 65, in wrap
    metric_dict, object_dict = task_func(cfg=cfg)
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/train.py", line 89, in train
    trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 520, in fit
    call._call_and_handle_interrupt(
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 42, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 92, in launch
    return function(*args, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 559, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 935, in _run
    results = self._run_stage()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 978, in _run_stage
    self.fit_loop.run()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
    self.on_advance_end()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 369, in on_advance_end
    call._call_callback_hooks(trainer, "on_train_epoch_end", monitoring_callbacks=True)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 190, in _call_callback_hooks
    fn(trainer, trainer.lightning_module, *args, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 303, in on_train_epoch_end
    self._save_topk_checkpoint(trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 360, in _save_topk_checkpoint
    self._save_monitor_checkpoint(trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 658, in _save_monitor_checkpoint
    self._update_best_and_save(current, trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 709, in _update_best_and_save
    self._save_checkpoint(trainer, filepath)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 365, in _save_checkpoint
    trainer.save_checkpoint(filepath, self.save_weights_only)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1262, in save_checkpoint
    self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 500, in save_checkpoint
    self.trainer.strategy.save_checkpoint(_checkpoint, filepath, storage_options=storage_options)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/deepspeed.py", line 772, in save_checkpoint
    self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3133, in save_checkpoint
    self._save_checkpoint(save_dir, tag, client_state=client_state)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3345, in _save_checkpoint
    self.checkpoint_engine.save(state, save_path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save
    torch.save(state_dict, path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 441, in save
    _save(obj, opened_zipfile, pickle_module, pickle_protocol)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 653, in _save
    pickler.dump(obj)
TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object
[2023-04-14 00:57:54,775][src.utils.utils][INFO] - Output dir: /cephfs/home/kosenko/deepspeed/ru_lm/logs/train/runs/2023-04-14_00-57-35
Error executing job with overrides: []
Traceback (most recent call last):
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/train.py", line 117, in main
    metric_dict, _ = train(cfg)
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/utils/utils.py", line 75, in wrap
    raise ex
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/utils/utils.py", line 65, in wrap
    metric_dict, object_dict = task_func(cfg=cfg)
  File "/cephfs/home/kosenko/deepspeed/ru_lm/src/train.py", line 89, in train
    trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 520, in fit
    call._call_and_handle_interrupt(
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 42, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 92, in launch
    return function(*args, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 559, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 935, in _run
    results = self._run_stage()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 978, in _run_stage
    self.fit_loop.run()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
    self.on_advance_end()
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 369, in on_advance_end
    call._call_callback_hooks(trainer, "on_train_epoch_end", monitoring_callbacks=True)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 190, in _call_callback_hooks
    fn(trainer, trainer.lightning_module, *args, **kwargs)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 303, in on_train_epoch_end
    self._save_topk_checkpoint(trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 360, in _save_topk_checkpoint
    self._save_monitor_checkpoint(trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 658, in _save_monitor_checkpoint
    self._update_best_and_save(current, trainer, monitor_candidates)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 709, in _update_best_and_save
    self._save_checkpoint(trainer, filepath)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 365, in _save_checkpoint
    trainer.save_checkpoint(filepath, self.save_weights_only)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1262, in save_checkpoint
    self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 500, in save_checkpoint
    self.trainer.strategy.save_checkpoint(_checkpoint, filepath, storage_options=storage_options)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/lightning/pytorch/strategies/deepspeed.py", line 772, in save_checkpoint
    self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3133, in save_checkpoint
    self._save_checkpoint(save_dir, tag, client_state=client_state)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3345, in _save_checkpoint
    self.checkpoint_engine.save(state, save_path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save
    torch.save(state_dict, path)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 441, in save
    _save(obj, opened_zipfile, pickle_module, pickle_protocol)
  File "/home/kosenko/miniconda3/lib/python3.10/site-packages/torch/serialization.py", line 653, in _save
    pickler.dump(obj)
TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object

Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.

Environment

Current environment

* CUDA:
  - GPU: 4x NVIDIA A100-SXM4-40GB
  - available: True
  - version: 11.8
* Lightning:
  - lightning: 2.0.1.post0
  - lightning-cloud: 0.5.33
  - lightning-colossalai: 0.1.0
  - lightning-utilities: 0.8.0
  - pytorch-lightning: 2.0.1.post0
  - torch: 2.0.0+cu118
  - torchaudio: 2.0.1+cu118
  - torchmetrics: 0.11.4
  - torchvision: 0.15.1+cu118
* Packages: absl-py: 1.4.0 - accelerate: 0.18.0 - aiofiles: 23.1.0 - aiohttp: 3.8.4 - aiosignal: 1.3.1 - alembic: 1.10.3 - altair: 4.2.2 - antlr4-python3-runtime: 4.9.3 - anyio: 3.6.2 - apex: 0.1 - appdirs: 1.4.4 - arrow: 1.2.3 - asttokens: 2.2.1 - async-timeout: 4.0.2 - attrs: 22.2.0 - autopage: 0.5.1 - backcall: 0.2.0 - backports.functools-lru-cache: 1.6.4 - bcrypt: 4.0.1 - beautifulsoup4: 4.12.2 - bitsandbytes: 0.37.2 - black: 23.3.0 - blessed: 1.20.0 - boltons: 23.0.0 - brotlipy: 0.7.0 - cachetools: 5.3.0 - certifi: 2022.12.7 - cffi: 1.15.1 - cfgv: 3.3.1 - charset-normalizer: 2.0.4 - click: 8.1.3 - cliff: 4.2.0 - cmaes: 0.9.1 - cmake: 3.25.0 - cmd2: 2.4.3 - colorlog: 6.7.0 - colossalai: 0.2.8 - conda: 23.3.1 - conda-content-trust: 0.1.3 - conda-package-handling: 2.0.2 - conda-package-streaming: 0.7.0 - contexttimer: 0.3.3 - contourpy: 1.0.7 - croniter: 1.3.14 - cryptography: 38.0.4 - cycler: 0.11.0 - datasets: 2.11.0 - dateutils: 0.6.12 - debugpy: 1.5.1 - decorator: 5.1.1 - deepdiff: 6.3.0 - deepspeed: 0.8.3 - dill: 0.3.6 - distlib: 0.3.6 - docker-pycreds: 0.4.0 - einops: 0.6.0 - entrypoints: 0.4 - evaluate: 0.4.0 - exceptiongroup: 1.1.1 - executing: 1.2.0 - fabric: 3.0.0 - fastapi: 0.88.0 - ffmpy: 0.3.0 - filelock: 3.9.0 - fire: 0.5.0 - flash-attn: 0.2.8 - flit-core: 3.8.0 - fonttools: 4.39.3 - frozenlist: 1.3.3 - fschat: 0.1.10 - fsspec: 2023.4.0 - gitdb: 4.0.10 - gitpython: 3.1.31 - gmpy2: 2.1.2 - google-auth: 2.17.3 - google-auth-oauthlib: 1.0.0 - gradio: 3.23.0 - gradio-client: 0.0.8 - greenlet: 2.0.2 - grpcio: 1.53.0 - h11: 0.14.0 - hjson: 3.1.0 - html2text: 2020.1.16 - httpcore: 0.16.3 - httpx: 0.23.3 - huggingface-hub: 0.13.4 - hydra-colorlog: 1.2.0 - hydra-core: 1.3.2 - hydra-optuna-sweeper: 1.2.0 - identify: 2.5.22 - idna: 3.4 - importlib-metadata: 6.3.0 - iniconfig: 2.0.0 - inquirer: 3.1.3 - invoke: 2.0.0 - ipykernel: 6.15.0 - ipython: 8.12.0 - itsdangerous: 2.1.2 - jedi: 0.18.2 - jinja2: 3.1.2 - joblib: 1.2.0 - jsonlines: 3.1.0 - jsonpatch: 1.32 - jsonpointer: 2.1 - jsonschema: 4.17.3 - jupyter-client: 7.3.4 - jupyter-core: 4.12.0 - kiwisolver: 1.4.4 - lightning: 2.0.1.post0 - lightning-cloud: 0.5.33 - lightning-colossalai: 0.1.0 - lightning-utilities: 0.8.0 - linkify-it-py: 2.0.0 - lit: 15.0.7 - loralib: 0.1.1 - mako: 1.2.4 - markdown: 3.4.3 - markdown-it-py: 2.2.0 - markdown2: 2.4.8 - markupsafe: 2.1.1 - matplotlib: 3.7.1 - matplotlib-inline: 0.1.6 - mdit-py-plugins: 0.3.3 - mdurl: 0.1.2 - mkl-fft: 1.3.1 - mkl-random: 1.2.2 - mkl-service: 2.4.0 - mpmath: 1.2.1 - multidict: 6.0.4 - multiprocess: 0.70.14 - mypy-extensions: 1.0.0 - nest-asyncio: 1.5.6 - networkx: 2.8.4 - ninja: 1.11.1 - nodeenv: 1.7.0 - numpy: 1.23.5 - nvidia-cublas-cu11: 11.10.3.66 - nvidia-cuda-nvrtc-cu11: 11.7.99 - nvidia-cuda-runtime-cu11: 11.7.99 - nvidia-cudnn-cu11: 8.5.0.96 - oauthlib: 3.2.2 - omegaconf: 2.3.0 - optuna: 2.10.1 - ordered-set: 4.1.0 - orjson: 3.8.10 - packaging: 23.0 - pandas: 2.0.0 - paramiko: 3.1.0 - parso: 0.8.3 - pathspec: 0.11.1 - pathtools: 0.1.2 - pbr: 5.11.1 - peft: 0.3.0.dev0 - pexpect: 4.8.0 - pickleshare: 0.7.5 - pillow: 9.4.0 - pip: 22.3.1 - platformdirs: 3.2.0 - pluggy: 1.0.0 - pre-commit: 3.2.2 - prettytable: 3.7.0 - prompt-toolkit: 3.0.38 - protobuf: 3.20.3 - psutil: 5.9.4 - ptyprocess: 0.7.0 - pure-eval: 0.2.2 - py-cpuinfo: 9.0.0 - pyarrow: 11.0.0 - pyasn1: 0.4.8 - pyasn1-modules: 0.2.8 - pycosat: 0.6.4 - pycparser: 2.21 - pydantic: 1.10.7 - pydeprecate: 0.3.2 - pydub: 0.25.1 - pygments: 2.14.0 - pyjwt: 2.6.0 - pynacl: 1.5.0 - pyopenssl: 22.0.0 - pyparsing: 3.0.9 - pyperclip: 1.8.2 - pyrootutils: 1.0.4 - pyrsistent: 0.19.3 - pysocks: 1.7.1 - pytest: 7.3.0 - python-dateutil: 2.8.2 - python-dotenv: 1.0.0 - python-editor: 1.0.4 - python-multipart: 0.0.6 - pytorch-lightning: 2.0.1.post0 - pytz: 2023.3 - pyyaml: 6.0 - pyzmq: 23.2.0 - readchar: 4.0.5 - regex: 2023.3.23 - requests: 2.28.1 - requests-oauthlib: 1.3.1 - responses: 0.18.0 - rfc3986: 1.5.0 - rich: 13.3.3 - rsa: 4.9 - ruamel.yaml: 0.17.21 - ruamel.yaml.clib: 0.2.6 - safetensors: 0.3.0 - scikit-learn: 1.2.2 - scipy: 1.10.1 - semantic-version: 2.10.0 - sentencepiece: 0.1.97 - sentry-sdk: 1.19.1 - setproctitle: 1.3.2 - setuptools: 65.6.3 - six: 1.16.0 - smmap: 5.0.0 - sniffio: 1.3.0 - soupsieve: 2.4 - sqlalchemy: 2.0.9 - stack-data: 0.6.2 - starlette: 0.22.0 - starsessions: 1.3.0 - stevedore: 5.0.0 - svgwrite: 1.4.3 - sympy: 1.11.1 - tensorboard: 2.12.2 - tensorboard-data-server: 0.7.0 - tensorboard-plugin-wit: 1.8.1 - termcolor: 2.2.0 - threadpoolctl: 3.1.0 - tokenize-rt: 5.0.0 - tokenizers: 0.13.3 - tomli: 2.0.1 - toolz: 0.12.0 - torch: 2.0.0+cu118 - torchaudio: 2.0.1+cu118 - torchmetrics: 0.11.4 - torchvision: 0.15.1+cu118 - tornado: 6.1 - tqdm: 4.64.1 - traitlets: 5.9.0 - transformers: 4.28.0.dev0 - triton: 2.0.0 - typing-extensions: 4.4.0 - tzdata: 2023.3 - uc-micro-py: 1.0.1 - urllib3: 1.26.14 - uvicorn: 0.21.1 - virtualenv: 20.21.0 - wandb: 0.14.2 - wavedrom: 2.0.3.post3 - wcwidth: 0.2.6 - websocket-client: 1.5.1 - websockets: 11.0.1 - werkzeug: 2.2.3 - wheel: 0.37.1 - xxhash: 3.2.0 - yarl: 1.8.2 - zipp: 3.15.0 - zstandard: 0.18.0
* System:
  - OS: Linux
  - architecture: 64bit, ELF
  - processor: x86_64
  - python: 3.10.9
  - version: #76~20.04.1-Ubuntu SMP Mon Mar 20 15:54:19 UTC 2023

More info

No response

cc @awaelchli

keunwoochoi commented 1 year ago

Having the same issue.

SpirinEgor commented 1 year ago

Hello! I ran into the same issue when using Lightning+DeepSpeed. @keunwoochoi @dmitrymailk were you able to fix that?

keunwoochoi commented 1 year ago

Hi @SpirinEgor, I can't remember how I fixed it. I tried installing torch with conda and the right CUDA version, installing deepspeed with conda, etc. I don't use `TorchCheckpointEngine(CheckpointEngine)` or anything like that myself; I only use Lightning checkpointing, if that matters.

dmitrymailk commented 1 year ago

Hi @SpirinEgor, I didn't fix it; I just switched to the vanilla DeepSpeed trainer. It's much more stable and simpler.
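
For anyone curious, the bare DeepSpeed training loop looks roughly like this (a sketch with a placeholder model and dataset, not my exact code; it assumes launch via the deepspeed launcher):

import deepspeed
import torch
from torch.utils.data import DataLoader, TensorDataset

# Placeholder model and data, standing in for the real training setup.
model = torch.nn.Linear(784, 10)
dataloader = DataLoader(
    TensorDataset(torch.randn(64, 784), torch.randint(0, 10, (64,))),
    batch_size=8,
)

ds_config = {
    "train_micro_batch_size_per_gpu": 8,
    "gradient_accumulation_steps": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}

engine, optimizer, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(), config=ds_config
)

for x, y in dataloader:
    x, y = x.to(engine.device), y.to(engine.device)
    loss = torch.nn.functional.cross_entropy(engine(x), y)
    engine.backward(loss)
    engine.step()

# client_state is plain Python data here, so nothing unpicklable
# ever reaches torch.save inside the checkpoint engine.
engine.save_checkpoint("checkpoints/", client_state={"epoch": 0})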

alexanderswerdlow commented 1 year ago

Same issue here.

deepspeed: 0.9.2 torch: 2.0.0 lightning: 2.0.2

stale[bot] commented 1 year ago

This issue has been automatically marked as stale because it hasn't had any recent activity. This issue will be closed in 7 days if no further activity occurs. Thank you for your contributions - the Lightning Team!

someshfengde commented 1 year ago

same issue here

lainxx commented 1 year ago

same issue here

aced125 commented 1 year ago

same issue

chadHGY commented 8 months ago

Same issue.

deepspeed: 0.12.6 torch: 2.1.0+cu121 lightning: 2.1.3

jonmun commented 7 months ago

same issue

jwliu36 commented 3 months ago

Having the same issue when using DeepSpeedStrategy (https://github.com/Lightning-AI/pytorch-lightning/blob/master/src/lightning/pytorch/strategies/deepspeed.py):

train/0 [0]:  File "/home/jwliu/.conda/envs/cfms/lib/python3.9/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 389, in _save_checkpoint
train/0 [0]:    trainer.save_checkpoint(filepath, self.save_weights_only)
train/0 [0]:  File "/home/jwliu/.conda/envs/cfms/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1381, in save_checkpoint
train/0 [0]:    self.strategy.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
train/0 [0]:  File "/home/jwliu/.conda/envs/cfms/lib/python3.9/site-packages/pytorch_lightning/strategies/deepspeed.py", line 648, in save_checkpoint
train/0 [0]:    self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
train/0 [0]:  File "/home/jwliu/.conda/envs/cfms/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 3118, in save_checkpoint
train/0 [0]:    self._save_checkpoint(save_dir,
train/0 [0]:  File "/home/jwliu/.conda/envs/cfms/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 3337, in _save_checkpoint
train/0 [0]:    self.checkpoint_engine.save(state, save_path)
train/0 [0]:  File "/home/jwliu/.conda/envs/cfms/lib/python3.9/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 22, in save
train/0 [0]:    torch.save(state_dict, path)
train/0 [0]:  File "/home/jwliu/.conda/envs/cfms/lib/python3.9/site-packages/torch/serialization.py", line 629, in save
train/0 [0]:    _save(obj, opened_zipfile, pickle_module, pickle_protocol, _disable_byteorder_record)
train/0 [0]:  File "/home/jwliu/.conda/envs/cfms/lib/python3.9/site-packages/torch/serialization.py", line 841, in _save
train/0 [0]:    pickler.dump(obj)
train/0 [0]:TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object

I checked the checkpoint keys before and after, which makes sense: `state_dict` and `optimizer_states` are excluded, since DeepSpeed saves those itself.

train/0 [2]:Before: dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers', 'hparams_name', 'hyper_parameters', 'datamodule_hparams_name', 'datamodule_hyper_parameters'])

train/0 [2]:After: dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'loops', 'callbacks', 'lr_schedulers', 'hparams_name', 'hyper_parameters', 'datamodule_hparams_name', 'datamodule_hyper_parameters'])
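
One possible stopgap (an untested sketch, not a confirmed fix): prune unpicklable entries in the on_save_checkpoint hook before Lightning hands the dict to DeepSpeed. MyLitModule is a placeholder name; the warning Lightning prints about `save_hyperparameters(ignore=['net'])` suggests `hyper_parameters` is a likely carrier of the ProcessGroup.

import pickle

from pytorch_lightning import LightningModule

class MyLitModule(LightningModule):  # placeholder for your module
    def on_save_checkpoint(self, checkpoint: dict) -> None:
        # Drop hyper_parameters entries the pickler rejects, e.g. an
        # nn.Module captured by save_hyperparameters() that ends up
        # holding a ProcessGroup after DeepSpeed wraps the model.
        hparams = checkpoint.get("hyper_parameters", {})
        for key in list(hparams):
            try:
                pickle.dumps(hparams[key])
            except Exception:
                del hparams[key]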