Lightning-AI / pytorch-lightning

Pretrain, finetune ANY AI model of ANY size on multiple GPUs, TPUs with zero code changes.
https://lightning.ai
Apache License 2.0

The process freezes while loading model weights from a checkpoint when multiprocessing #15910

Closed. crazyn2 closed this issue 1 year ago

crazyn2 commented 1 year ago

### Bug description

The process freezes while loading model weights from a checkpoint when several training processes run in parallel.

### How to reproduce the bug

```python
import argparse
import os
import sys
import time
from datetime import datetime

import pytorch_lightning as pl
from pytorch_lightning import seed_everything

# Make the sibling packages importable when the script is run directly.
sys.path.append(os.getcwd())
sys.path.append(os.path.dirname(__file__))
from models import CIFAR10LeNetAutoencoder, CIFAR10LeNetSvdd
from datasets import CIFAR10DataModel

def cifar10_lenet(normal_class,
                  pre_epochs,
                  epochs,
                  seed,
                  log_path,
                  enable_progress_bar=False):
    seed_everything(seed, workers=True)
    # Give every run its own second-resolution timestamped log directory.
    log_path = log_path + datetime.now().strftime('%Y-%m-%d-%H%M%S')
    cifar10 = CIFAR10DataModel(batch_size=64, normal_class=normal_class)

    # Stage 1: pretrain the autoencoder.
    auto_enc = CIFAR10LeNetAutoencoder()
    trainer = pl.Trainer(accelerator="gpu",
                         devices=1,
                         default_root_dir=log_path,
                         max_epochs=pre_epochs,
                         enable_progress_bar=enable_progress_bar)
    trainer.fit(model=auto_enc, datamodule=cifar10)
    # No model is passed here, so Lightning restores the best checkpoint of
    # the preceding fit call (this triggers the UserWarning seen in the logs).
    trainer.test(datamodule=cifar10)

    # Stage 2: transfer the pretrained weights into the SVDD model and train it.
    at_enc_svdd = CIFAR10LeNetSvdd()
    at_enc_svdd.load_state_dict(auto_enc.state_dict())
    at_enc_svdd.init_center_c(auto_enc, cifar10.train_dataloader())
    trainer = pl.Trainer(accelerator="gpu",
                         devices=1,
                         default_root_dir=log_path,
                         max_epochs=epochs,
                         enable_progress_bar=enable_progress_bar)
    trainer.fit(model=at_enc_svdd, datamodule=cifar10)
    trainer.test(datamodule=cifar10)

if __name__ == '__main__':

    # perf_counter() measures wall-clock time; process_time() would only
    # count CPU time and under-report how long the run actually took.
    start_time = time.perf_counter()
    parser = argparse.ArgumentParser(description="Deep SVDD")
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--normal_class', type=int, default=0)
    parser.add_argument('--pre_epochs', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=0)
    parser.add_argument("--progress_bar", action="store_true")
    parser.add_argument("--log_path", type=str)
    args = parser.parse_args()
    cifar10_lenet(normal_class=args.normal_class,
                  pre_epochs=args.pre_epochs,
                  epochs=args.epochs,
                  seed=args.seed,
                  enable_progress_bar=args.progress_bar,
                  log_path=args.log_path)

    end_time = time.perf_counter()
    m, s = divmod(end_time - start_time, 60)
    h, m = divmod(m, 60)
    print("process took %02d:%02d:%02d" % (h, m, s))

```sh
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
seed=${2:-9}
n_epochs=2
pretrain_n_epochs=1
objective=pl_svdd/
log_dir=$SCRIPT_DIR/../bash-log/$objective
export CUDA_VISIBLE_DEVICES=0
for i in {0..4}; do
    {
        python $SCRIPT_DIR/../main/cifar10_lenet.py --seed $seed --pre_epochs $pretrain_n_epochs --epochs $n_epochs --normal_class $i --log_path $log_dir
    } &
    # Stagger the launches so the timestamped log directories differ.
    sleep 4
done

export CUDA_VISIBLE_DEVICES=1
for i in {5..9}; do
    {
        python $SCRIPT_DIR/../main/cifar10_lenet.py --seed $seed --pre_epochs $pretrain_n_epochs --epochs $n_epochs --normal_class $i --log_path $log_dir
    } &
    sleep 4
done

# Block until all ten background runs have finished.
wait
```
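The `sleep 4` between launches only works because `log_path` is timestamped to the second: two processes started within the same second would write into the same `default_root_dir`. A hedged alternative (my own sketch, not part of the original code, with a hypothetical `unique_log_path` helper) is to make the directory name unique regardless of start time:

```python
import os
from datetime import datetime

def unique_log_path(base, normal_class):
    # Timestamp plus normal class plus PID: unique even when two
    # processes start within the same second.
    stamp = datetime.now().strftime('%Y-%m-%d-%H%M%S')
    return f"{base}{stamp}-c{normal_class}-pid{os.getpid()}"
```

With a scheme like that, the staggered `sleep 4` calls would no longer be needed for uniqueness.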

### Error messages and logs

  | Name    | Type           | Params
--------------------------------------------
0 | encoder | CIFAR10Encoder | 520 K
1 | decoder | CIFAR10Decoder | 284 K
2 | mse     | MSELoss        | 0
--------------------------------------------
804 K     Trainable params
0         Non-trainable params
804 K     Total params
3.218     Total estimated model params size (MB)
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224939/lightning_logs
2022-12-05 22:49:41.124533: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable TF_ENABLE_ONEDNN_OPTS=0.
Trainer.fit stopped: max_epochs=1 reached.
/home/zby/anaconda3/envs/machine-learning/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:134: UserWarning: .test(ckpt_path=None) was called without a model. The best model of the previous fit call will be used. You can pass .test(ckpt_path='best') to use the best model or .test(ckpt_path='last') to use the last model. If you pass a value, this warning will be silenced.
  rank_zero_warn(
Restoring states from the checkpoint path at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224925/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
Loaded model weights from checkpoint at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224925/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt
Global seed set to 9
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224945/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name    | Type           | Params
--------------------------------------------
0 | encoder | CIFAR10Encoder | 520 K
1 | decoder | CIFAR10Decoder | 284 K
2 | mse     | MSELoss        | 0
--------------------------------------------
804 K     Trainable params
0         Non-trainable params
804 K     Total params
3.218     Total estimated model params size (MB)
Trainer.fit stopped: max_epochs=1 reached.
/home/zby/anaconda3/envs/machine-learning/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:134: UserWarning: .test(ckpt_path=None) was called without a model. The best model of the previous fit call will be used. You can pass .test(ckpt_path='best') to use the best model or .test(ckpt_path='last') to use the last model. If you pass a value, this warning will be silenced.
  rank_zero_warn(
Restoring states from the checkpoint path at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224930/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
Loaded model weights from checkpoint at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224930/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name    | Type           | Params
--------------------------------------------
0 | encoder | CIFAR10Encoder | 520 K
1 | decoder | CIFAR10Decoder | 284 K
2 | mse     | MSELoss        | 0
--------------------------------------------
804 K     Trainable params
0         Non-trainable params
804 K     Total params
3.218     Total estimated model params size (MB)
Trainer.fit stopped: max_epochs=1 reached.
/home/zby/anaconda3/envs/machine-learning/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:134: UserWarning: .test(ckpt_path=None) was called without a model. The best model of the previous fit call will be used. You can pass .test(ckpt_path='best') to use the best model or .test(ckpt_path='last') to use the last model. If you pass a value, this warning will be silenced.
  rank_zero_warn(
Restoring states from the checkpoint path at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224934/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
Loaded model weights from checkpoint at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224934/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt
Trainer.fit stopped: max_epochs=1 reached.
/home/zby/anaconda3/envs/machine-learning/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:134: UserWarning: .test(ckpt_path=None) was called without a model. The best model of the previous fit call will be used. You can pass .test(ckpt_path='best') to use the best model or .test(ckpt_path='last') to use the last model. If you pass a value, this warning will be silenced.
  rank_zero_warn(
Restoring states from the checkpoint path at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224939/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
Loaded model weights from checkpoint at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224939/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt
Trainer.fit stopped: max_epochs=1 reached.
/home/zby/anaconda3/envs/machine-learning/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:134: UserWarning: .test(ckpt_path=None) was called without a model. The best model of the previous fit call will be used. You can pass .test(ckpt_path='best') to use the best model or .test(ckpt_path='last') to use the last model. If you pass a value, this warning will be silenced.
  rank_zero_warn(
Restoring states from the checkpoint path at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224945/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
Loaded model weights from checkpoint at /home/zby/Workspaces/anomaly-detection/bash-log/pl_svdd/2022-12-05-224945/lightning_logs/version_0/checkpoints/epoch=0-step=78.ckpt


### Environment

### More info

No response

crazyn2 commented 1 year ago

It's my own fault, sorry.