[Bug] （ipr_res50_dsnt-8xb64-210e_coco-256x256.py）unable to train

WinnerMeat commented 1 year ago

Prerequisite

[X] I have searched Issues and Discussions but cannot get the expected help.
[X] The bug has not been fixed in the latest version(https://github.com/open-mmlab/mmpose).

Environment

OrderedDict([('sys.platform', 'win32'), ('Python', '3.8.16 | packaged by conda-forge | (default, Feb 1 2023, 15:53:35) [MSC v.1929 64 bit (AMD64)]'), ('CUDA available', True), ('numpy_random_seed', 2147483648), ('GPU 0,1', 'NVIDIA GeForce RTX 3090'), ('CUDA_HOME', 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6'), ('NVCC', 'Cuda compilation tools, release 11.6, V11.6.55'), ('MSVC', '用于 x64 的 Microsoft (R) C/C++ 优化编译器 19.29.30148 版'), ('GCC', 'n/a'), ('PyTorch', '1.13.1+cu116'), ('PyTorch compiling details', 'PyTorch built with:\n - C++ Version: 199711\n - MSVC 192829337\n - Intel(R) Math Kernel Library Version 2020.0.2 Product Build 20200624 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v2.6.0 (Git Hash 52b5f107dd9cf10910aaa19cb47f3abf9b349815)\n - OpenMP 2019\n - LAPACK is enabled (usually provided by MKL)\n - CPU capability usage: AVX2\n - CUDA Runtime 11.6\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=compute_37\n - CuDNN 8.3.2 (built against CUDA 11.5)\n - Magma 2.5.4\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.6, CUDNN_VERSION=8.3.2, CXX_COMPILER=C:/actions-runner/_work/pytorch/pytorch/builder/windows/tmp_bin/sccache-cl.exe, CXX_FLAGS=/DWIN32 /D_WINDOWS /GR /EHsc /w /bigobj -DUSE_PTHREADPOOL -openmp:experimental -IC:/actions-runner/_work/pytorch/pytorch/builder/windows/mkl/include -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DUSE_FBGEMM -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.13.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, 
USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=OFF, USE_OPENMP=ON, USE_ROCM=OFF, \n'), ('TorchVision', '0.14.1+cu116'), ('OpenCV', '4.7.0'), ('MMEngine', '0.7.3'), ('MMPose', '1.0.0+')])

Reproduces the problem - code sample

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os.path as osp

from mmengine.config import Config, DictAction
from mmengine.runner import Runner

def parse_args():
    """Parse the command-line options of this training script.

    Every option has a default, so the script can be launched with no
    arguments at all.

    Returns:
        argparse.Namespace: The parsed options.

    Side effects:
        Sets ``os.environ['LOCAL_RANK']`` to ``str(args.local_rank)`` when
        that environment variable is not already defined.
    """
    # Imported locally because the module header only has
    # ``import os.path as osp``, which binds ``osp`` but not ``os``; without
    # this line the ``os.environ`` accesses below raise ``NameError``.
    import os

    parser = argparse.ArgumentParser(description='Train a pose model')
    parser.add_argument(
        '--config',
        default='./configs/body_2d_keypoint/integral_regression/crowdpose/ipr_res50_dsnt-8xb64-210e_coco-256x256.py',
        help='train config file path')
    parser.add_argument(
        '--work-dir',
        default='./DSNT_crowdpose/',
        help='the dir to save logs and models')
    # ``--resume`` without a value yields 'auto'; omitting it yields None.
    parser.add_argument(
        '--resume',
        nargs='?',
        type=str,
        const='auto',
        help='If specify checkpint path, resume from it, while if not '
        'specify, try to auto resume from the latest checkpoint '
        'in the work directory.')
    parser.add_argument(
        '--amp',
        action='store_true',
        default=False,
        help='enable automatic-mixed-precision training')
    parser.add_argument(
        '--no-validate',
        default=False,
        action='store_true',
        help='whether not to evaluate the checkpoint during training')
    # NOTE: ``store_true`` combined with ``default=True`` means this option
    # is True whether or not the flag is passed on the command line.
    parser.add_argument(
        '--auto-scale-lr',
        default=True,
        action='store_true',
        help='whether to auto scale the learning rate according to the '
        'actual batch size and the original batch size.')
    parser.add_argument(
        '--show-dir',
        help='directory where the visualization images will be saved.')
    parser.add_argument(
        '--show',
        default=False,
        action='store_true',
        help='whether to display the prediction results in a window.')
    parser.add_argument(
        '--interval',
        type=int,
        default=1,
        help='visualize per interval samples.')
    parser.add_argument(
        '--wait-time',
        type=float,
        default=1,
        help='display time of every window. (second)')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
    # will pass the `--local-rank` parameter to `tools/train.py` instead
    # of `--local_rank`.
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    # Only publish the rank when the environment has not already set it.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args
def merge_args(cfg, args):
    """Fold the parsed command-line options into the loaded config.

    The config object is modified in place and also returned.

    Args:
        cfg: Config object supporting attribute access, ``get`` and
            ``merge_from_dict``.
        args: Namespace of parsed command-line options.

    Returns:
        The same ``cfg`` object, after all overrides have been applied.
    """
    # Clearing all three validation entries when validation is not wanted.
    if args.no_validate:
        cfg.val_cfg = cfg.val_dataloader = cfg.val_evaluator = None

    cfg.launcher = args.launcher

    # Work-dir precedence: the CLI value, then the value already in the
    # config, then a directory named after the config file.
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        config_stem, _ = osp.splitext(osp.basename(args.config))
        cfg.work_dir = osp.join('./work_dirs', config_stem)

    # Automatic mixed precision: switch the optimizer wrapper type.
    if args.amp is True:
        wrapper_type = cfg.optim_wrapper.get('type', 'OptimWrapper')
        assert wrapper_type in ('OptimWrapper', 'AmpOptimWrapper'), \
            '`--amp` is not supported custom optimizer wrapper type ' \
            f'`{wrapper_type}.'
        cfg.optim_wrapper.type = 'AmpOptimWrapper'
        cfg.optim_wrapper.setdefault('loss_scale', 'dynamic')

    # Resuming: 'auto' leaves the checkpoint unset, any other value is
    # used as the checkpoint to load.
    if args.resume is not None:
        cfg.resume = True
        cfg.load_from = None if args.resume == 'auto' else args.resume

    # Learning-rate auto scaling.
    if args.auto_scale_lr:
        cfg.auto_scale_lr.enable = True

    # Visualization is requested by either --show or --show-dir.
    wants_visualization = args.show or (args.show_dir is not None)
    if wants_visualization:
        assert 'visualization' in cfg.default_hooks, \
            'PoseVisualizationHook is not set in the ' \
            '`default_hooks` field of config. Please set ' \
            '`visualization=dict(type="PoseVisualizationHook")`'

        vis_hook = cfg.default_hooks.visualization
        vis_hook.enable = True
        vis_hook.show = args.show
        if args.show:
            vis_hook.wait_time = args.wait_time
        vis_hook.out_dir = args.show_dir
        vis_hook.interval = args.interval

    # Free-form overrides are applied last.
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    return cfg

def main():
    """Load the config, apply hard-coded CrowdPose overrides, and train.

    Steps, in order: parse the CLI arguments, load the config file named by
    ``args.config``, overwrite the schedule, dataset paths and batch sizes
    in place, merge the CLI arguments through ``merge_args``, build a
    ``Runner`` from the resulting config and call its ``train`` method.
    """
    args = parse_args()

    # load config
    cfg = Config.fromfile(args.config)

    # Override the training schedule from the config file.
    cfg.train_cfg.max_epochs = 100
    cfg.train_cfg.val_interval = 1

    # Point both dataloaders at the local CrowdPose copy.
    cfg.train_dataloader.dataset.data_root = '../data_set/crowdpose/'
    cfg.val_dataloader.dataset.data_root = '../data_set/crowdpose/'

    # Dataset ann_file values are written without the data_root prefix,
    # while the evaluator below is given the full relative path.
    cfg.train_dataloader.dataset.ann_file = 'annotations/mmpose_crowdpose_train.json'
    cfg.val_dataloader.dataset.ann_file = 'annotations/mmpose_crowdpose_test.json'
    cfg.val_evaluator.ann_file = '../data_set/crowdpose/annotations/mmpose_crowdpose_test.json'

    cfg.val_dataloader.dataset.bbox_file = '../data_set/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json'

    # Reuse the validation settings for testing.
    # NOTE(review): the config printed later in this report shows
    # test_dataloader.batch_size=32 while val_dataloader.batch_size=2, so
    # this assignment appears to copy rather than alias -- the batch-size
    # override below does not reach test_dataloader. Confirm if that matters.
    cfg.test_evaluator = cfg.val_evaluator
    cfg.test_dataloader = cfg.val_dataloader

    # Per-process batch sizes used for this reproduction.
    cfg.train_dataloader.batch_size = 2
    cfg.val_dataloader.batch_size = 2
    # merge CLI arguments to config
    cfg = merge_args(cfg, args)

    # set preprocess configs to model
    if 'preprocess_cfg' in cfg:
        cfg.model.setdefault('data_preprocessor',
                             cfg.get('preprocess_cfg', {}))

    # build the runner from config
    runner = Runner.from_cfg(cfg)

    # start training
    runner.train()

# Run training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Reproduces the problem - command or script

python tools/train.py

Reproduces the problem - error message

06/13 10:32:53 - mmengine - WARNING - "FileClient" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io
06/13 10:32:53 - mmengine - WARNING - "HardDiskBackend" is the alias of "LocalBackend" and the former will be deprecated in future.
06/13 10:32:53 - mmengine - INFO - Checkpoints will be saved to D:\userDocument\YS220854100371\mmpose\myresult\ipr_DSNT_crowdpose.
06/13 10:33:09 - mmengine - INFO - Epoch(train)   [1][  50/9001]  lr: 1.935512e-07  eta: 3 days, 6:49:19  time: 0.315271  data_time: 0.154798  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:13 - mmengine - INFO - Epoch(train)   [1][ 100/9001]  lr: 3.890594e-07  eta: 2 days, 1:35:11  time: 0.081421  data_time: 0.001673  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:17 - mmengine - INFO - Epoch(train)   [1][ 150/9001]  lr: 5.845676e-07  eta: 1 day, 15:36:11  time: 0.078574  data_time: 0.002390  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:21 - mmengine - INFO - Epoch(train)   [1][ 200/9001]  lr: 7.800758e-07  eta: 1 day, 10:40:37  time: 0.079629  data_time: 0.002648  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:25 - mmengine - INFO - Epoch(train)   [1][ 250/9001]  lr: 9.755840e-07  eta: 1 day, 7:29:52  time: 0.075169  data_time: 0.001852  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:29 - mmengine - INFO - Epoch(train)   [1][ 300/9001]  lr: 1.171092e-06  eta: 1 day, 5:27:31  time: 0.077100  data_time: 0.002310  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:33 - mmengine - INFO - Epoch(train)   [1][ 350/9001]  lr: 1.366600e-06  eta: 1 day, 3:59:55  time: 0.077021  data_time: 0.002031  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:36 - mmengine - INFO - Epoch(train)   [1][ 400/9001]  lr: 1.562109e-06  eta: 1 day, 2:51:16  time: 0.075448  data_time: 0.002011  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:40 - mmengine - INFO - Epoch(train)   [1][ 450/9001]  lr: 1.757617e-06  eta: 1 day, 2:03:39  time: 0.078932  data_time: 0.002589  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:44 - mmengine - INFO - Epoch(train)   [1][ 500/9001]  lr: 1.953125e-06  eta: 1 day, 1:23:08  time: 0.077319  data_time: 0.002250  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:48 - mmengine - INFO - Epoch(train)   [1][ 550/9001]  lr: 1.953125e-06  eta: 1 day, 0:48:40  time: 0.076364  data_time: 0.002031  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:52 - mmengine - INFO - Epoch(train)   [1][ 600/9001]  lr: 1.953125e-06  eta: 1 day, 0:20:38  time: 0.076921  data_time: 0.001932  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:33:56 - mmengine - INFO - Epoch(train)   [1][ 650/9001]  lr: 1.953125e-06  eta: 23:58:41  time: 0.078454  data_time: 0.003405  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:00 - mmengine - INFO - Epoch(train)   [1][ 700/9001]  lr: 1.953125e-06  eta: 23:36:21  time: 0.075189  data_time: 0.002748  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:03 - mmengine - INFO - Epoch(train)   [1][ 750/9001]  lr: 1.953125e-06  eta: 23:19:07  time: 0.077319  data_time: 0.001832  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:07 - mmengine - INFO - Epoch(train)   [1][ 800/9001]  lr: 1.953125e-06  eta: 23:03:20  time: 0.076583  data_time: 0.002190  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:11 - mmengine - INFO - Epoch(train)   [1][ 850/9001]  lr: 1.953125e-06  eta: 22:49:38  time: 0.076841  data_time: 0.001892  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:15 - mmengine - INFO - Epoch(train)   [1][ 900/9001]  lr: 1.953125e-06  eta: 22:38:16  time: 0.077817  data_time: 0.002071  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:19 - mmengine - INFO - Epoch(train)   [1][ 950/9001]  lr: 1.953125e-06  eta: 22:27:17  time: 0.076802  data_time: 0.004062  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:23 - mmengine - INFO - Exp name: ipr_res50_dsnt-8xb64-210e_coco-256x256_20230613_103233
06/13 10:34:23 - mmengine - INFO - Epoch(train)   [1][1000/9001]  lr: 1.953125e-06  eta: 22:15:47  time: 0.074671  data_time: 0.002628  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:26 - mmengine - INFO - Epoch(train)   [1][1050/9001]  lr: 1.953125e-06  eta: 22:08:07  time: 0.078494  data_time: 0.001932  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:30 - mmengine - INFO - Epoch(train)   [1][1100/9001]  lr: 1.953125e-06  eta: 22:01:26  time: 0.078932  data_time: 0.002310  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:35 - mmengine - INFO - Epoch(train)   [1][1150/9001]  lr: 1.953125e-06  eta: 21:59:03  time: 0.084647  data_time: 0.002489  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:39 - mmengine - INFO - Epoch(train)   [1][1200/9001]  lr: 1.953125e-06  eta: 21:53:41  time: 0.079569  data_time: 0.003127  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:43 - mmengine - INFO - Epoch(train)   [1][1250/9001]  lr: 1.953125e-06  eta: 21:48:53  time: 0.079788  data_time: 0.002071  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:47 - mmengine - INFO - Epoch(train)   [1][1300/9001]  lr: 1.953125e-06  eta: 21:48:25  time: 0.086698  data_time: 0.003086  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:51 - mmengine - INFO - Epoch(train)   [1][1350/9001]  lr: 1.953125e-06  eta: 21:43:03  time: 0.077797  data_time: 0.001971  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:55 - mmengine - INFO - Epoch(train)   [1][1400/9001]  lr: 1.953125e-06  eta: 21:36:10  time: 0.074253  data_time: 0.002031  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:34:58 - mmengine - INFO - Epoch(train)   [1][1450/9001]  lr: 1.953125e-06  eta: 21:30:53  time: 0.076443  data_time: 0.002350  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:02 - mmengine - INFO - Epoch(train)   [1][1500/9001]  lr: 1.953125e-06  eta: 21:25:26  time: 0.075428  data_time: 0.002091  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:06 - mmengine - INFO - Epoch(train)   [1][1550/9001]  lr: 1.953125e-06  eta: 21:20:54  time: 0.076602  data_time: 0.002688  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:10 - mmengine - INFO - Epoch(train)   [1][1600/9001]  lr: 1.953125e-06  eta: 21:16:58  time: 0.077260  data_time: 0.001872  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:14 - mmengine - INFO - Epoch(train)   [1][1650/9001]  lr: 1.953125e-06  eta: 21:13:51  time: 0.078554  data_time: 0.002409  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:18 - mmengine - INFO - Epoch(train)   [1][1700/9001]  lr: 1.953125e-06  eta: 21:10:00  time: 0.076503  data_time: 0.002370  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:21 - mmengine - INFO - Epoch(train)   [1][1750/9001]  lr: 1.953125e-06  eta: 21:06:19  time: 0.076364  data_time: 0.003007  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:26 - mmengine - INFO - Epoch(train)   [1][1800/9001]  lr: 1.953125e-06  eta: 21:05:31  time: 0.082815  data_time: 0.002509  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:29 - mmengine - INFO - Epoch(train)   [1][1850/9001]  lr: 1.953125e-06  eta: 21:01:44  time: 0.075348  data_time: 0.002131  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:33 - mmengine - INFO - Epoch(train)   [1][1900/9001]  lr: 1.953125e-06  eta: 20:59:07  time: 0.077857  data_time: 0.002230  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:37 - mmengine - INFO - Epoch(train)   [1][1950/9001]  lr: 1.953125e-06  eta: 20:56:53  time: 0.078454  data_time: 0.002210  memory: 805  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/13 10:35:41 - mmengine - INFO - Exp name: ipr_res50_dsnt-8xb64-210e_coco-256x256_20230613_103233

Additional information

I tried to train the model (configs/body_2d_keypoint/integral_regression/crowdpose/ipr_res50_dsnt-8xb64-210e_coco-256x256.py) on the CrowdPose dataset, but the training loss was NaN at every logged iteration. Training the same model on the COCO2017 dataset gives the same result. While looking for the cause, I found that after the first gradient update all of the model's weights are NaN. I could not find a solution to this problem, so I am filing this bug report in the hope that it can be answered.

WinnerMeat commented 1 year ago

06/14 10:13:13 - mmengine - INFO - 
------------------------------------------------------------
System environment:
    sys.platform: win32
    Python: 3.8.16 | packaged by conda-forge | (default, Feb  1 2023, 15:53:35) [MSC v.1929 64 bit (AMD64)]
    CUDA available: True
    numpy_random_seed: 1034294422
    GPU 0,1: NVIDIA GeForce RTX 3090
    CUDA_HOME: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6
    NVCC: Cuda compilation tools, release 11.6, V11.6.55
    MSVC: 用于 x64 的 Microsoft (R) C/C++ 优化编译器 19.29.30148 版
    GCC: n/a
    PyTorch: 1.13.1+cu116
    PyTorch compiling details: PyTorch built with:
  - C++ Version: 199711
  - MSVC 192829337
  - Intel(R) Math Kernel Library Version 2020.0.2 Product Build 20200624 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v2.6.0 (Git Hash 52b5f107dd9cf10910aaa19cb47f3abf9b349815)
  - OpenMP 2019
  - LAPACK is enabled (usually provided by MKL)
  - CPU capability usage: AVX2
  - CUDA Runtime 11.6
  - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=compute_37
  - CuDNN 8.3.2  (built against CUDA 11.5)
  - Magma 2.5.4
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.6, CUDNN_VERSION=8.3.2, CXX_COMPILER=C:/actions-runner/_work/pytorch/pytorch/builder/windows/tmp_bin/sccache-cl.exe, CXX_FLAGS=/DWIN32 /D_WINDOWS /GR /EHsc /w /bigobj -DUSE_PTHREADPOOL -openmp:experimental -IC:/actions-runner/_work/pytorch/pytorch/builder/windows/mkl/include -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DUSE_FBGEMM -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.13.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=OFF, USE_OPENMP=ON, USE_ROCM=OFF, 

    TorchVision: 0.14.1+cu116
    OpenCV: 4.7.0
    MMEngine: 0.7.3

Runtime environment:
    cudnn_benchmark: False
    mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}
    dist_cfg: {'backend': 'nccl'}
    seed: None
    Distributed launcher: none
    Distributed training: False
    GPU number: 1
------------------------------------------------------------

06/14 10:13:14 - mmengine - INFO - Config:
default_scope = 'mmpose'
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(
        type='CheckpointHook',
        interval=10,
        save_best='crowdpose/AP',
        rule='greater'),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='PoseVisualizationHook', enable=False))
custom_hooks = [dict(type='SyncBuffersHook')]
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='PoseLocalVisualizer',
    vis_backends=[dict(type='LocalVisBackend')],
    name='visualizer')
log_processor = dict(
    type='LogProcessor', window_size=50, by_epoch=True, num_digits=6)
log_level = 'INFO'
load_from = None
resume = False
backend_args = dict(backend='local')
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
val_cfg = dict()
test_cfg = dict()
optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.0005))
param_scheduler = [
    dict(
        type='LinearLR', begin=0, end=500, start_factor=0.001, by_epoch=False),
    dict(
        type='MultiStepLR',
        begin=0,
        end=100,
        milestones=[170, 200],
        gamma=0.1,
        by_epoch=True)
]
auto_scale_lr = dict(base_batch_size=512, enable=True)
codec = dict(
    type='RegressionLabel',
    input_size=(256, 256),
    heatmap_size=(64, 64),
    sigma=2.0,
    normalize=True)
model = dict(
    type='TopdownPoseEstimator',
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True),
    backbone=dict(type='ResNet', depth=50),
    head=dict(
        type='DSNTHead',
        in_channels=2048,
        in_featuremap_size=(8, 8),
        num_joints=14,
        loss=dict(
            type='MultipleLossWrapper',
            losses=[
                dict(type='SmoothL1Loss', use_target_weight=True),
                dict(type='JSDiscretLoss', use_target_weight=True)
            ]),
        decoder=dict(
            type='RegressionLabel',
            input_size=(256, 256),
            heatmap_size=(64, 64),
            sigma=2.0,
            normalize=True)),
    test_cfg=dict(flip_test=True, shift_coords=True, shift_heatmap=True),
    init_cfg=dict(
        type='Pretrained',
        checkpoint=
        'https://download.openmmlab.com/mmpose/pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth'
    ))
dataset_type = 'CrowdPoseDataset'
data_mode = 'topdown'
data_root = 'data/crowdpose/'
train_pipeline = [
    dict(type='LoadImage'),
    dict(type='GetBBoxCenterScale'),
    dict(type='RandomFlip', direction='horizontal'),
    dict(type='RandomHalfBody'),
    dict(type='RandomBBoxTransform'),
    dict(type='TopdownAffine', input_size=(256, 256)),
    dict(
        type='GenerateTarget',
        encoder=dict(
            type='RegressionLabel',
            input_size=(256, 256),
            heatmap_size=(64, 64),
            sigma=2.0,
            normalize=True)),
    dict(type='PackPoseInputs')
]
test_pipeline = [
    dict(type='LoadImage'),
    dict(type='GetBBoxCenterScale'),
    dict(type='TopdownAffine', input_size=(256, 256)),
    dict(type='PackPoseInputs')
]
train_dataloader = dict(
    batch_size=2,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='CrowdPoseDataset',
        data_root='../data_set/crowdpose/',
        data_mode='topdown',
        ann_file='annotations/mmpose_crowdpose_train.json',
        data_prefix=dict(img='images/'),
        pipeline=[
            dict(type='LoadImage'),
            dict(type='GetBBoxCenterScale'),
            dict(type='RandomFlip', direction='horizontal'),
            dict(type='RandomHalfBody'),
            dict(type='RandomBBoxTransform'),
            dict(type='TopdownAffine', input_size=(256, 256)),
            dict(
                type='GenerateTarget',
                encoder=dict(
                    type='RegressionLabel',
                    input_size=(256, 256),
                    heatmap_size=(64, 64),
                    sigma=2.0,
                    normalize=True)),
            dict(type='PackPoseInputs')
        ]))
val_dataloader = dict(
    batch_size=2,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='CrowdPoseDataset',
        data_root='../data_set/crowdpose/',
        data_mode='topdown',
        ann_file='annotations/mmpose_crowdpose_test.json',
        bbox_file=
        '../data_set/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
        data_prefix=dict(img='images/'),
        test_mode=True,
        pipeline=[
            dict(type='LoadImage'),
            dict(type='GetBBoxCenterScale'),
            dict(type='TopdownAffine', input_size=(256, 256)),
            dict(type='PackPoseInputs')
        ]))
test_dataloader = dict(
    batch_size=32,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='CrowdPoseDataset',
        data_root='../data_set/crowdpose/',
        data_mode='topdown',
        ann_file='annotations/mmpose_crowdpose_test.json',
        bbox_file=
        '../data_set/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
        data_prefix=dict(img='images/'),
        test_mode=True,
        pipeline=[
            dict(type='LoadImage'),
            dict(type='GetBBoxCenterScale'),
            dict(type='TopdownAffine', input_size=(256, 256)),
            dict(type='PackPoseInputs')
        ]))
val_evaluator = dict(
    type='CocoMetric',
    ann_file='../data_set/crowdpose/annotations/mmpose_crowdpose_test.json')
test_evaluator = dict(
    type='CocoMetric',
    ann_file='../data_set/crowdpose/annotations/mmpose_crowdpose_test.json')
launcher = 'none'
work_dir = './myresult/ipr_DSNT_crowdpose/'

Ben-Louis commented 1 year ago

Since you are using a very small batch size, we suggest lowering the learning rate accordingly. Also note that a small batch size makes the BN layers unstable.

WinnerMeat commented 1 year ago

@Ben-Louis I followed your suggestion and tried a larger batch size (batch_size=64) with a reasonable learning rate, but I still got the same result.

06/25 16:44:25 - mmengine - INFO - Epoch(train)   [1][ 50/282]  lr: 6.193637e-06  eta: 5:18:16  time: 0.678371  data_time: 0.236657  memory: 8585  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/25 16:44:47 - mmengine - INFO - Epoch(train)   [1][100/282]  lr: 1.244990e-05  eta: 4:22:11  time: 0.441295  data_time: 0.075030  memory: 8585  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/25 16:45:08 - mmengine - INFO - Epoch(train)   [1][150/282]  lr: 1.870616e-05  eta: 4:00:07  time: 0.421244  data_time: 0.040561  memory: 8585  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/25 16:45:29 - mmengine - INFO - Epoch(train)   [1][200/282]  lr: 2.496242e-05  eta: 3:48:34  time: 0.418337  data_time: 0.027817  memory: 8585  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/25 16:45:50 - mmengine - INFO - Epoch(train)   [1][250/282]  lr: 3.121869e-05  eta: 3:42:06  time: 0.424828  data_time: 0.046276  memory: 8585  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/25 16:46:03 - mmengine - INFO - Exp name: ipr_res50_dsnt-8xb64-210e_coco-256x256_20230625_164329
06/25 16:46:20 - mmengine - INFO - Epoch(val)   [1][  50/1008]    eta: 0:05:20  time: 0.334142  data_time: 0.205168  memory: 8585  
06/25 16:48:56 - mmengine - INFO - Epoch(val)   [1][1000/1008]    eta: 0:00:01  time: 0.168518  data_time: 0.047292  memory: 1005  
06/25 16:49:00 - mmengine - INFO - Evaluating CocoMetric...
Loading and preparing results...
DONE (t=0.20s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *keypoints_crowd*
DONE (t=2.57s).
Accumulating evaluation results...
DONE (t=0.08s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets= 20 ] =  0.277
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets= 20 ] =  0.277
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets= 20 ] =  0.277
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 20 ] =  0.272
 Average Recall     (AR) @[ IoU=0.50      | area=   all | maxDets= 20 ] =  0.272
 Average Recall     (AR) @[ IoU=0.75      | area=   all | maxDets= 20 ] =  0.272
 Average Precision  (AP) @[ IoU=0.50:0.95 | type=  easy | maxDets= 20 ] = 0.416
 Average Precision  (AP) @[ IoU=0.50:0.95 | type=medium | maxDets= 20 ] = 0.257
 Average Precision  (AP) @[ IoU=0.50:0.95 | type=  hard | maxDets= 20 ] = 0.247
06/25 16:49:06 - mmengine - INFO - Epoch(val) [1][1008/1008]    crowdpose/AP: 0.277228  crowdpose/AP .5: 0.277228  crowdpose/AP .75: 0.277228  crowdpose/AR: 0.272014  crowdpose/AR .5: 0.272014  crowdpose/AR .75: 0.272014  crowdpose/AP(E): 0.415800  crowdpose/AP(M): 0.257400  crowdpose/AP(H): 0.247500  data_time: 0.048809  time: 0.172888
06/25 16:49:09 - mmengine - INFO - The best checkpoint with 0.2772 crowdpose/AP at 1 epoch is saved to best_crowdpose_AP_epoch_1.pth.
06/25 16:49:31 - mmengine - INFO - Epoch(train)   [2][ 50/282]  lr: 4.147896e-05  eta: 3:35:30  time: 0.440419  data_time: 0.074551  memory: 8585  loss: nan  loss_kpt: nan  acc_pose: 0.000000
06/25 16:49:52 - mmengine - INFO - Epoch(train)   [2][100/282]  lr: 4.773522e-05  eta: 3:32:49  time: 0.426083  data_time: 0.047690  memory: 8585  loss: nan  loss_kpt: nan  acc_pose: 0.000000

Ben-Louis commented 1 year ago

Sorry, but I tried DSNT on CrowdPose with batch size 64 on both 1 GPU and 8 GPUs, and the training loss was normal. Did you modify any code?

WinnerMeat commented 1 year ago

@Ben-Louis OrderedDict([('sys.platform', 'win32'), ('Python', '3.8.16 (default, Jun 12 2023, 21:00:42) [MSC v.1916 64 bit (AMD64)]'), ('CUDA available', True), ('numpy_random_seed', 2147483648), ('GPU 0,1', 'NVIDIA GeForce RTX 3080'), ('CUDA_HOME', 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4'), ('NVCC', 'Cuda compilation tools, release 11.4, V11.4.120'), ('MSVC', '用于 x64 的 Microsoft (R) C/C++ 优化编译器 19.33.31630 版'), ('GCC', 'n/a'), ('PyTorch', '1.13.1+cu116'), ('PyTorch compiling details', 'PyTorch built with:\n - C++ Version: 199711\n - MSVC 192829337\n - Intel(R) Math Kernel Library Version 2020.0.2 Product Build 20200624 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v2.6.0 (Git Hash 52b5f107dd9cf10910aaa19cb47f3abf9b349815)\n - OpenMP 2019\n - LAPACK is enabled (usually provided by MKL)\n - CPU capability usage: AVX2\n - CUDA Runtime 11.6\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=compute_37\n - CuDNN 8.3.2 (built against CUDA 11.5)\n - Magma 2.5.4\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.6, CUDNN_VERSION=8.3.2, CXX_COMPILER=C:/actions-runner/_work/pytorch/pytorch/builder/windows/tmp_bin/sccache-cl.exe, CXX_FLAGS=/DWIN32 /D_WINDOWS /GR /EHsc /w /bigobj -DUSE_PTHREADPOOL -openmp:experimental -IC:/actions-runner/_work/pytorch/pytorch/builder/windows/mkl/include -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DUSE_FBGEMM -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.13.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, 
USE_NCCL=OFF, USE_NNPACK=OFF, USE_OPENMP=ON, USE_ROCM=OFF, \n'), ('TorchVision', '0.14.1+cu116'), ('OpenCV', '4.7.0'), ('MMEngine', '0.7.4'), ('MMPose', '1.0.0+unknown')])

_base_ = ['../../../_base_/default_runtime.py']

# Runtime: train for at most 100 epochs with val_interval=1.
train_cfg = {'max_epochs': 100, 'val_interval': 1}

# Optimizer: Adam with a base learning rate of 5e-4.
optim_wrapper = {
    'optimizer': {
        'type': 'Adam',
        'lr': 5e-4,
    },
}

# Learning-rate policy: a linear warm-up followed by a multi-step schedule.
param_scheduler = [
    # Warm-up: iteration-based (by_epoch=False), covering iterations 0-500
    # and starting from a factor of 0.001.
    {
        'type': 'LinearLR',
        'begin': 0,
        'end': 500,
        'start_factor': 0.001,
        'by_epoch': False,
    },
    # Epoch-based step schedule (gamma=0.1) ending at max_epochs.
    # NOTE(review): both milestones (170, 200) are larger than
    # max_epochs (100), so presumably no decay step is ever reached in this
    # run -- confirm whether that is intended.
    {
        'type': 'MultiStepLR',
        'begin': 0,
        'end': train_cfg['max_epochs'],
        'milestones': [170, 200],
        'gamma': 0.1,
        'by_epoch': True,
    },
]

# Reference batch size for automatically scaling the LR to the actual
# training batch size.
auto_scale_lr = {'base_batch_size': 512}

# Codec settings: integral-regression labels for 256x256 inputs and 64x64
# heatmaps. The same dict object is handed to the head as its decoder.
codec = {
    'type': 'IntegralRegressionLabel',
    'input_size': (256, 256),
    'heatmap_size': (64, 64),
    'sigma': 2.0,
    'normalize': True,
}

# Model settings: top-down pose estimator with a ResNet (depth 50) backbone
# and a DSNT head predicting 14 joints.
model = {
    'type': 'TopdownPoseEstimator',
    # Input normalisation; images are converted from BGR to RGB.
    'data_preprocessor': {
        'type': 'PoseDataPreprocessor',
        'mean': [123.675, 116.28, 103.53],
        'std': [58.395, 57.12, 57.375],
        'bgr_to_rgb': True,
    },
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
    },
    'head': {
        'type': 'DSNTHead',
        'in_channels': 2048,
        'in_featuremap_size': (8, 8),
        'num_joints': 14,
        # Two losses combined through a wrapper; both use target weights.
        'loss': {
            'type': 'MultipleLossWrapper',
            'losses': [
                {'type': 'SmoothL1Loss', 'use_target_weight': True},
                {'type': 'JSDiscretLoss', 'use_target_weight': True},
            ],
        },
        'decoder': codec,
    },
    # Test-time options: flip testing with coordinate/heatmap shifting.
    'test_cfg': {
        'flip_test': True,
        'shift_coords': True,
        'shift_heatmap': True,
    },
    # Initialise from a pretrained checkpoint referenced by URL.
    'init_cfg': {
        'type': 'Pretrained',
        'checkpoint': 'https://download.openmmlab.com/mmpose/'
        'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth',
    },
}

# Base dataset settings: dataset class, data mode and the dataset root
# directory (a relative path).
dataset_type = 'CrowdPoseDataset'
data_mode = 'topdown'
data_root = '../data_set/crowdpose/'

# Training pipeline: load, compute bbox center/scale, apply the random
# flip / half-body / bbox transforms, warp to the codec input size,
# generate targets with the codec and pack the inputs.
train_pipeline = [
    {'type': 'LoadImage'},
    {'type': 'GetBBoxCenterScale'},
    {'type': 'RandomFlip', 'direction': 'horizontal'},
    {'type': 'RandomHalfBody'},
    {'type': 'RandomBBoxTransform'},
    {'type': 'TopdownAffine', 'input_size': codec['input_size']},
    {'type': 'GenerateTarget', 'encoder': codec},
    {'type': 'PackPoseInputs'},
]

# Test pipeline: the same steps without the random transforms and without
# target generation.
test_pipeline = [
    {'type': 'LoadImage'},
    {'type': 'GetBBoxCenterScale'},
    {'type': 'TopdownAffine', 'input_size': codec['input_size']},
    {'type': 'PackPoseInputs'},
]

# Data loaders.
# Training: batches of 64, shuffled, reading the CrowdPose train annotations.
train_dataloader = {
    'batch_size': 64,
    'num_workers': 2,
    'persistent_workers': True,
    'sampler': {'type': 'DefaultSampler', 'shuffle': True},
    'dataset': {
        'type': dataset_type,
        'data_root': data_root,
        'data_mode': data_mode,
        'ann_file': 'annotations/mmpose_crowdpose_train.json',
        'data_prefix': {'img': 'images/'},
        'pipeline': train_pipeline,
    },
}

# Validation: batches of 32, unshuffled, no rounding up and no dropping of
# the last batch; uses the test annotations plus a detection bbox file.
val_dataloader = {
    'batch_size': 32,
    'num_workers': 2,
    'persistent_workers': True,
    'drop_last': False,
    'sampler': {
        'type': 'DefaultSampler',
        'shuffle': False,
        'round_up': False,
    },
    'dataset': {
        'type': dataset_type,
        'data_root': data_root,
        'data_mode': data_mode,
        'ann_file': 'annotations/mmpose_crowdpose_test.json',
        'bbox_file': '../data_set/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
        'data_prefix': {'img': 'images/'},
        'test_mode': True,
        'pipeline': test_pipeline,
    },
}

# Testing reuses the validation loader (same dict object).
test_dataloader = val_dataloader

# Hooks: keep the best checkpoint according to 'crowdpose/AP', where a
# greater value is better.
default_hooks = {
    'checkpoint': {
        'save_best': 'crowdpose/AP',
        'rule': 'greater',
    },
}

# Evaluators: CocoMetric on the CrowdPose test annotations with the
# 'keypoints_crowd' IoU type; results are reported under the 'crowdpose'
# prefix.
val_evaluator = {
    'type': 'CocoMetric',
    'ann_file': '../data_set/crowdpose/annotations/mmpose_crowdpose_test.json',
    'use_area': False,
    'iou_type': 'keypoints_crowd',
    'prefix': 'crowdpose',
}

# Testing reuses the validation evaluator (same dict object).
test_evaluator = val_evaluator

I didn't change the code. I assumed it was a machine problem, but when I tested it again on a new machine,
I got the same result. The above are my machine environment and configuration file.
I'm not sure if there is another workaround.

Ben-Louis commented 1 year ago

Maybe you could try to replace 'annotations/mmpose_crowdpose_train.json' with 'annotations/mmpose_crowdpose_trainval.json'? We generally use the latter to train models on CrowdPose

open-mmlab / mmpose