open-mmlab / mmdetection3d

OpenMMLab's next-generation platform for general 3D object detection.
https://mmdetection3d.readthedocs.io/en/latest/
Apache License 2.0
5k stars 1.49k forks source link

[Bug] GPU memory usage is normal, but GPU-Util is 0 on one of the GPUs and the model can't train #2954

Open xiongda777 opened 2 months ago

xiongda777 commented 2 months ago

Prerequisite

Task

I have modified the scripts/configs, or I'm working on my own tasks/models/datasets.

Branch

main branch https://github.com/open-mmlab/mmdetection3d

Environment

none

Reproduces the problem - code sample

import argparse
import copy
import os
import random
import time

import numpy as np
import torch
from mmcv import Config
from torchpack import distributed as dist
from torchpack.environ import auto_set_run_dir, set_run_dir
from torchpack.utils.config import configs

from mmdet3d.apis import train_model
from mmdet3d.datasets import build_dataset
from mmdet3d.models import build_model
from mmdet3d.utils import get_root_logger, convert_sync_batchnorm, recursive_eval

def main():
    """Launch distributed training from a config file given on the command line.

    Command-line arguments:
        config: path to the config file (positional, required).
        --run-dir: run directory; when omitted, one is chosen by
            ``auto_set_run_dir()``.

    Steps: initialise the distributed backend, load the config, bind this
    process to its GPU, set up the run directory and logger, optionally seed
    the RNGs, build the training dataset and model, then call ``train_model``
    with ``distributed=True`` and ``validate=True``.
    """
    dist.init()

    parser = argparse.ArgumentParser()
    parser.add_argument("config", metavar="FILE", help="config file")
    parser.add_argument("--run-dir", metavar="DIR", help="run directory")
    args, opts = parser.parse_known_args()

    # configs.load(args.config, recursive=True)
    # configs.update(opts)
    # cfg = Config(recursive_eval(configs), filename=args.config)

    cfg = Config.fromfile(args.config)
    torch.backends.cudnn.benchmark = cfg.cudnn_benchmark
    # Bind each process to its own GPU. Hard-coding device 0 makes every
    # process started by the distributed launcher select the same device,
    # which conflicts with the distributed=True training below.
    torch.cuda.set_device(dist.local_rank())

    if args.run_dir is None:
        args.run_dir = auto_set_run_dir()
    else:
        set_run_dir(args.run_dir)
    cfg.run_dir = args.run_dir

    # dump config
    cfg.dump(os.path.join(cfg.run_dir, "configs.yaml"))

    # init the logger before other steps
    timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    log_file = os.path.join(cfg.run_dir, f"{timestamp}.log")
    logger = get_root_logger(log_file=log_file)

    # log some basic info
    logger.info(f"Config:\n{cfg.pretty_text}")

    # The extra command-line overrides are parsed into `opts` but are not
    # applied to `cfg` (the `configs.update(opts)` path above is disabled);
    # say so explicitly instead of dropping them silently.
    if opts:
        logger.warning(f"Ignoring unapplied command-line overrides: {opts}")

    # set random seeds
    if cfg.seed is not None:
        logger.info(
            f"Set random seed to {cfg.seed}, "
            f"deterministic mode: {cfg.deterministic}"
        )
        random.seed(cfg.seed)
        np.random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        if cfg.deterministic:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    datasets = [build_dataset(cfg.data.train)]

    model = build_model(cfg.model)
    model.init_weights()
    if cfg.get("sync_bn", None):
        # A non-dict truthy value enables sync BN with no excluded modules.
        if not isinstance(cfg["sync_bn"], dict):
            cfg["sync_bn"] = dict(exclude=[])
        model = convert_sync_batchnorm(model, exclude=cfg["sync_bn"]["exclude"])

    logger.info(f"Model:\n{model}")
    train_model(
        model,
        datasets,
        cfg,
        distributed=True,
        validate=True,
        timestamp=timestamp,
    )

# Run training only when this file is executed as a script.
if __name__ == "__main__":
    main()

Reproduces the problem - command or script

torchpack dist-run -np 8 python tools/train.py configs-90m-0.12/nuscenes/det/transfusion/secfpn/camera+lidar/resnet50/convfuser.yaml --model.encoders.camera.backbone.init_cfg.checkpoint /media/disk1/WINDKISS/projects/bevfusion-main/pretrained/resnet50-0676ba61.pth --load_from pretrained/lidar-only-det.pth --resume_from /media/disk1/WINDKISS/projects/bevfusion-main/runs/run-742dda1d/epoch_11.pth

Reproduces the problem - error message

2024-04-13_21-34-22

Additional information

How can I fix this issue?