import argparse
import logging
import os
import os.path as osp
from mmdet.utils import register_all_modules as register_all_modules_mmdet
from mmengine.config import Config, DictAction
from mmengine.logging import print_log
from mmengine.registry import RUNNERS
from mmengine.runner import Runner
# from mmengine.evaluator import BaseMetric
# from mmengine.model import BaseModel
from mmengine.runner._flexible_runner import FlexibleRunner
from mmrotate.utils import register_all_modules
from mmengine.device import (is_cuda_available, is_mlu_available,
is_npu_available)
import torch
if torch.__version__ >= '1.8':
import torch_npu
from torch_npu.npu import amp
import transfer_to_npu
def parse_args():
    """Parse the command-line options of the training script.

    Besides parsing, this exports ``LOCAL_RANK`` into ``os.environ`` (taken
    from ``--local_rank``) when the variable is not already set.

    Returns:
        argparse.Namespace: Parsed options with the attributes ``config``,
        ``work_dir``, ``amp``, ``auto_scale_lr``, ``resume``,
        ``cfg_options``, ``launcher`` and ``local_rank``.
    """
    parser = argparse.ArgumentParser(description='Train a detector')
    parser.add_argument(
        '--config',
        default='configs/ascend/rotated_rtmdet/rotated_rtmdet_hbox_l-3x-dota.py',
        help='train config file path')
    parser.add_argument(
        '--work-dir',
        default="work_dirs/tmp",
        help='the dir to save logs and models')
    # AMP defaults to on, so ``--amp`` alone can never change the value.
    # ``--no-amp`` writes to the same destination and is the only way to
    # switch mixed precision off from the command line.
    parser.add_argument(
        '--amp',
        action='store_true',
        default=True,
        help='enable automatic-mixed-precision training')
    parser.add_argument(
        '--no-amp',
        dest='amp',
        action='store_false',
        help='disable automatic-mixed-precision training')
    parser.add_argument(
        '--auto-scale-lr',
        action='store_true',
        help='enable automatically scaling LR.')
    parser.add_argument(
        '--resume',
        action='store_true',
        help='resume from the latest checkpoint in the work_dir automatically')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    # Only fill in LOCAL_RANK when nothing has set it yet, so an existing
    # value in the environment always wins over the CLI argument.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args
def main():
    """Build a runner from the CLI options and the config, then train.

    The config file is loaded, overridden with the command-line options
    (work dir, AMP, LR auto-scaling, resume, launcher) and handed to either
    the default ``Runner`` or the runner named by ``runner_type``.

    Raises:
        AssertionError: If none of CUDA, NPU or MLU is available, or if AMP
            is requested while the optimizer wrapper type is neither
            ``OptimWrapper`` nor ``AmpOptimWrapper``.
        RuntimeError: If ``--auto-scale-lr`` is given but the config lacks
            ``auto_scale_lr.enable`` or ``auto_scale_lr.base_batch_size``.
    """
    args = parse_args()

    assert is_cuda_available() or is_npu_available() or is_mlu_available(
    ), ('``GPU, NPU and MLU`` is unavailable during training ')

    # register all modules in mmdet into the registries
    # do not init the default scope here because it will be init in the runner
    register_all_modules_mmdet(init_default_scope=False)
    register_all_modules(init_default_scope=False)

    # load config
    cfg = Config.fromfile(args.config)
    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])

    # enable automatic-mixed-precision training
    if args.amp:
        optim_wrapper = cfg.optim_wrapper.type
        if optim_wrapper == 'AmpOptimWrapper':
            print_log(
                'AMP training is already enabled in your config.',
                logger='current',
                level=logging.WARNING)
        else:
            assert optim_wrapper == 'OptimWrapper', (
                '`--amp` is only supported when the optimizer wrapper type is '
                f'`OptimWrapper` but got {optim_wrapper}.')
            cfg.optim_wrapper.type = 'AmpOptimWrapper'
            cfg.optim_wrapper.loss_scale = 'dynamic'

    # enable automatically scaling LR
    if args.auto_scale_lr:
        if 'auto_scale_lr' in cfg and \
                'enable' in cfg.auto_scale_lr and \
                'base_batch_size' in cfg.auto_scale_lr:
            cfg.auto_scale_lr.enable = True
        else:
            raise RuntimeError('Can not find "auto_scale_lr" or '
                               '"auto_scale_lr.enable" or '
                               '"auto_scale_lr.base_batch_size" in your'
                               ' configuration file.')

    cfg.resume = args.resume

    # The launcher is one of the strings 'none'/'pytorch'/'slurm'/'mpi' and
    # is never ``None``, so a ``!= None`` test is always true. Compare with
    # 'none' so the distributed wrapper is configured only when a launcher
    # is actually in use.
    if cfg.launcher != 'none':
        cfg.merge_from_dict(
            dict(
                model_wrapper_cfg=dict(
                    type='MMDistributedDataParallel',
                    find_unused_parameters=True)))

    # build the runner from config
    if 'runner_type' not in cfg:
        # build the default runner
        runner = Runner.from_cfg(cfg)
    else:
        # build customized runner from the registry
        # if 'runner_type' is set in the cfg
        runner = RUNNERS.build(cfg)

    # start training
    runner.train()


if __name__ == '__main__':
    main()
Prerequisite
Environment
使用的是华为 Ascendhub 的官方 Docker 镜像:ascend-pytorch:23.0.RC3-1.11.0 (https://ascendhub.huawei.com/#/detail/ascend-pytorch),torch 版本为 1.11.0,CANN 版本为 7.0.RC1,系统架构为 aarch64,NPU 型号为 910B。
具体信息为: OrderedDict([('sys.platform', 'linux'), ('Python', '3.7.5 (default, Oct 28 2023, 09:08:27) [GCC 7.5.0]'), ('CUDA available', False), ('numpy_random_seed', 2147483648), ('GCC', 'gcc (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) 7.5.0'), ('PyTorch', '1.11.0a0+gitbc2c6ed'), ('PyTorch compiling details', 'PyTorch built with:\n - GCC 7.5\n - C++ Version: 201402\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - NNPACK is enabled\n - CPU capability usage: NO AVX\n - Build settings: BUILD_TYPE=Release, CXX_COMPILER=/opt/buildtools/gcc-7.5.0/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -DMISSING_ARM_VST1 -DMISSING_ARM_VLD1 -Wno-stringop-overflow, TORCH_VERSION=1.11.0, USE_CUDA=OFF, USE_CUDNN=OFF, USE_EIGEN_FOR_BLAS=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=OFF, USE_MKLDNN=OFF, USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, \n'), ('TorchVision', '0.12.0'), ('OpenCV', '4.8.1'), ('MMEngine', '0.10.2'), ('MMCV', '2.1.0'), ('MMCV Compiler', 'GCC 7.5'), ('MMCV CUDA Compiler', 'not available')])
Reproduces the problem - code sample
Reproduces the problem - command or script
基于mmrotate的dev-1.x分支代码,在aarch64架构下的910B显卡上进行oriented-rcnn模型的训练(手动移除了mmrotate-dev1.x对mmcv的版本依赖关系判断代码)
python /workspace/mmrotate/tools/train.py --config=configs/oriented-rcnn/oriented-rcnn-le90_r50_fpn_1x_dota.py --work-dir=./work_dir/oriented-rcnn-le90_r50_fpn_1x_dota --amp
其中 train.py 里使用了华为的代码自动迁移工具,需要额外导入 `transfer_to_npu`。
Reproduces the problem - error message
Additional information
注意到 mmcv 的 main 分支下最近新增加了 NPU 设备上的 `RoIAlignRotated` 算子适配,但是实际测试 mmrotate 的旋转目标检测算法 oriented-RCNN 时发现仍存在问题:该算子不支持梯度求导操作。此外,在 mmrotate 的 20 多种旋转目标检测算法中,有一半算法都用的是 IoU loss 系列(`RotatedIoULoss` 和 `IoULoss`),涉及到 mmcv 里的 `MinAreaPolygon`、`ConvexIoU`、`DiffIoURotated` 这几个算子,它们均没有进行 NPU 设备上的适配,导致 mmrotate 的算法在 NPU 上只有 1-2 个能跑通的模型。希望团队能将 NPU 设备上的这些常用算子适配提上日程。