Open gitleej opened 10 months ago
When I train YOLOX on an RTX 4090, CPU usage is very low: only two cores are used. GPU utilization is also low, even though a lot of GPU memory is occupied. How can I improve CPU and GPU utilization? Does the dataloader use the GPU or the CPU by default?
System info
01/10 09:17:55 - mmengine - INFO - ------------------------------------------------------------ System environment: sys.platform: linux Python: 3.8.18 (default, Sep 11 2023, 13:40:15) [GCC 11.2.0] CUDA available: True numpy_random_seed: 580425004 GPU 0,1: NVIDIA GeForce RTX 4090 CUDA_HOME: /usr/local/cuda NVCC: Cuda compilation tools, release 11.3, V11.3.109 GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 PyTorch: 1.12.0 PyTorch compiling details: PyTorch built with: - GCC 9.3 - C++ Version: 201402 - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications - Intel(R) MKL-DNN v2.6.0 (Git Hash 52b5f107dd9cf10910aaa19cb47f3abf9b349815) - OpenMP 201511 (a.k.a. OpenMP 4.5) - LAPACK is enabled (usually provided by MKL) - NNPACK is enabled - CPU capability usage: AVX2 - CUDA Runtime 11.3 - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=compute_37 - CuDNN 8.3.2 (built against CUDA 11.5) - Magma 2.5.2 - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.3, CUDNN_VERSION=8.3.2, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic 
-Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.12.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=OFF, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF,
TorchVision: 0.13.0
OpenCV: 4.9.0
MMEngine: 0.10.2
# config file
```python
# Formerly inherited from a single file:
# _base_ = './yolox_s_8xb8-300e_dome_down_voc.py'
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    './yolox_tta.py',
]

# Input resolution, expressed as (width, height).
img_scale = (640, 640)

# Detector definition: data preprocessor, backbone, neck and head.
model = dict(
    type='YOLOX',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        pad_size_divisor=32,
        # Dataset statistics that were tried and then disabled:
        # mean=[187.17040133180873, 186.87045380583902, 186.66486354591567],
        # std=[1.0708850375722379, 2.0779975556038033, 1.8805358214173786],
        batch_augments=[
            dict(
                type='BatchSyncRandomResize',
                random_size_range=(320, 640),
                size_divisor=32,
                interval=10,
            ),
        ],
    ),
    backbone=dict(
        type='CSPDarknet',
        deepen_factor=0.33,
        widen_factor=0.375,
        # Alternative scaling kept for reference:
        # deepen_factor=0.33,
        # widen_factor=0.5,
        out_indices=(2, 3, 4),
        use_depthwise=False,
        # NOTE(review): the key is spelled "kernal" (sic); left exactly as
        # written -- confirm against the backbone's signature before renaming.
        spp_kernal_sizes=(5, 9, 13),
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish'),
    ),
    neck=dict(
        type='YOLOXPAFPN',
        in_channels=[96, 192, 384],
        out_channels=96,
        # Channel widths matching the alternative backbone scaling:
        # in_channels=[128, 256, 512],
        # out_channels=128,
        num_csp_blocks=1,
        use_depthwise=False,
        upsample_cfg=dict(scale_factor=2, mode='nearest'),
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish'),
    ),
    bbox_head=dict(
        type='YOLOXHead',
        num_classes=1,
        in_channels=96,
        feat_channels=96,
        # Channel widths matching the alternative backbone scaling:
        # in_channels=128,
        # feat_channels=128,
        stacked_convs=2,
        strides=(8, 16, 32),
        use_depthwise=False,
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish'),
        loss_cls=dict(
            type='CrossEntropyLoss',
            use_sigmoid=True,
            reduction='sum',
            loss_weight=1.0,
        ),
        loss_bbox=dict(
            type='IoULoss',
            mode='square',
            eps=1e-16,
            reduction='sum',
            loss_weight=5.0,
        ),
        loss_obj=dict(
            type='CrossEntropyLoss',
            use_sigmoid=True,
            reduction='sum',
            loss_weight=1.0,
        ),
        loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
    ),
    train_cfg=dict(
        assigner=dict(type='SimOTAAssigner', center_radius=2.5),
    ),
    # To stay aligned with the source code, the score threshold is 0.01 for
    # the val phase and 0.001 for the test phase.
    test_cfg=dict(
        score_thr=0.01,
        nms=dict(type='nms', iou_threshold=0.65),
    ),
)
# model settings
# model = dict(
# data_preprocessor=dict(batch_augments=[
# dict(
# type='BatchSyncRandomResize',
# random_size_range=(320, 640),
# size_divisor=32,
# interval=10)
# ]),
# backbone=dict(deepen_factor=0.33, widen_factor=0.375),
# neck=dict(in_channels=[96, 192, 384], out_channels=96),
# bbox_head=dict(in_channels=96, feat_channels=96))
# Dataset location and metadata.
# Windows path used previously:
# data_root = 'E:\\dataset\\02-WTGK\\dome_down_voc'
data_root = './dataset/01-WTGK/dome_down_voc'
dataset_type = 'VOCDataset'
metainfo = dict(
    classes=('test',),
    # One colour tuple per class; the palette is used for visualization.
    palette=[(106, 0, 228)],
)

# Two ways to switch to a different file client:
#   1. Point `data_root` at the remote prefix and let the file I/O module
#      infer the backend (LMDB and Memcache are not supported this way), e.g.
#      data_root = 's3://openmmlab/datasets/detection/coco/'
#   2. Pass `backend_args` (`file_client_args` before 3.0.0rc6), e.g.
#      backend_args = dict(
#          backend='petrel',
#          path_mapping=dict({
#              './data/': 's3://openmmlab/datasets/detection/',
#              'data/': 's3://openmmlab/datasets/detection/'
#          }))
# Neither is enabled here.
backend_args = None
# Training-time transforms. Image and annotation loading is configured on
# the inner dataset of `train_dataset`, not in this list.
train_pipeline = [
    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
    dict(
        type='RandomAffine',
        scaling_ratio_range=(0.5, 1.5),
        # `img_scale` is (width, height); each border is minus half a side.
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
    ),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip', prob=0.5),
    # Resize and Pad serve the last 15 epochs, once YOLOXModeSwitchHook has
    # closed Mosaic and RandomAffine.
    dict(type='Resize', scale=img_scale, keep_ratio=False),
    dict(
        type='Pad',
        pad_to_square=True,
        pad_val=dict(img=(114.0, 114.0, 114.0)),
    ),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
    dict(type='PackDetInputs'),
]

train_dataset = dict(
    # The MultiImageMixDataset wrapper is what enables mosaic and mixup.
    type='MultiImageMixDataset',
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        metainfo=metainfo,
        ann_file='ImageSets/Main/train.txt',
        # Disabled alternative: sub_data_root="VOC2007"
        data_prefix=dict(sub_data_root='', img='JPEGImages'),
        pipeline=[
            dict(type='LoadImageFromFile', backend_args=backend_args),
            dict(type='LoadAnnotations', with_bbox=True),
        ],
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        backend_args=backend_args,
    ),
    pipeline=train_pipeline,
)

# Evaluation-time transforms.
test_pipeline = [
    # An earlier variant read backend_args from the base config instead of
    # the local `backend_args` variable.
    dict(type='LoadImageFromFile', backend_args=backend_args),
    # Fixed-size alternative tried before: scale=(416, 416)
    dict(type='Resize', scale=img_scale, keep_ratio=False),
    dict(
        type='Pad',
        pad_to_square=True,
        pad_val=dict(img=(114.0, 114.0, 114.0)),
    ),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='PackDetInputs',
        meta_keys=(
            'img_id',
            'img_path',
            'ori_shape',
            'img_shape',
            'scale_factor',
        ),
    ),
]
# Training loader.
# NOTE(review): the surrounding report is about low CPU/GPU utilisation;
# whether 48 workers suits the machine cannot be judged from this file.
train_dataloader = dict(
    batch_size=48,
    num_workers=48,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=train_dataset,
)

# Validation loader: reads the VOC "test" split in test mode, unshuffled.
val_dataloader = dict(
    batch_size=48,
    num_workers=48,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        metainfo=metainfo,
        ann_file='ImageSets/Main/test.txt',
        data_prefix=dict(sub_data_root='', img='JPEGImages'),
        test_mode=True,
        pipeline=test_pipeline,
        backend_args=backend_args,
    ),
)

# Testing reuses the very same loader object as validation.
test_dataloader = val_dataloader

# VOC mAP metric; results are collected on the GPU.
val_evaluator = dict(
    type='VOCMetric',
    metric='mAP',
    collect_device='gpu',
)

# Testing reuses the very same evaluator object as validation.
test_evaluator = val_evaluator
# Schedule lengths (in epochs).
max_epochs = 300
num_last_epochs = 15
# Shared by the validation interval and the checkpoint interval below.
interval = 10

train_cfg = dict(max_epochs=max_epochs, val_interval=interval)

# Optimizer. The base learning rate assumes the default 8-GPU setup.
base_lr = 0.01
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(
        type='SGD',
        lr=base_lr,
        momentum=0.9,
        weight_decay=5e-4,
        nesterov=True,
    ),
    paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.),
)

# Learning-rate schedule, in three consecutive phases.
param_scheduler = [
    # Phase 1: quadratic warm-up over the first 5 epochs, with the lr
    # updated every iteration.
    # TODO: fix default scope in get function
    dict(
        type='mmdet.QuadraticWarmupLR',
        by_epoch=True,
        begin=0,
        end=5,
        convert_to_iter_based=True,
    ),
    # Phase 2: cosine annealing from epoch 5 to epoch 285.
    dict(
        type='CosineAnnealingLR',
        eta_min=base_lr * 0.05,
        begin=5,
        T_max=max_epochs - num_last_epochs,
        end=max_epochs - num_last_epochs,
        by_epoch=True,
        convert_to_iter_based=True,
    ),
    # Phase 3: fixed lr for the last 15 epochs.
    dict(
        type='ConstantLR',
        by_epoch=True,
        factor=1,
        begin=max_epochs - num_last_epochs,
        end=max_epochs,
    ),
]

default_hooks = dict(
    checkpoint=dict(
        interval=interval,
        # Only the latest 3 checkpoints are kept.
        max_keep_ckpts=3,
    ),
)

custom_hooks = [
    dict(
        type='YOLOXModeSwitchHook',
        num_last_epochs=num_last_epochs,
        priority=48,
    ),
    dict(type='SyncNormHook', priority=48),
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        priority=49,
    ),
]

# NOTE: `auto_scale_lr` drives automatic LR scaling;
# USER SHOULD NOT CHANGE ITS VALUES.
# base_batch_size = (8 GPUs) x (8 samples per GPU)
auto_scale_lr = dict(base_batch_size=64)
Is there any solution? @jbwang1997
My guess is that num_workers is set too high.
Could you figure out how to improve cpu and gpu utilization?
Could you figure out how to improve cpu and gpu utilization?
No solution found.
My guess is that num_workers is set too high.
My guess is that num_workers is set too high.
I have set it to 2. Still the same result.
When I train YOLOX on an RTX 4090, CPU usage is very low: only two cores are used. GPU utilization is also low, even though a lot of GPU memory is occupied. How can I improve CPU and GPU utilization? Does the dataloader use the GPU or the CPU by default?