open-mmlab / mmpretrain

OpenMMLab Pre-training Toolbox and Benchmark
https://mmpretrain.readthedocs.io/en/latest/
Apache License 2.0

[Bug] When I use this tool to train a new dataset, I get "nvrtc: error: invalid value for --gpu-architecture (-arch)" #1510

Open · Yu-zhengbo opened 1 year ago

Yu-zhengbo commented 1 year ago

Branch

main branch (mmpretrain version)

Describe the bug

Traceback (most recent call last):
  File "D:\cvcode\mmpretrain\tools\train.py", line 159, in <module>
    main()
  File "D:\cvcode\mmpretrain\tools\train.py", line 155, in main
    runner.train()
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\runner\runner.py", line 1692, in train
    self._init_model_weights()
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\runner\runner.py", line 899, in _init_model_weights
    model.init_weights()
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\base_module.py", line 124, in init_weights
    initialize(self, other_cfgs)
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 610, in initialize
    _initialize(module, cp_cfg)
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 518, in _initialize
    func(module)
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 330, in __call__
    module.apply(init)
  File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 668, in apply
    module.apply(fn)
  File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 668, in apply
    module.apply(fn)
  File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 668, in apply
    module.apply(fn)
  [Previous line repeated 5 more times]
  File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 669, in apply
    fn(self)
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 327, in init
    trunc_normal_init(m, self.mean, self.std, self.a, self.b,
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 79, in trunc_normal_init
    trunc_normal_(module.weight, mean, std, a, b)  # type: ignore
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 682, in trunc_normal_
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
  File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 649, in _no_grad_trunc_normal_
    tensor.erfinv_()
RuntimeError:

#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)

typedef long long int int64_t;
typedef unsigned int uint32_t;
typedef signed char int8_t;
typedef unsigned char uint8_t;  // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
typedef short int16_t;
static_assert(sizeof(int64_t) == 8, "expected size does not match");
static_assert(sizeof(uint32_t) == 4, "expected size does not match");
static_assert(sizeof(int8_t) == 1, "expected size does not match");
constexpr int num_threads = 128;
constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
constexpr int block_work_size = thread_work_size * num_threads;
//TODO use _assert_fail, because assert is disabled in non-debug builds
#define ERROR_UNSUPPORTED_CAST assert(false);

namespace std {

using ::signbit; using ::isfinite; using ::isinf; using ::isnan;
using ::abs;
using ::acos; using ::acosf; using ::asin; using ::asinf; using ::atan; using ::atanf; using ::atan2; using ::atan2f; using ::ceil; using ::ceilf; using ::cos; using ::cosf; using ::cosh; using ::coshf;
using ::exp; using ::expf;
using ::fabs; using ::fabsf; using ::floor; using ::floorf;
using ::fmod; using ::fmodf;
using ::frexp; using ::frexpf; using ::ldexp; using ::ldexpf;
using ::log; using ::logf;
using ::log10; using ::log10f; using ::modf; using ::modff;
using ::pow; using ::powf;
using ::sin; using ::sinf; using ::sinh; using ::sinhf;
using ::sqrt; using ::sqrtf; using ::tan; using ::tanf;
using ::tanh; using ::tanhf;
using ::acosh; using ::acoshf; using ::asinh; using ::asinhf; using ::atanh; using ::atanhf; using ::cbrt; using ::cbrtf;
using ::copysign; using ::copysignf;
using ::erf; using ::erff; using ::erfc; using ::erfcf; using ::exp2; using ::exp2f; using ::expm1; using ::expm1f; using ::fdim; using ::fdimf; using ::fmaf; using ::fma; using ::fmax; using ::fmaxf; using ::fmin; using ::fminf; using ::hypot; using ::hypotf; using ::ilogb; using ::ilogbf; using ::lgamma; using ::lgammaf; using ::llrint; using ::llrintf; using ::llround; using ::llroundf; using ::log1p; using ::log1pf; using ::log2; using ::log2f; using ::logb; using ::logbf; using ::lrint; using ::lrintf; using ::lround; using ::lroundf;
using ::nan; using ::nanf;
using ::nearbyint; using ::nearbyintf; using ::nextafter; using ::nextafterf; using ::remainder; using ::remainderf; using ::remquo; using ::remquof; using ::rint; using ::rintf; using ::round; using ::roundf; using ::scalbln; using ::scalblnf; using ::scalbn; using ::scalbnf; using ::tgamma; using ::tgammaf; using ::trunc; using ::truncf;

} // namespace std

// NB: Order matters for this macro; it is relied upon in
// _promoteTypesLookup and the serialization format.
// Note, some types have ctype as void because we don't support them in codegen
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
  _(uint8_t, Byte) /* 0 */ \
  _(int8_t, Char) /* 1 */ \
  _(int16_t, Short) /* 2 */ \
  _(int, Int) /* 3 */ \
  _(int64_t, Long) /* 4 */ \
  _(at::Half, Half) /* 5 */ \
  _(float, Float) /* 6 */ \
  _(double, Double) /* 7 */ \
  _(std::complex<at::Half>, ComplexHalf) /* 8 */ \
  _(std::complex<float>, ComplexFloat) /* 9 */ \
  _(std::complex<double>, ComplexDouble) /* 10 */ \
  _(bool, Bool) /* 11 */ \
  _(void, QInt8) /* 12 */ \
  _(void, QUInt8) /* 13 */ \
  _(void, QInt32) /* 14 */ \
  _(at::BFloat16, BFloat16) /* 15 */

#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_) \
  _(uint8_t, Byte) \
  _(int8_t, Char) \
  _(int16_t, Short) \
  _(int, Int) \
  _(int64_t, Long) \
  _(at::Half, Half) \
  _(float, Float) \
  _(double, Double) \
  _(std::complex<at::Half>, ComplexHalf) \
  _(std::complex<float>, ComplexFloat) \
  _(std::complex<double>, ComplexDouble) \
  _(bool, Bool) \
  _(at::BFloat16, BFloat16)

enum class ScalarType : int8_t {
#define DEFINE_ENUM(_1, n) n,
  AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
#undef DEFINE_ENUM
  Undefined,
  NumOptions
};

template <typename T, int size>
struct Array {
  T data[size];

  __device__ T operator[](int i) const { return data[i]; }
  __device__ T& operator[](int i) { return data[i]; }
  Array() = default;
  Array(const Array&) = default;
  Array& operator=(const Array&) = default;
  __device__ Array(T x) {
    for (int i = 0; i < size; i++) {
      data[i] = x;
    }
  }
};

template <typename scalar_t>
__device__ inline scalar_t load(char* base_ptr, uint32_t offset) {
  return *(reinterpret_cast<scalar_t*>(base_ptr) + offset);
}

template <typename scalar_t>
__device__ inline void store(scalar_t value, char* base_ptr, uint32_t offset) {
  *(reinterpret_cast<scalar_t*>(base_ptr) + offset) = value;
}

// aligned vector generates vectorized load/store on CUDA
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
  scalar_t val[vec_size];
};

template <typename T>
T erfinv_kernel(T a) { return erfinv(a); }

// TODO: setup grid-stride loop

extern "C" __global__
void erfinv_kernel_vectorized4_kernel(
    const int N,
    Array<char*, 1+1> data,
    float scalar_val) //[1+1],
{
  constexpr int vec_size = 4;
  int remaining = N - block_work_size * blockIdx.x;
  auto thread_idx = threadIdx.x;
  int idx = blockIdx.x;
  float arg0[4];

  float out0[4];

  if (remaining < block_work_size) {
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++){
      if (thread_idx >= remaining) {
        break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      arg0[j] = load<float>(data[1], linear_idx);

      thread_idx += num_threads;
    }
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if ((threadIdx.x  + j*num_threads) < remaining) {
        out0[j] = erfinv_kernel<float>(arg0[j] );
      }
    }
    thread_idx = threadIdx.x;
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if (thread_idx >= remaining) {
          break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      store<float>(out0[j], data[0], linear_idx);

      thread_idx += num_threads;
    }
  } else {
    static constexpr int loop_size = thread_work_size / vec_size;

    //actual loading
    using vec_t_input = aligned_vector<float, vec_size>;
    vec_t_input* vec0 = reinterpret_cast<vec_t_input*>(data[0+1]) + block_work_size / vec_size * idx;

    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_input v;
      v = vec0[thread_idx];
      #pragma unroll
      for (int j=0; j < vec_size; j++){
        arg0[vec_size * i + j] = v.val[j];
      }

      thread_idx += num_threads;
    }

    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      out0[j] = erfinv_kernel<float>(arg0[j] );
    }

    using vec_t_output = aligned_vector<float, vec_size>;
    vec_t_output* to_0 = reinterpret_cast<vec_t_output*>(data[0]) + block_work_size / vec_size * idx;

    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_output v;
      #pragma unroll
      for (int j=0; j<vec_size; j++){
      v.val[j] = out0[vec_size * i + j];
      }
      to_0[thread_idx] = v;

      thread_idx += num_threads;
    }
  }

}

nvrtc: error: invalid value for --gpu-architecture (-arch)
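The traceback bottoms out in tensor.erfinv_(), which this PyTorch build compiles at runtime through NVRTC, so the failure can be reproduced without mmpretrain at all. A minimal repro sketch, assuming a CUDA build of PyTorch on the same machine:

import torch

# TruncNormal initialization clips a normal sample through erfinv(),
# which on this PyTorch/CUDA combination is JIT-compiled by NVRTC.
# If the GPU is newer than the bundled toolchain understands, NVRTC
# rejects the --gpu-architecture flag and raises the error shown above.
x = torch.empty(16, device='cuda').uniform_(-1.0, 1.0)
x.erfinv_()  # expected to fail with the same nvrtc error in this setup
print(x)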

Environment information

{'sys.platform': 'win11',
 'Python': '3.9.16 (main, Mar 8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]',
 'CUDA available': True,
 'numpy_random_seed': 2147483648,
 'GPU 0': 'NVIDIA GeForce RTX 4060 Laptop GPU',
 'CUDA_HOME': None,
 'GCC': 'n/a',
 'PyTorch': '1.12.0+cu113',
 'TorchVision': '0.13.0+cu113',
 'OpenCV': '4.7.0',
 'MMEngine': '0.7.2',
 'MMCV': '2.0.0',
 'MMPreTrain': '1.0.0rc7+'}

Other information

No response

Ezra-Yu commented 1 year ago

This does not seem to be a problem with the dataset; the error message is about model initialization.

Yu-zhengbo commented 1 year ago

This does not seem to be a problem with the dataset; the error message is about model initialization.

Thanks. Can you give me some advice?

Yu-zhengbo commented 1 year ago

This does not seem to be a problem with the dataset; the error message is about model initialization.

I tried a few things to solve it, and I found a similar question in the github.com/pytorch issues.

Ezra-Yu commented 1 year ago

Run an open-source dataset like CUB; refer to https://github.com/open-mmlab/mmpretrain/blob/main/configs/resnet/resnet50_8xb8_cub.py. I think the problem is more likely in the environment.

Yu-zhengbo commented 1 year ago

Run an open-source dataset like CUB; refer to https://github.com/open-mmlab/mmpretrain/blob/main/configs/resnet/resnet50_8xb8_cub.py. I think the problem is more likely in the environment.

OK, thanks again!

Yu-zhengbo commented 1 year ago

Run an open-source dataset like CUB; refer to https://github.com/open-mmlab/mmpretrain/blob/main/configs/resnet/resnet50_8xb8_cub.py. I think the problem is more likely in the environment.

I got it to work with PyTorch 1.9.1, and I found the trouble was caused by the model I used before (swin_transformer). With ResNet or other models I do not get this error. Hope it helps!

Yu-zhengbo commented 1 year ago

Run an open-source dataset like CUB; refer to https://github.com/open-mmlab/mmpretrain/blob/main/configs/resnet/resnet50_8xb8_cub.py. I think the problem is more likely in the environment.

I got it to work with PyTorch 1.9.1, and I found the trouble was caused by the model I used before (swin_transformer). With ResNet or other models I do not get this error. Hope it helps!

At the same time, when I use swin_transformer v2 to train my dataset, it works badly, but ResNet works well. So I think swin_transformer v2 may have some module that needs to be corrected. [two attached screenshots of training curves]

Ezra-Yu commented 1 year ago

Can you show me your swin-transformer config?

Yu-zhengbo commented 1 year ago

Can you show me your swin-transformer config?

model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='small',
        img_size=256,
        drop_path_rate=0.3,
        pad_small_map=True),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=3,
        in_channels=768,
        init_cfg=None,
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
        cal_acc=False),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.0),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
    ],
    train_cfg=dict(augments=[
        dict(type='Mixup', alpha=0.8),
        dict(type='CutMix', alpha=1.0)
    ]))
dataset_type = 'Thyroid'
data_preprocessor = dict(
    num_classes=3,
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='RandomResizedCrop', scale=224),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs')
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='ResizeEdge', scale=256, edge='short'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='PackInputs')
]
data_root = './data/Thyroid_Cls/'
train_dataloader = dict(
    pin_memory=True,
    persistent_workers=True,
    collate_fn=dict(type='default_collate'),
    batch_size=48,
    num_workers=5,
    dataset=dict(
        type='Thyroid',
        data_root='./data/Thyroid_Cls/',
        ann_file='meta/train.txt',
        data_prefix='',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='RandomResizedCrop', scale=224),
            dict(type='RandomFlip', prob=0.5, direction='horizontal'),
            dict(type='PackInputs')
        ]),
    sampler=dict(type='DefaultSampler', shuffle=True))
val_dataloader = dict(
    pin_memory=True,
    persistent_workers=True,
    collate_fn=dict(type='default_collate'),
    batch_size=48,
    num_workers=5,
    dataset=dict(
        type='Thyroid',
        data_root='./data/Thyroid_Cls/',
        ann_file='meta/val.txt',
        data_prefix='',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='ResizeEdge', scale=256, edge='short'),
            dict(type='CenterCrop', crop_size=224),
            dict(type='PackInputs')
        ]),
    sampler=dict(type='DefaultSampler', shuffle=False))
val_evaluator = dict(type='MyMetric')
test_dataloader = dict(
    pin_memory=True,
    persistent_workers=True,
    collate_fn=dict(type='default_collate'),
    batch_size=48,
    num_workers=5,
    dataset=dict(
        type='Thyroid',
        data_root='./data/Thyroid_Cls/',
        ann_file='meta/val.txt',
        data_prefix='',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='ResizeEdge', scale=256, edge='short'),
            dict(type='CenterCrop', crop_size=224),
            dict(type='PackInputs')
        ]),
    sampler=dict(type='DefaultSampler', shuffle=False))
test_evaluator = dict(type='MyMetric')
optim_wrapper = dict(
    optimizer=dict(
        type='AdamW',
        lr=0.001,
        weight_decay=0.05,
        eps=1e-08,
        betas=(0.9, 0.999)),
    paramwise_cfg=dict(
        norm_decay_mult=0.0,
        bias_decay_mult=0.0,
        flat_decay_mult=0.0,
        custom_keys=dict({
            '.absolute_pos_embed': dict(decay_mult=0.0),
            '.relative_position_bias_table': dict(decay_mult=0.0)
        })))
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=0.001,
        by_epoch=True,
        end=20,
        convert_to_iter_based=True),
    dict(type='CosineAnnealingLR', eta_min=1e-05, by_epoch=True, begin=20)
]
train_cfg = dict(by_epoch=True, max_epochs=300, val_interval=1)
val_cfg = dict()
test_cfg = dict()
auto_scale_lr = dict(base_batch_size=1024)
default_scope = 'mmpretrain'
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=100),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', interval=30),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='VisualizationHook', enable=False))
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='UniversalVisualizer',
    vis_backends=[dict(type='LocalVisBackend')])
log_level = 'INFO'
load_from = None
resume = False
randomness = dict(seed=None, deterministic=False)
launcher = 'none'
work_dir = './work_dirs/1_swinv2-small-w8_16xb64_in1k-256px'
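One detail worth checking in this config (an observation, not from the thread): the backbone is built with img_size=256 while the training pipeline crops to 224. A quick sketch to verify that the backbone runs at the pipeline's actual input size and exposes the 768 channels the head expects, assuming mmpretrain's registry API:

import torch
import mmpretrain.models  # noqa: F401  (registers SwinTransformerV2 and friends)
from mmpretrain.registry import MODELS

# Build only the backbone and feed a crop-sized input.
backbone = MODELS.build(dict(
    type='SwinTransformerV2',
    arch='small',
    img_size=256,
    pad_small_map=True))
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))
print([f.shape for f in feats])  # the last stage should have 768 channels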

Ezra-Yu commented 1 year ago

It looks like overfitting has occurred? Try adjusting your training schedule.

Yu-zhengbo commented 1 year ago

Thanks, but I only trained for about 10 epochs when this phenomenon occurred. With code I wrote myself, training for 200 epochs gives about 92% accuracy, with specificity and sensitivity around 80%.
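MyMetric itself is not shown in the thread; for context, a hypothetical sketch of a custom metric reporting the accuracy/specificity/sensitivity numbers mentioned above, built on mmengine's BaseMetric interface (the data-sample field names are assumptions and should be checked against the actual pipeline):

import numpy as np
from mmengine.evaluator import BaseMetric
from mmpretrain.registry import METRICS

@METRICS.register_module()
class MyMetric(BaseMetric):
    """Hypothetical: accuracy plus macro-averaged sensitivity/specificity."""

    def process(self, data_batch, data_samples):
        for sample in data_samples:
            # 'gt_label'/'pred_label' are the usual mmpretrain fields;
            # verify against your own data samples.
            self.results.append(
                (int(sample['gt_label']), int(sample['pred_label'])))

    def compute_metrics(self, results):
        gts, preds = (np.array(x) for x in zip(*results))
        metrics = {'accuracy': float((gts == preds).mean())}
        sens, spec = [], []
        for c in np.unique(gts):
            tp = int(np.sum((gts == c) & (preds == c)))
            fn = int(np.sum((gts == c) & (preds != c)))
            tn = int(np.sum((gts != c) & (preds != c)))
            fp = int(np.sum((gts != c) & (preds == c)))
            sens.append(tp / max(tp + fn, 1))
            spec.append(tn / max(tn + fp, 1))
        metrics['sensitivity'] = float(np.mean(sens))
        metrics['specificity'] = float(np.mean(spec))
        return metrics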

Yu-zhengbo commented 1 year ago

It looks like overfitting has occurred? Try adjusting your training schedule.

The same phenomenon happened with ConvNeXt; you can see the loss does not show a decreasing trend. [attached screenshot]

Ezra-Yu commented 1 year ago
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='small',
        img_size=256,
        drop_path_rate=0.3,
        pad_small_map=True),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=3,
        in_channels=768,
        init_cfg=None,
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
        cal_acc=False),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.0),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
    ],
    # train_cfg=dict(augments=[
    #     dict(type='Mixup', alpha=0.8),
    #     dict(type='CutMix', alpha=1.0)
    # ])
)

Maybe you can disable some augmentations, such as Mixup and CutMix, and try a smaller learning rate.
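A sketch of those two changes against the config above (the 1e-4 value is illustrative, not a tuned recommendation):

# Leaving train_cfg out of the model dict (as in the snippet above)
# disables the Mixup/CutMix batch augmentations.
optim_wrapper = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,  # was 0.001 in the original config
        weight_decay=0.05,
        eps=1e-08,
        betas=(0.9, 0.999)),
    paramwise_cfg=dict(
        norm_decay_mult=0.0,
        bias_decay_mult=0.0,
        flat_decay_mult=0.0))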

Yu-zhengbo commented 1 year ago
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='small',
        img_size=256,
        drop_path_rate=0.3,
        pad_small_map=True),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=3,
        in_channels=768,
        init_cfg=None,
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
        cal_acc=False),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.0),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
    ],
    # train_cfg=dict(augments=[
    #     dict(type='Mixup', alpha=0.8),
    #     dict(type='CutMix', alpha=1.0)
    # ])
)

Maybe you can disable some augmentations, such as Mixup and CutMix, and try a smaller learning rate.

OK, thanks! I'm heading home now; hope you have a fun night!

U-zzd commented 1 year ago

I also encountered the same problem. How can it be solved?

Yu-zhengbo commented 1 year ago

I also encountered the same problem. How can it be solved?

Maybe you can try another model; then the problem will be solved!

U-zzd commented 1 year ago

I tried conformer and swintransformer and both give this error, but resnet50 works. What is going on?

Yu-zhengbo commented 1 year ago

I tried conformer and swintransformer and both give this error, but resnet50 works. What is going on?

Is it the same error, "nvrtc: error: invalid value for --gpu-architecture (-arch)"? Is your GPU fairly new? It is probably a syntax issue in part of the code; try debugging to locate where the error is raised, then modify it. In my case, with a 4090 on PyTorch 1.9 or 1.12, conformer and swin worked; it was segformer that failed.

U-zzd commented 1 year ago

I also have a 4090. After switching to PyTorch 1.13.0 and setting up the environment again, it works.
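To summarize the resolution for anyone landing here: RTX 40-series GPUs report compute capability 8.9 (sm_89), which the CUDA 11.3 toolchain bundled with the cu113 wheels predates, so runtime NVRTC compilation (as in the erfinv kernel above) can be asked for an architecture it does not recognize; the newer builds that ship with PyTorch 1.13 handle these GPUs, matching what worked in this thread. A quick diagnostic sketch:

import torch

# Compare what the installed wheel was compiled for with what the GPU is.
print('torch:', torch.__version__, '| CUDA:', torch.version.cuda)
print('device capability:', torch.cuda.get_device_capability(0))  # (8, 9) on RTX 40-series
print('compiled archs:', torch.cuda.get_arch_list())  # cu113 wheels top out below sm_89

# If the device capability is newer than everything in the arch list,
# installing a wheel built against a newer CUDA toolkit is the usual fix,
# as this thread concluded.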