Yu-zhengbo opened this issue 1 year ago
This does not seem to be a problem with the dataset. The error message is about model initialization
Thanks, could you give me some advice?
I tried a few things to solve it, and I found a similar question in the github.com/pytorch issues.
Try training on an open-source dataset like CUB (refer to https://github.com/open-mmlab/mmpretrain/blob/main/configs/resnet/resnet50_8xb8_cub.py). I think the problem is more likely in your environment.
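For example, a quick sanity run could look like this (a sketch; it assumes an mmpretrain checkout with the CUB data prepared, and the work dir name is just for illustration):
# Sanity check: train the stock CUB config to rule out dataset problems.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile('configs/resnet/resnet50_8xb8_cub.py')
cfg.work_dir = './work_dirs/cub_sanity_check'  # hypothetical output dir
Runner.from_cfg(cfg).train()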
OK, thanks again!
I got it working with pytorch=1.9.1. I also found the trouble was caused by the model I used before (swin_transformer); with ResNet or other backbones the error does not occur. Hope it helps you!
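To check whether an environment hits the same failure without mmpretrain in the loop, here is a minimal repro sketch (an assumption based on the traceback posted later in this thread, which fails inside tensor.erfinv_() during trunc_normal_ weight init):
import torch

# trunc_normal_ init calls erfinv_ under the hood; on PyTorch builds whose
# bundled CUDA toolkit does not recognize the GPU architecture, the NVRTC
# JIT compile of this op fails with
# "nvrtc: error: invalid value for --gpu-architecture (-arch)".
x = torch.rand(16, device='cuda') * 2 - 1  # erfinv is defined on [-1, 1]
x.erfinv_()
print(torch.__version__, torch.version.cuda, torch.cuda.get_device_name(0))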
At the same time, when I train my dataset with SwinTransformer V2 it performs badly, while ResNet works well. So I think SwinTransformer V2 may have some module that needs fixing.
Can you show me your swin-transformer config?
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='small',
        img_size=256,
        drop_path_rate=0.3,
        pad_small_map=True),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=3,
        in_channels=768,
        init_cfg=None,
        loss=dict(type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
        cal_acc=False),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.0),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
    ],
    train_cfg=dict(augments=[
        dict(type='Mixup', alpha=0.8),
        dict(type='CutMix', alpha=1.0)
    ]))
dataset_type = 'Thyroid'
data_preprocessor = dict(
    num_classes=3,
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='RandomResizedCrop', scale=224),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs')
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='ResizeEdge', scale=256, edge='short'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='PackInputs')
]
data_root = './data/Thyroid_Cls/'
train_dataloader = dict(
    pin_memory=True,
    persistent_workers=True,
    collate_fn=dict(type='default_collate'),
    batch_size=48,
    num_workers=5,
    dataset=dict(
        type='Thyroid',
        data_root='./data/Thyroid_Cls/',
        ann_file='meta/train.txt',
        data_prefix='',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='RandomResizedCrop', scale=224),
            dict(type='RandomFlip', prob=0.5, direction='horizontal'),
            dict(type='PackInputs')
        ]),
    sampler=dict(type='DefaultSampler', shuffle=True))
val_dataloader = dict(
    pin_memory=True,
    persistent_workers=True,
    collate_fn=dict(type='default_collate'),
    batch_size=48,
    num_workers=5,
    dataset=dict(
        type='Thyroid',
        data_root='./data/Thyroid_Cls/',
        ann_file='meta/val.txt',
        data_prefix='',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='ResizeEdge', scale=256, edge='short'),
            dict(type='CenterCrop', crop_size=224),
            dict(type='PackInputs')
        ]),
    sampler=dict(type='DefaultSampler', shuffle=False))
val_evaluator = dict(type='MyMetric')
test_dataloader = dict(
    pin_memory=True,
    persistent_workers=True,
    collate_fn=dict(type='default_collate'),
    batch_size=48,
    num_workers=5,
    dataset=dict(
        type='Thyroid',
        data_root='./data/Thyroid_Cls/',
        ann_file='meta/val.txt',
        data_prefix='',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='ResizeEdge', scale=256, edge='short'),
            dict(type='CenterCrop', crop_size=224),
            dict(type='PackInputs')
        ]),
    sampler=dict(type='DefaultSampler', shuffle=False))
test_evaluator = dict(type='MyMetric')
optim_wrapper = dict(
    optimizer=dict(
        type='AdamW',
        lr=0.001,
        weight_decay=0.05,
        eps=1e-08,
        betas=(0.9, 0.999)),
    paramwise_cfg=dict(
        norm_decay_mult=0.0,
        bias_decay_mult=0.0,
        flat_decay_mult=0.0,
        custom_keys=dict({
            '.absolute_pos_embed': dict(decay_mult=0.0),
            '.relative_position_bias_table': dict(decay_mult=0.0)
        })))
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=0.001,
        by_epoch=True,
        end=20,
        convert_to_iter_based=True),
    dict(type='CosineAnnealingLR', eta_min=1e-05, by_epoch=True, begin=20)
]
train_cfg = dict(by_epoch=True, max_epochs=300, val_interval=1)
val_cfg = dict()
test_cfg = dict()
auto_scale_lr = dict(base_batch_size=1024)
default_scope = 'mmpretrain'
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=100),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', interval=30),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='VisualizationHook', enable=False))
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='UniversalVisualizer', vis_backends=[dict(type='LocalVisBackend')])
log_level = 'INFO'
load_from = None
resume = False
randomness = dict(seed=None, deterministic=False)
launcher = 'none'
work_dir = './work_dirs/1_swinv2-small-w8_16xb64_in1k-256px'
It looks like overfitting has occurred? Try adjusting your training schedule.
Thanks, but I trained for only about 10 epochs and this already occurs. With my own training code, after 200 epochs accuracy is about 92%, and specificity and sensitivity are about 80%.
The same phenomenon happens with ConvNeXt: the loss does not show a decreasing trend.
model = dict(
type='ImageClassifier',
backbone=dict(
type='SwinTransformerV2',
arch='small',
img_size=256,
drop_path_rate=0.3,
pad_small_map=True),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=3,
in_channels=768,
init_cfg=None,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
cal_acc=False),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.0),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
],
# train_cfg=dict(augments=[
# dict(type='Mixup', alpha=0.8),
# dict(type='CutMix', alpha=1.0)
# ]
))
Maybe you can disable some augmentations, such as Mixup and CutMix, and try a smaller learning rate.
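Concretely, the two edits might look like this against the config above (a sketch, untested on this dataset; note that auto_scale_lr only takes effect when training with the --auto-scale-lr flag, so at batch size 48 the lr=0.001 is otherwise used as-is):
# 1) Disable the batch augmentations by dropping train_cfg from the model.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformerV2',
        arch='small',
        img_size=256,
        drop_path_rate=0.3,
        pad_small_map=True),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=3,
        in_channels=768,
        loss=dict(type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
        cal_acc=False),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.0),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
    ],
    train_cfg=None)  # no Mixup/CutMix

# 2) Scale the learning rate down: 1e-3 is tuned for base_batch_size=1024.
#    paramwise_cfg stays unchanged from the config above.
optim_wrapper = dict(
    optimizer=dict(
        type='AdamW',
        lr=4.7e-05,  # ~= 0.001 * 48 / 1024 for batch_size=48
        weight_decay=0.05,
        eps=1e-08,
        betas=(0.9, 0.999)))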
OK, thanks! I'm heading home now; hope you have a fun night!
I also encountered the same problem. How can it be solved?
Maybe you can try another model; that may solve the problem!
I tried Conformer and SwinTransformer, and both hit this error, but ResNet-50 works. What could be going on?
Is it this same error, "nvrtc: error: invalid value for --gpu-architecture (-arch)"? Is your GPU fairly new? It is probably that part of the generated code is not supported; try debugging to locate where the error is raised and then patch it. In my case it was a 4090 on PyTorch 1.9 or 1.12 (I don't remember which); Conformer and Swin worked, and the one that errored was SegFormer.
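If you just need to get past the failing init, here is a sketch of the kind of local patch meant above (it changes the init distribution slightly, so treat it as a workaround, not a fix):
# The crash happens in TruncNormal init, whose tensor.erfinv_() call is
# CUDA-jitted. Swapping it for a plain Normal init in the model's init_cfg
# avoids the jitted op; for std=0.02 the difference is small.
init_cfg = [
    dict(type='Normal', layer='Linear', std=0.02, bias=0.0),  # was TruncNormal
    dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]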
I also have a 4090; after switching to PyTorch 1.13.0 and rebuilding the environment, it worked.
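For anyone hitting this later, a quick way to check whether your PyTorch build knows your GPU (a sketch; the example values are what I'd expect, not verified here):
import torch

# On an RTX 40xx card (compute capability 8.9), a cu113 wheel only ships
# kernels up to sm_86 and its NVRTC does not recognize compute_89, which is
# why the JIT compile fails; newer builds (e.g. PyTorch 1.13) handle it.
print(torch.__version__, torch.version.cuda)
print(torch.cuda.get_device_capability(0))  # e.g. (8, 9) on a 4090
print(torch.cuda.get_arch_list())           # archs this build was compiled for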
Branch
main branch (mmpretrain version)
Describe the bug
Traceback (most recent call last):
File "D:\cvcode\mmpretrain\tools\train.py", line 159, in <module>
main()
File "D:\cvcode\mmpretrain\tools\train.py", line 155, in main
runner.train()
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\runner\runner.py", line 1692, in train
self._init_model_weights()
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\runner\runner.py", line 899, in _init_model_weights
model.init_weights()
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\base_module.py", line 124, in init_weights
initialize(self, other_cfgs)
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 610, in initialize
_initialize(module, cp_cfg)
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 518, in _initialize
func(module)
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 330, in call
module.apply(init)
File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 668, in apply
module.apply(fn)
File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 668, in apply
module.apply(fn)
File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 668, in apply
module.apply(fn)
[Previous line repeated 5 more times]
File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 669, in apply
fn(self)
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 327, in init
trunc_normal_init(m, self.mean, self.std, self.a, self.b,
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 79, in trunc_normal_init
trunc_normal_(module.weight, mean, std, a, b)  # type: ignore
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 682, in trunc_normal_
return _no_grad_trunc_normal_(tensor, mean, std, a, b)
File "D:\anaconda3\envs\pytorch\lib\site-packages\mmengine\model\weight_init.py", line 649, in _no_grad_trunc_normal_
tensor.erfinv_()
RuntimeError:
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)
[... PyTorch's JIT-generated CUDA source for the fused erfinv kernel elided: typedefs, std:: math using-declarations, scalar-type macros, Array/load/store helpers, and extern "C" __global__ void erfinv_kernel_vectorized4_kernel(...) ...]
nvrtc: error: invalid value for --gpu-architecture (-arch)
Environment information
{'sys.platform': 'win11', 'Python': '3.9.16 (main, Mar 8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'CUDA available': True, 'numpy_random_seed': 2147483648, 'GPU 0': 'NVIDIA GeForce RTX 4060 Laptop GPU', 'CUDA_HOME': None, 'GCC': 'n/a', 'PyTorch': '1.12.0+cu113', 'TorchVision': '0.13.0+cu113', 'OpenCV': '4.7.0', 'MMEngine': '0.7.2', 'MMCV': '2.0.0', 'MMPreTrain': '1.0.0rc7+'}
Other information
No response