Open billbliss3 opened 9 months ago
If I change the config to 'internimage_t_1k_224', it works fine.
type='InternImage',
core_op='DCNv3',
channels=64,
depths=[4, 4, 18, 4],
groups=[4, 8, 16, 32],
mlp_ratio=4.,
drop_path_rate=0.2,
norm_layer='LN',
layer_scale=1.0,
offset_scale=1.0,
post_norm=False,
with_cp=False,
out_indices=(2, 3),
use_dcn_v4_op=True,
init_cfg=dict(type='Pretrained', checkpoint='ckpts/internimage_t_1k_224.pth')
I installed DCNv4 version 1.0.0.post2 via pip:
pip install DCNv4==1.0.0.post2
The config file is as follows:
But I get the following error log:
File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 138, in run iter_runner(iter_loaders[i], **kwargs) File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 68, in train self.call_hook('after_train_iter') File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/base_runner.py", line 309, in call_hook getattr(hook, fn_name)(self) File "/opt/conda/lib/python3.8/site-packages/mmcv/runner/hooks/optimizer.py", line 56, in after_train_iter runner.outputs['loss'].backward() File "/opt/conda/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) File "/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward Variable._execution_engine.run_backward( File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 199, in apply return user_fn(self, *args) File "/opt/conda/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 138, in backward torch.autograd.backward(outputs_with_grad, args_with_grad) File "/opt/conda/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward Variable._execution_engine.run_backward( File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 199, in apply return user_fn(self, *args) File "/opt/conda/lib/python3.8/site-packages/torch/autograd/function.py", line 340, in wrapper outputs = fn(ctx, *args) File "/opt/conda/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 111, in decorate_bwd return bwd(*args, **kwargs) File "/opt/conda/lib/python3.8/site-packages/DCNv4/functions/dcnv4_func.py", line 125, in backward ext.dcnv4backward(*args) RuntimeError: false INTERNAL ASSERT FAILED at "/tmp/pip-install-3xkvdwi/dcnv4_443530a10fe9416eb8a6d1a2a10d577b/src/cuda/dcnv4_col2im_cuda.cuh":470, please report a bug to PyTorch.
kernel launch error ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 847) of binary: /opt/conda/bin/python Traceback (most recent call last): File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main return _run_code(code, main_globals, None, File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code exec(code, run_globals) File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launch.py", line 193, in <module>
main()
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 710, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: