The error occurred when running the new pipeline with our own configuration file, which was ported from the old pipeline's configuration file and adapted to the new format. In short, the error message is:
RuntimeError: Given transposed=1, weight of size [512, 512, 2, 2], expected input[2, 1536, 14, 14] to have 512 channels, but got 1536 channels instead
We encountered a similar issue before, when we first tried to fine-tune the GFM model with the old 4-band weights; it was a normalization error in the dataset pipeline or in the model itself, and it was fixed in the old pipeline.
Here is the full error message for the shape-mismatch issue:
Traceback (most recent call last):
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmseg/.mim/tools/train.py", line 242, in <module>
main()
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmseg/.mim/tools/train.py", line 231, in main
train_segmentor(
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmseg/apis/train.py", line 194, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 127, in run
epoch_runner(data_loaders[i], **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 50, in train
self.run_iter(data_batch, train_mode=True, **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 29, in run_iter
outputs = self.model.train_step(data_batch, self.optimizer,
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmcv/parallel/distributed.py", line 59, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmseg/models/segmentors/base.py", line 138, in train_step
losses = self(**data_batch)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 110, in new_func
return old_func(*args, **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmseg/models/segmentors/base.py", line 108, in forward
return self.forward_train(img, img_metas, **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmseg/models/segmentors/encoder_decoder.py", line 140, in forward_train
x = self.extract_feat(img)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmseg/models/segmentors/encoder_decoder.py", line 68, in extract_feat
x = self.neck(x)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/geospatial_fm/geospatial_fm.py", line 269, in forward
x = self.fpn1(x)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 927, in forward
return F.conv_transpose2d(
RuntimeError: Given transposed=1, weight of size [512, 512, 2, 2], expected input[2, 1536, 14, 14] to have 512 channels, but got 1536 channels instead
Traceback (most recent call last):
File "/opt/conda/envs/hls_test/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/envs/hls_test/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/torch/distributed/launch.py", line 260, in <module>
main()
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/torch/distributed/launch.py", line 255, in main
raise subprocess.CalledProcessError(returncode=process.returncode,
subprocess.CalledProcessError: Command '['/opt/conda/envs/hls_test/bin/python3.9', '-u', '/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmseg/.mim/tools/train.py', '--local_rank=0', 'configs/cropclassification_config.py', '--launcher', 'pytorch']' returned non-zero exit status 1.
Traceback (most recent call last):
File "/opt/conda/envs/hls_test/bin/mim", line 8, in <module>
sys.exit(cli())
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/click/core.py", line 1078, in main
rv = self.invoke(ctx)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mim/commands/train.py", line 100, in cli
is_success, msg = train(
File "/opt/conda/envs/hls_test/lib/python3.9/site-packages/mim/commands/train.py", line 261, in train
ret = subprocess.check_call(
File "/opt/conda/envs/hls_test/lib/python3.9/subprocess.py", line 373, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/opt/conda/envs/hls_test/bin/python3.9', '-m', 'torch.distributed.launch', '--nproc_per_node=1', '--master_port=25705', '/opt/conda/envs/hls_test/lib/python3.9/site-packages/mmseg/.mim/tools/train.py', 'configs/cropclassification_config.py', '--launcher', 'pytorch']' returned non-zero exit status 1.
The error occurred when running the new pipeline with our own configuration file, which was ported from the old pipeline's configuration file and adapted to the new format. In short, the error message is:
RuntimeError: Given transposed=1, weight of size [512, 512, 2, 2], expected input[2, 1536, 14, 14] to have 512 channels, but got 1536 channels instead
We encountered a similar issue before, when we first tried to fine-tune the GFM model with the old 4-band weights; it was a normalization error in the dataset pipeline or in the model itself, and it was fixed in the old pipeline.
Here is the full error message for the shape-mismatch issue: