Closed kkkcx closed 7 months ago
I haven't tested the performance of Eva-Large on the validation data, but in our paper, we compare the performance of Eva-Large and ResNet101 on the test dataset, as shown in the figure below.
Due to company restrictions, I am unable to upload the config code. You can refer to another repository.
Thank you so much for the detailed response!!
I apologize for troubling you again. I have modified the config file to accommodate training with EVA02, but I am encountering an error that I cannot resolve. Could I kindly request your guidance on how to make the necessary adjustments? Thank you very much for your assistance.
config file:
# ================ base config ===================
plugin = True
plugin_dir = "projects/mmdet3d_plugin/"
dist_params = dict(backend="nccl")
log_level = "INFO"
work_dir = None
total_batch_size = 48
num_gpus = 8
batch_size = total_batch_size // num_gpus
num_iters_per_epoch = int(28130 // (num_gpus * batch_size))
num_epochs = 100
checkpoint_epoch_interval = 20
checkpoint_config = dict(
interval=num_iters_per_epoch * checkpoint_epoch_interval
)
log_config = dict(
interval=51,
hooks=[
dict(type="TextLoggerHook", by_epoch=False),
dict(type="TensorboardLoggerHook"),
],
)
load_from = None
resume_from = None
workflow = [("train", 1)]
fp16 = dict(loss_scale=32.0)
input_shape = (704, 256)
tracking_test = True
tracking_threshold = 0.2
# ================== model ========================
class_names = [
"car",
"truck",
"construction_vehicle",
"bus",
"trailer",
"barrier",
"motorcycle",
"bicycle",
"pedestrian",
"traffic_cone",
]
num_classes = len(class_names)
embed_dims = 256
num_groups = 8
num_decoder = 6
num_single_frame_decoder = 1
use_deformable_func = True # mmdet3d_plugin/ops/setup.py needs to be executed
strides = [4, 8, 16, 32]
num_levels = len(strides)
num_depth_layers = 3
drop_out = 0.1
temporal = True
decouple_attn = True
with_quality_estimation = True
sim_fpn=dict(
scale_factors=[4, 2, 1, 0.5],
in_channels=1024,
out_channels=256,
out_indices=[2, 3, 4, 5],
)
model = dict(
type="Sparse4D",
use_grid_mask=True,
use_deformable_func=use_deformable_func,
# img_backbone=dict(
# type="ResNet",
# depth=50,
# num_stages=4,
# frozen_stages=-1,
# norm_eval=False,
# style="pytorch",
# with_cp=True,
# out_indices=(0, 1, 2, 3),
# norm_cfg=dict(type="BN", requires_grad=True),
# pretrained="ckpt/resnet50-19c8e357.pth",
# ),
img_backbone=dict(
type='EVAViT',
img_size=320, # img_size for short side
patch_size=16,
window_size=16,
global_window_size=20,
# If use square image (e.g., set global_window_size=0, else global_window_size=img_size // 16)
in_chans=3,
embed_dim=1024,
depth=24,
num_heads=16,
mlp_ratio=4 * 2 / 3,
window_block_indexes=(
list(range(0, 2)) + list(range(3, 5)) + list(range(6, 8)) + list(range(9, 11)) + list(
range(12, 14)) + list(range(15, 17)) + list(range(18, 20)) + list(range(21, 23))
),
sim_fpn=sim_fpn,
qkv_bias=True,
drop_path_rate=0.3,
with_cp=True,
flash_attn=True,
),
img_neck=dict(
type="FPN",
num_outs=num_levels,
start_level=0,
out_channels=embed_dims,
add_extra_convs="on_output",
relu_before_extra_convs=True,
in_channels=[256, 512, 1024, 2048],
),
depth_branch=dict( # for auxiliary supervision only
type="DenseDepthNet",
embed_dims=embed_dims,
num_depth_layers=num_depth_layers,
loss_weight=0.2,
),
head=dict(
type="Sparse4DHead",
cls_threshold_to_reg=0.05,
decouple_attn=decouple_attn,
instance_bank=dict(
type="InstanceBank",
num_anchor=900,
embed_dims=embed_dims,
anchor="nuscenes_kmeans900.npy",
anchor_handler=dict(type="SparseBox3DKeyPointsGenerator"),
num_temp_instances=600 if temporal else -1,
confidence_decay=0.6,
feat_grad=False,
),
anchor_encoder=dict(
type="SparseBox3DEncoder",
vel_dims=3,
embed_dims=[128, 32, 32, 64] if decouple_attn else 256,
mode="cat" if decouple_attn else "add",
output_fc=not decouple_attn,
in_loops=1,
out_loops=4 if decouple_attn else 2,
),
num_single_frame_decoder=num_single_frame_decoder,
operation_order=(
[
"gnn",
"norm",
"deformable",
"ffn",
"norm",
"refine",
]
* num_single_frame_decoder
+ [
"temp_gnn",
"gnn",
"norm",
"deformable",
"ffn",
"norm",
"refine",
]
* (num_decoder - num_single_frame_decoder)
)[2:],
temp_graph_model=dict(
type="MultiheadAttention",
embed_dims=embed_dims if not decouple_attn else embed_dims * 2,
num_heads=num_groups,
batch_first=True,
dropout=drop_out,
)
if temporal
else None,
graph_model=dict(
type="MultiheadAttention",
embed_dims=embed_dims if not decouple_attn else embed_dims * 2,
num_heads=num_groups,
batch_first=True,
dropout=drop_out,
),
norm_layer=dict(type="LN", normalized_shape=embed_dims),
ffn=dict(
type="AsymmetricFFN",
in_channels=embed_dims * 2,
pre_norm=dict(type="LN"),
embed_dims=embed_dims,
feedforward_channels=embed_dims * 4,
num_fcs=2,
ffn_drop=drop_out,
act_cfg=dict(type="ReLU", inplace=True),
),
deformable_model=dict(
type="DeformableFeatureAggregation",
embed_dims=embed_dims,
num_groups=num_groups,
num_levels=num_levels,
num_cams=6,
attn_drop=0.15,
use_deformable_func=use_deformable_func,
use_camera_embed=True,
residual_mode="cat",
kps_generator=dict(
type="SparseBox3DKeyPointsGenerator",
num_learnable_pts=6,
fix_scale=[
[0, 0, 0],
[0.45, 0, 0],
[-0.45, 0, 0],
[0, 0.45, 0],
[0, -0.45, 0],
[0, 0, 0.45],
[0, 0, -0.45],
],
),
),
refine_layer=dict(
type="SparseBox3DRefinementModule",
embed_dims=embed_dims,
num_cls=num_classes,
refine_yaw=True,
with_quality_estimation=with_quality_estimation,
),
sampler=dict(
type="SparseBox3DTarget",
num_dn_groups=5,
num_temp_dn_groups=3,
dn_noise_scale=[2.0] * 3 + [0.5] * 7,
max_dn_gt=32,
add_neg_dn=True,
cls_weight=2.0,
box_weight=0.25,
reg_weights=[2.0] * 3 + [0.5] * 3 + [0.0] * 4,
cls_wise_reg_weights={
class_names.index("traffic_cone"): [
2.0,
2.0,
2.0,
1.0,
1.0,
1.0,
0.0,
0.0,
1.0,
1.0,
],
},
),
loss_cls=dict(
type="FocalLoss",
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0,
),
loss_reg=dict(
type="SparseBox3DLoss",
loss_box=dict(type="L1Loss", loss_weight=0.25),
loss_centerness=dict(type="CrossEntropyLoss", use_sigmoid=True),
loss_yawness=dict(type="GaussianFocalLoss"),
cls_allow_reverse=[class_names.index("barrier")],
),
decoder=dict(type="SparseBox3DDecoder"),
reg_weights=[2.0] * 3 + [1.0] * 7,
),
)
# ================== data ========================
dataset_type = "NuScenes3DDetTrackDataset"
data_root = "data/nuscenes/"
anno_root = "data/nuscenes_cam/"
anno_root = "data/nuscenes_anno_pkls/"
file_client_args = dict(backend="disk")
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True
)
train_pipeline = [
dict(type="LoadMultiViewImageFromFiles", to_float32=True),
dict(
type="LoadPointsFromFile",
coord_type="LIDAR",
load_dim=5,
use_dim=5,
file_client_args=file_client_args,
),
dict(type="ResizeCropFlipImage"),
dict(
type="MultiScaleDepthMapGenerator",
downsample=strides[:num_depth_layers],
),
dict(type="BBoxRotation"),
dict(type="PhotoMetricDistortionMultiViewImage"),
dict(type="NormalizeMultiviewImage", **img_norm_cfg),
dict(
type="CircleObjectRangeFilter",
class_dist_thred=[55] * len(class_names),
),
dict(type="InstanceNameFilter", classes=class_names),
dict(type="NuScenesSparse4DAdaptor"),
dict(
type="Collect",
keys=[
"img",
"timestamp",
"projection_mat",
"image_wh",
"gt_depth",
"focal",
"gt_bboxes_3d",
"gt_labels_3d",
],
meta_keys=["T_global", "T_global_inv", "timestamp", "instance_id"],
),
]
test_pipeline = [
dict(type="LoadMultiViewImageFromFiles", to_float32=True),
dict(type="ResizeCropFlipImage"),
dict(type="NormalizeMultiviewImage", **img_norm_cfg),
dict(type="NuScenesSparse4DAdaptor"),
dict(
type="Collect",
keys=[
"img",
"timestamp",
"projection_mat",
"image_wh",
],
meta_keys=["T_global", "T_global_inv", "timestamp"],
),
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False,
)
data_basic_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
version="v1.0-trainval",
)
data_aug_conf = {
"resize_lim": (0.40, 0.47),
"final_dim": input_shape[::-1],
"bot_pct_lim": (0.0, 0.0),
"rot_lim": (-5.4, 5.4),
"H": 900,
"W": 1600,
"rand_flip": True,
"rot3d_range": [-0.3925, 0.3925],
}
data = dict(
samples_per_gpu=batch_size,
workers_per_gpu=batch_size,
train=dict(
**data_basic_config,
ann_file=anno_root + "nuscenes_infos_train.pkl",
pipeline=train_pipeline,
test_mode=False,
data_aug_conf=data_aug_conf,
with_seq_flag=True,
sequences_split_num=2,
keep_consistent_seq_aug=True,
),
val=dict(
**data_basic_config,
ann_file=anno_root + "nuscenes_infos_val.pkl",
pipeline=test_pipeline,
data_aug_conf=data_aug_conf,
test_mode=True,
tracking=tracking_test,
tracking_threshold=tracking_threshold,
),
test=dict(
**data_basic_config,
ann_file=anno_root + "nuscenes_infos_val.pkl",
pipeline=test_pipeline,
data_aug_conf=data_aug_conf,
test_mode=True,
tracking=tracking_test,
tracking_threshold=tracking_threshold,
),
)
# ================== training ========================
optimizer = dict(
type="AdamW",
lr=6e-4,
weight_decay=0.001,
paramwise_cfg=dict(
custom_keys={
"img_backbone": dict(lr_mult=0.5),
}
),
)
optimizer_config = dict(grad_clip=dict(max_norm=25, norm_type=2))
lr_config = dict(
policy="CosineAnnealing",
warmup="linear",
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3,
)
runner = dict(
type="IterBasedRunner",
max_iters=num_iters_per_epoch * num_epochs,
)
# ================== eval ========================
vis_pipeline = [
dict(type="LoadMultiViewImageFromFiles", to_float32=True),
dict(
type="Collect",
keys=["img"],
meta_keys=["timestamp", "lidar2img"],
),
]
evaluation = dict(
interval=num_iters_per_epoch * checkpoint_epoch_interval,
pipeline=vis_pipeline,
# out_dir="./vis", # for visualization
)
the error goes like:
/Sparse4D/projects/mmdet3d_plugin/models/sparse4d.py", line 95, in forward
return self.forward_train(img, **data)
/Sparse4D/projects/mmdet3d_plugin/models/sparse4d.py", line 100, in forward_train
feature_maps, depths = self.extract_feat(img, True, data)
/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 146, in new_func
output = old_func(*new_args, **new_kwargs)
/Sparse4D/projects/mmdet3d_plugin/models/sparse4d.py", line 77, in extract_feat
feature_maps = list(self.img_neck(feature_maps))
/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 146, in new_func
output = old_func(*new_args, **new_kwargs)
/python3.8/site-packages/mmdet/models/necks/fpn.py", line 157, in forward
laterals = [
/python3.8/site-packages/mmdet/models/necks/fpn.py", line 158, in <listcomp>
lateral_conv(inputs[i + self.start_level])
/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
/python3.8/site-packages/mmcv/cnn/bricks/conv_module.py", line 207, in forward
x = self.conv(x)
/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
/python3.8/site-packages/torch/nn/modules/conv.py", line 457, in forward
return self._conv_forward(input, self.weight, self.bias)
/python3.8/site-packages/torch/nn/modules/conv.py", line 453, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Given groups=1, weight of size [256, 512, 1, 1], expected input[36, 256, 32, 88] to have 512 channels, but got 256 channels instead
For EVA02, we don't use neck, such as FPN. Just use sim_fpn
to get multi-scale feature maps.
I apologize for troubling you again. I have modified the config file to accommodate training with EVA02, but I am encountering an error that I cannot resolve. Could I kindly request your guidance on how to make the necessary adjustments? Thank you very much for your assistance.
config file:
# ================ base config =================== plugin = True plugin_dir = "projects/mmdet3d_plugin/" dist_params = dict(backend="nccl") log_level = "INFO" work_dir = None total_batch_size = 48 num_gpus = 8 batch_size = total_batch_size // num_gpus num_iters_per_epoch = int(28130 // (num_gpus * batch_size)) num_epochs = 100 checkpoint_epoch_interval = 20 checkpoint_config = dict( interval=num_iters_per_epoch * checkpoint_epoch_interval ) log_config = dict( interval=51, hooks=[ dict(type="TextLoggerHook", by_epoch=False), dict(type="TensorboardLoggerHook"), ], ) load_from = None resume_from = None workflow = [("train", 1)] fp16 = dict(loss_scale=32.0) input_shape = (704, 256) tracking_test = True tracking_threshold = 0.2 # ================== model ======================== class_names = [ "car", "truck", "construction_vehicle", "bus", "trailer", "barrier", "motorcycle", "bicycle", "pedestrian", "traffic_cone", ] num_classes = len(class_names) embed_dims = 256 num_groups = 8 num_decoder = 6 num_single_frame_decoder = 1 use_deformable_func = True # mmdet3d_plugin/ops/setup.py needs to be executed strides = [4, 8, 16, 32] num_levels = len(strides) num_depth_layers = 3 drop_out = 0.1 temporal = True decouple_attn = True with_quality_estimation = True sim_fpn=dict( scale_factors=[4, 2, 1, 0.5], in_channels=1024, out_channels=256, out_indices=[2, 3, 4, 5], ) model = dict( type="Sparse4D", use_grid_mask=True, use_deformable_func=use_deformable_func, # img_backbone=dict( # type="ResNet", # depth=50, # num_stages=4, # frozen_stages=-1, # norm_eval=False, # style="pytorch", # with_cp=True, # out_indices=(0, 1, 2, 3), # norm_cfg=dict(type="BN", requires_grad=True), # pretrained="ckpt/resnet50-19c8e357.pth", # ), img_backbone=dict( type='EVAViT', img_size=320, # img_size for short side patch_size=16, window_size=16, global_window_size=20, # If use square image (e.g., set global_window_size=0, else global_window_size=img_size // 16) in_chans=3, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4 * 2 / 3, window_block_indexes=( list(range(0, 2)) + list(range(3, 5)) + list(range(6, 8)) + list(range(9, 11)) + list( range(12, 14)) + list(range(15, 17)) + list(range(18, 20)) + list(range(21, 23)) ), sim_fpn=sim_fpn, qkv_bias=True, drop_path_rate=0.3, with_cp=True, flash_attn=True, ), img_neck=dict( type="FPN", num_outs=num_levels, start_level=0, out_channels=embed_dims, add_extra_convs="on_output", relu_before_extra_convs=True, in_channels=[256, 512, 1024, 2048], ), depth_branch=dict( # for auxiliary supervision only type="DenseDepthNet", embed_dims=embed_dims, num_depth_layers=num_depth_layers, loss_weight=0.2, ), head=dict( type="Sparse4DHead", cls_threshold_to_reg=0.05, decouple_attn=decouple_attn, instance_bank=dict( type="InstanceBank", num_anchor=900, embed_dims=embed_dims, anchor="nuscenes_kmeans900.npy", anchor_handler=dict(type="SparseBox3DKeyPointsGenerator"), num_temp_instances=600 if temporal else -1, confidence_decay=0.6, feat_grad=False, ), anchor_encoder=dict( type="SparseBox3DEncoder", vel_dims=3, embed_dims=[128, 32, 32, 64] if decouple_attn else 256, mode="cat" if decouple_attn else "add", output_fc=not decouple_attn, in_loops=1, out_loops=4 if decouple_attn else 2, ), num_single_frame_decoder=num_single_frame_decoder, operation_order=( [ "gnn", "norm", "deformable", "ffn", "norm", "refine", ] * num_single_frame_decoder + [ "temp_gnn", "gnn", "norm", "deformable", "ffn", "norm", "refine", ] * (num_decoder - num_single_frame_decoder) )[2:], temp_graph_model=dict( type="MultiheadAttention", embed_dims=embed_dims if not decouple_attn else embed_dims * 2, num_heads=num_groups, batch_first=True, dropout=drop_out, ) if temporal else None, graph_model=dict( type="MultiheadAttention", embed_dims=embed_dims if not decouple_attn else embed_dims * 2, num_heads=num_groups, batch_first=True, dropout=drop_out, ), norm_layer=dict(type="LN", normalized_shape=embed_dims), ffn=dict( type="AsymmetricFFN", in_channels=embed_dims * 2, pre_norm=dict(type="LN"), embed_dims=embed_dims, feedforward_channels=embed_dims * 4, num_fcs=2, ffn_drop=drop_out, act_cfg=dict(type="ReLU", inplace=True), ), deformable_model=dict( type="DeformableFeatureAggregation", embed_dims=embed_dims, num_groups=num_groups, num_levels=num_levels, num_cams=6, attn_drop=0.15, use_deformable_func=use_deformable_func, use_camera_embed=True, residual_mode="cat", kps_generator=dict( type="SparseBox3DKeyPointsGenerator", num_learnable_pts=6, fix_scale=[ [0, 0, 0], [0.45, 0, 0], [-0.45, 0, 0], [0, 0.45, 0], [0, -0.45, 0], [0, 0, 0.45], [0, 0, -0.45], ], ), ), refine_layer=dict( type="SparseBox3DRefinementModule", embed_dims=embed_dims, num_cls=num_classes, refine_yaw=True, with_quality_estimation=with_quality_estimation, ), sampler=dict( type="SparseBox3DTarget", num_dn_groups=5, num_temp_dn_groups=3, dn_noise_scale=[2.0] * 3 + [0.5] * 7, max_dn_gt=32, add_neg_dn=True, cls_weight=2.0, box_weight=0.25, reg_weights=[2.0] * 3 + [0.5] * 3 + [0.0] * 4, cls_wise_reg_weights={ class_names.index("traffic_cone"): [ 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, ], }, ), loss_cls=dict( type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0, ), loss_reg=dict( type="SparseBox3DLoss", loss_box=dict(type="L1Loss", loss_weight=0.25), loss_centerness=dict(type="CrossEntropyLoss", use_sigmoid=True), loss_yawness=dict(type="GaussianFocalLoss"), cls_allow_reverse=[class_names.index("barrier")], ), decoder=dict(type="SparseBox3DDecoder"), reg_weights=[2.0] * 3 + [1.0] * 7, ), ) # ================== data ======================== dataset_type = "NuScenes3DDetTrackDataset" data_root = "data/nuscenes/" anno_root = "data/nuscenes_cam/" anno_root = "data/nuscenes_anno_pkls/" file_client_args = dict(backend="disk") img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True ) train_pipeline = [ dict(type="LoadMultiViewImageFromFiles", to_float32=True), dict( type="LoadPointsFromFile", coord_type="LIDAR", load_dim=5, use_dim=5, file_client_args=file_client_args, ), dict(type="ResizeCropFlipImage"), dict( type="MultiScaleDepthMapGenerator", downsample=strides[:num_depth_layers], ), dict(type="BBoxRotation"), dict(type="PhotoMetricDistortionMultiViewImage"), dict(type="NormalizeMultiviewImage", **img_norm_cfg), dict( type="CircleObjectRangeFilter", class_dist_thred=[55] * len(class_names), ), dict(type="InstanceNameFilter", classes=class_names), dict(type="NuScenesSparse4DAdaptor"), dict( type="Collect", keys=[ "img", "timestamp", "projection_mat", "image_wh", "gt_depth", "focal", "gt_bboxes_3d", "gt_labels_3d", ], meta_keys=["T_global", "T_global_inv", "timestamp", "instance_id"], ), ] test_pipeline = [ dict(type="LoadMultiViewImageFromFiles", to_float32=True), dict(type="ResizeCropFlipImage"), dict(type="NormalizeMultiviewImage", **img_norm_cfg), dict(type="NuScenesSparse4DAdaptor"), dict( type="Collect", keys=[ "img", "timestamp", "projection_mat", "image_wh", ], meta_keys=["T_global", "T_global_inv", "timestamp"], ), ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, ) data_basic_config = dict( type=dataset_type, data_root=data_root, classes=class_names, modality=input_modality, version="v1.0-trainval", ) data_aug_conf = { "resize_lim": (0.40, 0.47), "final_dim": input_shape[::-1], "bot_pct_lim": (0.0, 0.0), "rot_lim": (-5.4, 5.4), "H": 900, "W": 1600, "rand_flip": True, "rot3d_range": [-0.3925, 0.3925], } data = dict( samples_per_gpu=batch_size, workers_per_gpu=batch_size, train=dict( **data_basic_config, ann_file=anno_root + "nuscenes_infos_train.pkl", pipeline=train_pipeline, test_mode=False, data_aug_conf=data_aug_conf, with_seq_flag=True, sequences_split_num=2, keep_consistent_seq_aug=True, ), val=dict( **data_basic_config, ann_file=anno_root + "nuscenes_infos_val.pkl", pipeline=test_pipeline, data_aug_conf=data_aug_conf, test_mode=True, tracking=tracking_test, tracking_threshold=tracking_threshold, ), test=dict( **data_basic_config, ann_file=anno_root + "nuscenes_infos_val.pkl", pipeline=test_pipeline, data_aug_conf=data_aug_conf, test_mode=True, tracking=tracking_test, tracking_threshold=tracking_threshold, ), ) # ================== training ======================== optimizer = dict( type="AdamW", lr=6e-4, weight_decay=0.001, paramwise_cfg=dict( custom_keys={ "img_backbone": dict(lr_mult=0.5), } ), ) optimizer_config = dict(grad_clip=dict(max_norm=25, norm_type=2)) lr_config = dict( policy="CosineAnnealing", warmup="linear", warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3, ) runner = dict( type="IterBasedRunner", max_iters=num_iters_per_epoch * num_epochs, ) # ================== eval ======================== vis_pipeline = [ dict(type="LoadMultiViewImageFromFiles", to_float32=True), dict( type="Collect", keys=["img"], meta_keys=["timestamp", "lidar2img"], ), ] evaluation = dict( interval=num_iters_per_epoch * checkpoint_epoch_interval, pipeline=vis_pipeline, # out_dir="./vis", # for visualization )
the error goes like:
/Sparse4D/projects/mmdet3d_plugin/models/sparse4d.py", line 95, in forward return self.forward_train(img, **data) /Sparse4D/projects/mmdet3d_plugin/models/sparse4d.py", line 100, in forward_train feature_maps, depths = self.extract_feat(img, True, data) /python3.8/site-packages/mmcv/runner/fp16_utils.py", line 146, in new_func output = old_func(*new_args, **new_kwargs) /Sparse4D/projects/mmdet3d_plugin/models/sparse4d.py", line 77, in extract_feat feature_maps = list(self.img_neck(feature_maps)) /python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl return forward_call(*input, **kwargs) /python3.8/site-packages/mmcv/runner/fp16_utils.py", line 146, in new_func output = old_func(*new_args, **new_kwargs) /python3.8/site-packages/mmdet/models/necks/fpn.py", line 157, in forward laterals = [ /python3.8/site-packages/mmdet/models/necks/fpn.py", line 158, in <listcomp> lateral_conv(inputs[i + self.start_level]) /python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl return forward_call(*input, **kwargs) /python3.8/site-packages/mmcv/cnn/bricks/conv_module.py", line 207, in forward x = self.conv(x) /python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl return forward_call(*input, **kwargs) /python3.8/site-packages/torch/nn/modules/conv.py", line 457, in forward return self._conv_forward(input, self.weight, self.bias) /python3.8/site-packages/torch/nn/modules/conv.py", line 453, in _conv_forward return F.conv2d(input, weight, bias, self.stride, RuntimeError: Given groups=1, weight of size [256, 512, 1, 1], expected input[36, 256, 32, 88] to have 512 channels, but got 256 channels instead
Same needs. Have you successfully completed the EVA configuration? I would greatly appreciate it if you could share your configuration.
Hello Dr. Lin,
I was reviewing the data you provided and noticed that in the 'Results on Validation Split' section, you used Res101 to achieve the best results, whereas in the 'Results on Test Split', you switched to using EVA02-large. Could you please tell me if using EVA02-large would yield better results than Res101 specifically for the validation results?
Also, I would greatly appreciate it if you could share the training details for models using EVA02-large as a backbone, or briefly explain how to modify the config file sparse4dv3_temporal_r50_1x8_bs6_256x704.py to accommodate EVA02-large?
Thank you so much!!