@zhengye1995 I know they are inherited. What I am asking is what each one is used for, i.e. why did you use zero_shot_mask_rcnn instead of zero_shot_faster_rcnn in the config?
zero_shot_faster_rcnn is used for the ZSD task and zero_shot_mask_rcnn is used for the ZSI task. The biggest difference is whether they include the mask branch; see this code.
By the way, this is just my personal implementation habit. You could also add the mask branch to zero_shot_faster_rcnn to merge the two implementations into one.
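To make the relationship concrete, here is a minimal, self-contained sketch of that pattern; the class and function names are hypothetical stand-ins, not the repository's actual code:

```python
# Sketch only: hypothetical stubs, not the repository's actual classes.
# The point is that the ZSI detector is the ZSD detector plus a mask branch.

def stub_rpn(img):
    return ["proposal_0", "proposal_1"]          # stand-in for region proposals

def stub_bbox_head(proposals):
    return [f"box_for_{p}" for p in proposals]   # stand-in for box/class preds

def stub_mask_head(dets):
    return [f"mask_for_{d}" for d in dets]       # stand-in for per-instance masks

class ZeroShotFasterRCNNSketch:
    """ZSD-style detector: proposals -> box/class predictions only."""
    def __init__(self, rpn, bbox_head):
        self.rpn = rpn
        self.bbox_head = bbox_head

    def forward(self, img):
        return self.bbox_head(self.rpn(img))

class ZeroShotMaskRCNNSketch(ZeroShotFasterRCNNSketch):
    """ZSI-style detector: the same pipeline with an extra mask branch."""
    def __init__(self, rpn, bbox_head, mask_head):
        super().__init__(rpn, bbox_head)
        self.mask_head = mask_head

    def forward(self, img):
        dets = super().forward(img)
        return dets, self.mask_head(dets)

zsd = ZeroShotFasterRCNNSketch(stub_rpn, stub_bbox_head)
zsi = ZeroShotMaskRCNNSketch(stub_rpn, stub_bbox_head, stub_mask_head)
print(zsd.forward("img"))   # boxes only
print(zsi.forward("img"))   # boxes plus per-instance masks
```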
Could you please give me a config file that runs ZSD only, so that I can understand how your work was integrated into the mmdetection framework? I tried the following but got errors:
```python
# model settings
model = dict(
type='ZeroShotFasterRCNN',
pretrained='torchvision://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='BackgroundAwareRPNHead',
in_channels=256,
semantic_dims=300,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
voc_path=None,
vec_path='data/coco/word_w2v_withbg_65_15.txt',
sync_bg=True,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='SharedFCSemanticBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=66,
semantic_dims=300,
seen_class=True,
reg_with_semantic=False,
share_semantic=False,
with_decoder=True,
sync_bg=True,
voc_path='data/coco/vocabulary_w2v.txt',
vec_path='data/coco/word_w2v_withbg_65_15.txt',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False,
loss_semantic=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
)
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_thr=0.5),
max_per_img=100,
mask_thr_binary=0.5))
# dataset settings
dataset_type = 'CocoDatasetUnseen15'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2014_seen_65_15.json',
img_prefix=data_root + 'train2014/',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2014_seen_65_15.json',
img_prefix=data_root + 'val2014/',
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2014_unseen_65_15.json',
img_prefix=data_root + 'val2014/',
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.0075, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=7000,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=12)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
evaluation = dict(interval=1)
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/zsi/baseline/65_15/'
load_from = None
resume_from = None
workflow = [('train', 1)]
```
Would it be convenient for you to provide detailed information about the error?
Not at all :), you're very polite! Here is the error:
```
Traceback (most recent call last):
File "./tools/train.py", line 119, in <module>
main()
File "./tools/train.py", line 115, in main
logger=logger)
File "/home/usdd1/yahee/instance_seg/ZSI2_cu11/mmdet/apis/train.py", line 58, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/home/usdd1/yahee/instance_seg/ZSI2_cu11/mmdet/apis/train.py", line 192, in _dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
File "/home/usdd1/anaconda3/envs/zsi2_cu11/lib/python3.7/site-packages/mmcv/runner/runner.py", line 358, in run
epoch_runner(data_loaders[i], **kwargs)
File "/home/usdd1/anaconda3/envs/zsi2_cu11/lib/python3.7/site-packages/mmcv/runner/runner.py", line 264, in train
self.model, data_batch, train_mode=True, **kwargs)
File "/home/usdd1/yahee/instance_seg/ZSI2_cu11/mmdet/apis/train.py", line 38, in batch_processor
losses = model(**data)
File "/home/usdd1/anaconda3/envs/zsi2_cu11/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/usdd1/anaconda3/envs/zsi2_cu11/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 50, in forward
return self.module(*inputs[0], **kwargs[0])
File "/home/usdd1/anaconda3/envs/zsi2_cu11/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/usdd1/yahee/instance_seg/ZSI2_cu11/mmdet/core/fp16/decorators.py", line 49, in new_func
return old_func(*args, **kwargs)
File "/home/usdd1/yahee/instance_seg/ZSI2_cu11/mmdet/models/detectors/base.py", line 117, in forward
return self.forward_train(img, img_meta, **kwargs)
File "/home/usdd1/yahee/instance_seg/ZSI2_cu11/mmdet/models/detectors/zero_shot_two_stage.py", line 193, in forward_train
*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
TypeError: loss() got multiple values for argument 'gt_bboxes_ignore'
```
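This TypeError means gt_bboxes_ignore received a value twice: the tuple unpacked as *rpn_loss_inputs is long enough that one of its items fills the gt_bboxes_ignore parameter positionally, and then the explicit keyword supplies it a second time. A minimal reproduction of the mechanism (the signature and values below are hypothetical, not the repository's actual rpn_head.loss):

```python
# Hypothetical signature, only to reproduce the error mechanism.
def loss(cls_scores, bbox_preds, gt_bboxes, img_metas, cfg,
         gt_bboxes_ignore=None):
    return "ok"

# Six positional values: the sixth already fills gt_bboxes_ignore, so the
# explicit keyword below supplies that parameter a second time.
rpn_loss_inputs = ("cls", "bbox", "gt", "meta", "cfg", "extra")
try:
    loss(*rpn_loss_inputs, gt_bboxes_ignore=None)
except TypeError as e:
    print(e)  # loss() got multiple values for argument 'gt_bboxes_ignore'
```

So the problem is likely a mismatch between the inputs the detector packs and the loss() signature of the head this config builds, not a problem with your data.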
Try this one:
```python
# model settings
model = dict(
type='ZeroShotFasterRCNN',
pretrained='torchvision://resnet101',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='BackgroundAwareRPNHead',
in_channels=256,
semantic_dims=300,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
voc_path=None,
vec_path='data/coco/word_w2v_withbg_65_15.txt',
sync_bg=True,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='SharedFCSemanticBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=66,
semantic_dims=300,
seen_class=True,
reg_with_semantic=False,
share_semantic=False,
with_decoder=True,
sync_bg=True,
voc_path='data/coco/vocabulary_w2v.txt',
vec_path='data/coco/word_w2v_withbg_65_15.txt',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False,
loss_semantic=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
bbox_with_decoder=True,
bbox_sync_bg=True)
# model training and testing settings
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False))
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_thr=0.5),
max_per_img=100))
# dataset settings
dataset_type = 'CocoDatasetUnseen15'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2014_seen_65_15.json',
img_prefix=data_root + 'train2014/',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2014_seen_65_15.json',
img_prefix=data_root + 'val2014/',
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2014_unseen_65_15.json',
img_prefix=data_root + 'val2014/',
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.0075, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=7000,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=12)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
evaluation = dict(interval=1)
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/zsi/baseline/65_15/'
load_from = None
resume_from = None
workflow = [('train', 1)]
```
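For reference, the changes relative to your config are: bbox_with_decoder=True and bbox_sync_bg=True are added at the model level, and the mask-related settings are removed (mask_size in train_cfg.rcnn, mask_thr_binary in test_cfg.rcnn, with_mask=True in LoadAnnotations, and gt_masks in the Collect keys), since ZeroShotFasterRCNN has no mask branch. If you want to verify the differences yourself, here is a quick sketch; the file paths are placeholders for wherever you saved the two configs, and it assumes mmcv's Config API, which your traceback shows is installed:

```python
from mmcv import Config

# Placeholder paths: point these at the two saved config files.
old_cfg = Config.fromfile('configs/zsd_yours.py')
new_cfg = Config.fromfile('configs/zsd_suggested.py')

# Compare the top-level sections that matter here.
for key in ('model', 'train_cfg', 'test_cfg', 'train_pipeline'):
    if old_cfg[key] != new_cfg[key]:
        print(f'{key} differs')
```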
Thank you!
I am confused about your work. What is the difference between bbox_head_semantic and convfc_bbox_semantic_head? Between ba_anchor_head and anchor_semantic_head? And among zero_shot_faster_rcnn, zero_shot_mask_rcnn, and zero_shot_two_stage?