IDEA-Research / DINO

[ICLR 2023] Official implementation of the paper "DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection"
Apache License 2.0

When I finish training on a custom dataset, I have a problem using the trained model for inference #127

Closed ahxiaofengzheng closed 1 year ago

ahxiaofengzheng commented 1 year ago

This is my command to train the model:

python main.py \
    --output_dir logs/DINO/R50-MS4_enhance -c ./config/DINO/DINO_4scale_swin.py --coco_path ./Data/UTDAC2020_enhance \
    --finetune_ignore label_enc.weight class_embed \
    --pretrain_model_path ./ckpts/checkpoint0011_4scale_swin.pth \
    --options dn_scalar=100 embed_init_tgt=TRUE \
    dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
    dn_box_noise_scale=1.0 backbone_dir=./pretrained_backbone

The problem:

_IncompatibleKeys(missing_keys=['norm1.weight', 'norm1.bias', 'norm2.weight', 'norm2.bias', 'norm3.weight', 'norm3.bias'], unexpected_keys=['norm.weight', 'norm.bias', 'layers.0.blocks.1.attn_mask', 'layers.1.blocks.1.attn_mask', 'layers.2.blocks.1.attn_mask', 'layers.2.blocks.3.attn_mask', 'layers.2.blocks.5.attn_mask', 'layers.2.blocks.7.attn_mask', 'layers.2.blocks.9.attn_mask', 'layers.2.blocks.11.attn_mask', 'layers.2.blocks.13.attn_mask', 'layers.2.blocks.15.attn_mask', 'layers.2.blocks.17.attn_mask'])

HaoZhang534 commented 1 year ago

Can you provide your inference log? This error seems to happen when loading the Swin-L backbone in strict mode.
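
For context, the behavior being described is PyTorch's plain load_state_dict semantics, not anything specific to this repo. A minimal, self-contained toy sketch of strict vs. non-strict loading (the module names here are made up to mimic the classification-vs-detection mismatch):

import torch.nn as nn

# Toy illustration only: a "classification" module with a single final norm
# vs. a "detection" module with per-stage norms.
cls_model = nn.Sequential()
cls_model.add_module('body', nn.Linear(4, 4))
cls_model.add_module('norm', nn.LayerNorm(4))

det_model = nn.Sequential()
det_model.add_module('body', nn.Linear(4, 4))
det_model.add_module('norm1', nn.LayerNorm(4))
det_model.add_module('norm2', nn.LayerNorm(4))

# strict=False: mismatches are only reported via _IncompatibleKeys, loading continues.
result = det_model.load_state_dict(cls_model.state_dict(), strict=False)
print(result.missing_keys)     # ['norm1.weight', 'norm1.bias', 'norm2.weight', 'norm2.bias']
print(result.unexpected_keys)  # ['norm.weight', 'norm.bias']

# strict=True: the same mismatch raises a RuntimeError instead of printing a warning.
try:
    det_model.load_state_dict(cls_model.state_dict(), strict=True)
except RuntimeError as e:
    print('strict load failed:', e)

The keys in the log above appear to fit this pattern: the detection backbone adds per-stage norms (norm1/norm2/norm3) that the ImageNet classification checkpoint does not have, while the classification checkpoint carries a final 'norm' and cached attn_mask buffers that the detection backbone does not use.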

ahxiaofengzheng commented 1 year ago

My inference code was modified from inference_and_visualization.ipynb.

This is the log during inference:

Config (path: ./config/DINO/DINO_4scale_swin.py): {'data_aug_scales': [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800], 'data_aug_max_size': 1333, 'data_aug_scales2_resize': [400, 500, 600], 'data_aug_scales2_crop': [384, 600], 'data_aug_scale_overlap': None, 'num_classes': 5, 'lr': 0.0001, 'param_dict_type': 'default', 'lr_backbone': 1e-05, 'lr_backbone_names': ['backbone.0'], 'lr_linear_proj_names': ['reference_points', 'sampling_offsets'], 'lr_linear_proj_mult': 0.1, 'ddetr_lr_param': False, 'batch_size': 2, 'weight_decay': 0.0001, 'epochs': 12, 'lr_drop': 11, 'save_checkpoint_interval': 1, 'clip_max_norm': 0.1, 'onecyclelr': False, 'multi_step_lr': False, 'lr_drop_list': [33, 45], 'modelname': 'dino', 'frozen_weights': None, 'backbone': 'swin_L_384_22k', 'use_checkpoint': True, 'dilation': False, 'position_embedding': 'sine', 'pe_temperatureH': 20, 'pe_temperatureW': 20, 'return_interm_indices': [1, 2, 3], 'backbone_freeze_keywords': None, 'enc_layers': 6, 'dec_layers': 6, 'unic_layers': 0, 'pre_norm': False, 'dim_feedforward': 2048, 'hidden_dim': 256, 'dropout': 0.0, 'nheads': 8, 'num_queries': 900, 'query_dim': 4, 'num_patterns': 0, 'pdetr3_bbox_embed_diff_each_layer': False, 'pdetr3_refHW': -1, 'random_refpoints_xy': False, 'fix_refpoints_hw': -1, 'dabdetr_yolo_like_anchor_update': False, 'dabdetr_deformable_encoder': False, 'dabdetr_deformable_decoder': False, 'use_deformable_box_attn': False, 'box_attn_type': 'roi_align', 'dec_layer_number': None, 'num_feature_levels': 4, 'enc_n_points': 4, 'dec_n_points': 4, 'decoder_layer_noise': False, 'dln_xy_noise': 0.2, 'dln_hw_noise': 0.2, 'add_channel_attention': False, 'add_pos_value': False, 'two_stage_type': 'standard', 'two_stage_pat_embed': 0, 'two_stage_add_query_num': 0, 'two_stage_bbox_embed_share': False, 'two_stage_class_embed_share': False, 'two_stage_learn_wh': False, 'two_stage_default_hw': 0.05, 'two_stage_keep_all_tokens': False, 'num_select': 300, 'transformer_activation': 'relu', 'batch_norm_type': 'FrozenBatchNorm2d', 'masks': False, 'aux_loss': True, 'set_cost_class': 2.0, 'set_cost_bbox': 5.0, 'set_cost_giou': 2.0, 'cls_loss_coef': 1.0, 'mask_loss_coef': 1.0, 'dice_loss_coef': 1.0, 'bbox_loss_coef': 5.0, 'giou_loss_coef': 2.0, 'enc_loss_coef': 1.0, 'interm_loss_coef': 1.0, 'no_interm_box_loss': False, 'focal_alpha': 0.25, 'decoder_sa_type': 'sa', 'matcher_type': 'HungarianMatcher', 'decoder_module_seq': ['sa', 'ca', 'ffn'], 'nms_iou_threshold': -1, 'dec_pred_bbox_embed_share': True, 'dec_pred_class_embed_share': True, 'use_dn': True, 'dn_number': 100, 'dn_box_noise_scale': 0.4, 'dn_label_noise_ratio': 0.5, 'embed_init_tgt': True, 'dn_labelbook_size': 5, 'match_unstable_error': True, 'use_ema': False, 'ema_decay': 0.9997, 'ema_epoch': 0, 'use_detached_boxes_dec_out': False, 'backbone_dir': './pretrained_backbone'}
use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!
/home/xfz/anaconda3/envs/DINO_env/lib/python3.7/site-packages/torch/functional.py:478: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:2894.)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
_IncompatibleKeys(missing_keys=['norm1.weight', 'norm1.bias', 'norm2.weight', 'norm2.bias', 'norm3.weight', 'norm3.bias'], unexpected_keys=['norm.weight', 'norm.bias', 'layers.0.blocks.1.attn_mask', 'layers.1.blocks.1.attn_mask', 'layers.2.blocks.1.attn_mask', 'layers.2.blocks.3.attn_mask', 'layers.2.blocks.5.attn_mask', 'layers.2.blocks.7.attn_mask', 'layers.2.blocks.9.attn_mask', 'layers.2.blocks.11.attn_mask', 'layers.2.blocks.13.attn_mask', 'layers.2.blocks.15.attn_mask', 'layers.2.blocks.17.attn_mask'])

Inference code:

# Imports assumed to follow the DINO repo layout, as in inference_and_visualization.ipynb.
import json
import os

import torch
from PIL import Image
from tqdm import tqdm

import datasets.transforms as T
from main import build_model_main
from util import box_ops
from util.slconfig import SLConfig
from util.visualizer import COCOVisualizer


def inference_img(model, id2name, transform, img_path, out_path):
    img_dir = os.listdir(img_path)
    loop = tqdm(enumerate(img_dir), total=len(img_dir))

    for i, img_name in loop:
        img_file = os.path.join(img_path, img_name)
        img_num = img_name.split('.')[0]
        image = Image.open(img_file).convert("RGB")  # load image

        image, _ = transform(image, None)

        # predict images
        output = model.cuda()(image[None].cuda())
        # note: relies on the global `postprocessors` created in the __main__ block below
        output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

        # visualize outputs
        threshold = 0.5  # set a threshold

        vslzr = COCOVisualizer()
        # savei = COCOSaveImager()

        scores = output['scores']
        labels = output['labels']
        boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])
        select_mask = scores > threshold

        # box_label = [id2name[int(item)] for item in labels[select_mask]]
        box_label = [id2name[int(item[0])] + ": " + str(round(float(item[1]), 3)) for item in zip(labels[select_mask], scores[select_mask])]
        # box_score = [float(item) for item in scores[select_mask]]
        pred_dict = {
            'boxes': boxes[select_mask],
            'size': torch.Tensor([image.shape[1], image.shape[2]]),
            'box_label': box_label,
            'image_id': img_num
        }
        vslzr.visualize(image, pred_dict, savedir=out_path, dpi=100)

if __name__ == '__main__':
    model_config_path = "./config/DINO/DINO_4scale_swin.py"  # change the path of the model config file
    model_checkpoint_path = "/media/xfz/8bb024a0-448a-47ad-9f71-d1e3b67fc5c7/DINO_models/DINO_underwater_12_4scale_double/checkpoint_best_regular.pth"  # change the path of the model checkpoint
    backbone_dir = 'backbone_dir=./pretrained_backbone'

    args = SLConfig.fromfile(model_config_path)
    print(args)

    args.device = 'cuda'
    model, criterion, postprocessors = build_model_main(args)
    checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    _ = model.eval()

    # load coco names
    with open('./util/coco_id2name.json') as f:
        id2name = json.load(f)
        id2name = {int(k): v for k, v in id2name.items()}

    # transform images
    transform = T.Compose([
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    img_path_ori_large = '/media/xfz/8bb024a0-448a-47ad-9f71-d1e3b67fc5c7/DINO_detection/large_val_img'
    out_path_ori_large = '/media/xfz/8bb024a0-448a-47ad-9f71-d1e3b67fc5c7/DINO_detection/large_val_img_detection'
    inference_img(model, id2name, transform, img_path_ori_large, out_path_ori_large)
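
A note on the postprocessor call in the loop above: in DETR-family postprocessors the second argument is a batch of target (height, width) sizes used to rescale the normalized boxes, so passing torch.Tensor([[1.0, 1.0]]) keeps the boxes normalized, which matches what COCOVisualizer expects together with the 'size' entry. If pixel-space boxes on the original image are needed instead, a rough sketch (variable names reuse the loop above, and raw_output/pred_px are hypothetical names; this assumes the standard DETR-style PostProcess behavior):

# Rough sketch, assuming DETR-style PostProcess semantics (target_sizes = [[h, w]] in pixels).
orig_w, orig_h = Image.open(img_file).size              # original image size before resizing
target_sizes = torch.tensor([[orig_h, orig_w]], dtype=torch.float32).cuda()
raw_output = model.cuda()(image[None].cuda())            # raw model output, before postprocessing
pred_px = postprocessors['bbox'](raw_output, target_sizes)[0]
# pred_px['boxes'] should then be xyxy boxes in original-image pixel coordinates.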

ahxiaofengzheng commented 1 year ago

The inference code works fine, but I'm concerned whether the missing checkpoint keys affect the inference results.

HaoZhang534 commented 1 year ago

It will not affect the results. This warning appears when the pre-trained Swin backbone is loaded. When you then load the DINO checkpoint, the backbone weights are overwritten.
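
A quick way to confirm this, reusing the variables from the inference script above (a hedged sketch that only relies on load_state_dict's return value): the strict load in the script already succeeding means no keys were missing, but the key lists can also be inspected explicitly.

# Sketch: confirm the trained DINO checkpoint covers the whole model, backbone included.
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
load_result = model.load_state_dict(checkpoint['model'], strict=False)
print('missing keys:', load_result.missing_keys)        # expect [] if every weight was restored
print('unexpected keys:', load_result.unexpected_keys)  # expect [] as well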

ahxiaofengzheng commented 1 year ago

Thanks for your help.