AMP fails for cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv

Abermal commented 1 year ago

Instructions To Reproduce the Issue and Full Logs:

from detectron2.modeling import build_model
from detectron2.model_zoo import get_config
from torch.cuda.amp import autocast
# conf = get_config("Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml") # this works just fine
conf = get_config("Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml")
model = build_model(conf)
image_size = (3, 1024, 807)

model.eval() # set to eval to bypass the instance generation
batch = [{"image": torch.rand(*image_size, dtype=torch.float16)}]
with autocast():
    model(batch)

This raises:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_17369/2205532709.py in <module>
     10 batch = [{"image": torch.rand(*image_size, dtype=torch.float16)}]
     11 with autocast():
---> 12     model(batch)

/opt/conda/envs/env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/opt/conda/envs/env/lib/python3.7/site-packages/detectron2/modeling/meta_arch/rcnn.py in forward(self, batched_inputs)
    144         """
    145         if not self.training:
--> 146             return self.inference(batched_inputs)
    147 
    148         images = self.preprocess_image(batched_inputs)

/opt/conda/envs/env/lib/python3.7/site-packages/detectron2/modeling/meta_arch/rcnn.py in inference(self, batched_inputs, detected_instances, do_postprocess)
    198 
    199         images = self.preprocess_image(batched_inputs)
--> 200         features = self.backbone(images.tensor)
    201 
    202         if detected_instances is None:

/opt/conda/envs/env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/opt/conda/envs/env/lib/python3.7/site-packages/detectron2/modeling/backbone/fpn.py in forward(self, x)
    124                 ["p2", "p3", ..., "p6"].
    125         """
--> 126         bottom_up_features = self.bottom_up(x)
    127         results = []
    128         prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]])

/opt/conda/envs/env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/opt/conda/envs/env/lib/python3.7/site-packages/detectron2/modeling/backbone/resnet.py in forward(self, x)
    447             outputs["stem"] = x
    448         for name, stage in zip(self.stage_names, self.stages):
--> 449             x = stage(x)
    450             if name in self._out_features:
    451                 outputs[name] = x

/opt/conda/envs/env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/opt/conda/envs/env/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input)
    115     def forward(self, input):
    116         for module in self:
--> 117             input = module(input)
    118         return input
    119 

/opt/conda/envs/env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/opt/conda/envs/env/lib/python3.7/site-packages/detectron2/modeling/backbone/resnet.py in forward(self, x)
    313         else:
    314             offset = self.conv2_offset(out)
--> 315             out = self.conv2(out, offset)
    316         out = F.relu_(out)
    317 

/opt/conda/envs/env/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/opt/conda/envs/env/lib/python3.7/site-packages/detectron2/layers/deform_conv.py in forward(self, x, offset)
    389             self.dilation,
    390             self.groups,
--> 391             self.deformable_groups,
    392         )
    393         if self.norm is not None:

/opt/conda/envs/env/lib/python3.7/site-packages/detectron2/layers/deform_conv.py in forward(ctx, input, offset, weight, stride, padding, dilation, groups, deformable_groups, im2col_step)
     76                 ctx.groups,
     77                 ctx.deformable_groups,
---> 78                 cur_im2col_step,
     79             )
     80         return output

RuntimeError: expected scalar type Half but found Float

Also, I've noticed that

>>> model.preprocess_image(batch).tensor.dtype
... torch.float32

for both models.

Environment:

Linux 18 LTS, Python 3.7.8, detectron2 0.5+cu101

github-actions[bot] commented 1 year ago

You've chosen to report an unexpected problem or bug. Unless you already know the root cause of it, please include details about it by filling out the issue template. The following information is missing: "Instructions To Reproduce the Issue and Full Logs".

Asiachanel commented 1 year ago

Yeah, that is what I need. Thanks.

facebookresearch / detectron2

AMP fails for cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv #5099

Instructions To Reproduce the Issue and Full Logs:

Environment: