NVIDIA / TensorRT

NVIDIA® TensorRT™ is an SDK for high-performance deep learning inference on NVIDIA GPUs. This repository contains the open source components of TensorRT.
https://developer.nvidia.com/tensorrt
Apache License 2.0

INT8 quantized model performs worse than, or similar to, non-quantized FP32 or FP16 model #4180

Open Raj-vivid opened 3 days ago

Raj-vivid commented 3 days ago

I am using a pretrained ConvNeXt V2 model from timm. It comprises LayerNorm and GlobalResponseNorm layers, but even after adding custom quant modules for LayerNorm, LayerNorm2d, and GlobalResponseNorm (GRN), I still can't make the model run faster than the base FP16 engine. I am using the TensorRT Python API, and ModelOpt (TensorRT Model Optimizer) to perform the quantization.
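
For reference, a minimal sketch of the baseline setup being described (the exact ConvNeXt V2 variant and input size are assumptions; "convnextv2_tiny" is just an example):

import timm
import torch

# Illustrative baseline; the actual variant used in the issue is not specified.
model = timm.create_model("convnextv2_tiny", pretrained=True).cuda().eval()
dummy_input = torch.randn(1, 3, 224, 224, device="cuda")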

My code for creating the custom quantized modules is as follows:

# Imports assumed by this snippet (PyTorch, timm, and ModelOpt).
import torch
import torch.nn.functional as F
from torch.nn import LayerNorm

import modelopt.torch.quantization as mtq
from modelopt.torch.quantization.nn import TensorQuantizer
from timm.layers import LayerNorm2d, GlobalResponseNorm


class QuantLayerNorm(LayerNorm):
    def __init__(self, normalized_shape):
        super().__init__(normalized_shape)
        self._setup()

    def _setup(self):
        # Quantizers for the activation input and the affine weight.
        self.input_quantizer = TensorQuantizer()
        self.weight_quantizer = TensorQuantizer()

    def forward(self, input):
        input = self.input_quantizer(input)
        weight = self.weight_quantizer(self.weight)
        return F.layer_norm(input, self.normalized_shape, weight, self.bias, self.eps)

class QuantLayerNorm2d(LayerNorm2d):
    def __init__(self, normalized_shape):
        super().__init__(normalized_shape)
        self._setup()

    def _setup(self):
        self.input_quantizer = TensorQuantizer()
        self.weight_quantizer = TensorQuantizer()

    def forward(self, input):
        input = self.input_quantizer(input)
        weight = self.weight_quantizer(self.weight)
        # LayerNorm2d normalizes in channels-last layout: NCHW -> NHWC -> NCHW.
        input = input.permute(0, 2, 3, 1)
        input = F.layer_norm(input, self.normalized_shape, weight, self.bias, self.eps)
        input = input.permute(0, 3, 1, 2)
        return input

class QuantGlobalResponseNorm(GlobalResponseNorm):
    """Quantized Global Response Normalization layer with Tensor Quantizers."""

    def __init__(self, dim, eps=1e-6, channels_last=True):
        # Parent init sets eps, spatial_dim, channel_dim, wb_shape, and the
        # affine weight/bias parameters (it requires dim to be passed through).
        super().__init__(dim, eps=eps, channels_last=channels_last)
        self._setup()

    def _setup(self):
        self.input_quantizer = TensorQuantizer()
        self.weight_quantizer = TensorQuantizer()
        # self.bias_quantizer = TensorQuantizer()

    def forward(self, x):
        x = self.input_quantizer(x)
        quant_weight = self.weight_quantizer(self.weight)

        # Bias is left unquantized.
        # quant_bias = self.bias_quantizer(self.bias)
        quant_bias = self.bias

        # GRN: L2 norm over the spatial dims, normalized by its mean across channels.
        x_g = x.norm(p=2, dim=self.spatial_dim, keepdim=True)
        x_n = x_g / (x_g.mean(dim=self.channel_dim, keepdim=True) + self.eps)
        # Output: x + bias + weight * (x * x_n)
        return x + torch.addcmul(
            quant_bias.view(self.wb_shape),
            quant_weight.view(self.wb_shape),
            x * x_n,
        )

# Register the quantized classes so ModelOpt swaps them in for the originals.
mtq.register(original_cls=LayerNorm, quantized_cls=QuantLayerNorm)
mtq.register(original_cls=LayerNorm2d, quantized_cls=QuantLayerNorm2d)
mtq.register(original_cls=GlobalResponseNorm, quantized_cls=QuantGlobalResponseNorm)

I am using the following config:

{'quant_cfg': {'*weight_quantizer': {'num_bits': 8, 'axis': 0},
  '*input_quantizer': {'num_bits': 8, 'axis': None},
  '*lm_head*': {'enable': False},
  '*block_sparse_moe.gate*': {'enable': False},
  '*router*': {'enable': False},
  '*output_layer*': {'enable': False},
  'output.*': {'enable': False},
  'nn.BatchNorm1d': {'*': {'enable': False}},
  'nn.BatchNorm2d': {'*': {'enable': False}},
  'nn.BatchNorm3d': {'*': {'enable': False}},
  'nn.LeakyReLU': {'*': {'enable': False}},
  'default': {'enable': False},
  '*output_quantizer': {'num_bits': 8, 'axis': None},
  'LayerNorm2d': {'*': {'enable': True}},
  'LayerNorm': {'*': {'enable': True}},
  'GlobalResponseNorm': {'*': {'enable': True}}},
 'algorithm': 'max'}
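
For context, a minimal sketch of how a config like this is typically applied with ModelOpt and then exported to ONNX (calib_loader, dummy_input, and the output filename are illustrative placeholders):

import modelopt.torch.quantization as mtq
import torch

def forward_loop(model):
    # Feed calibration batches through the model so the 'max' algorithm
    # can collect activation ranges for the enabled quantizers.
    for images, _ in calib_loader:
        model(images.cuda())

model = mtq.quantize(model, config, forward_loop)

# Export with Q/DQ nodes so TensorRT can consume the quantization scales.
torch.onnx.export(model, dummy_input, "convnextv2_int8.onnx", opset_version=17)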

I am following the documentation, but it is not clear to me whether I am doing something wrong. Any help is much appreciated.
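
For completeness, a sketch of building and saving the INT8 engine from the exported ONNX via the TensorRT Python API (assumes a TensorRT 10-style API; with a Q/DQ ONNX model the INT8/FP16 builder flags let TensorRT choose per-layer precisions):

import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(0)  # networks are explicit-batch by default in TensorRT 10
parser = trt.OnnxParser(network, logger)
with open("convnextv2_int8.onnx", "rb") as f:
    if not parser.parse(f.read()):
        raise RuntimeError(parser.get_error(0))

builder_config = builder.create_builder_config()
builder_config.set_flag(trt.BuilderFlag.INT8)  # honor the Q/DQ scales from ModelOpt
builder_config.set_flag(trt.BuilderFlag.FP16)  # allow FP16 for layers left unquantized
engine_bytes = builder.build_serialized_network(network, builder_config)
with open("convnextv2_int8.engine", "wb") as f:
    f.write(engine_bytes)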

lix19937 commented 17 hours ago

How does your PTQ model's prediction accuracy look?