pytorch / opacus

Training PyTorch models with differential privacy
https://opacus.ai
Apache License 2.0

ModuleValidator.fix() causes layer gradients to be None. #638

Closed · zxj-c closed this issue 1 month ago

zxj-c commented 3 months ago

🐛 Bug

After ModuleValidator.fix() replaces BatchNorm with GroupNorm, the .grad of the replaced parameters is None and their .grad_sample is also None: gradients do not flow to the new GroupNorm weights during the backward pass.

from opacus import PrivacyEngine
from opacus.validators import ModuleValidator

model = ModuleValidator.fix(model)  # replaces BatchNorm layers with GroupNorm
privacy_engine = PrivacyEngine()
model, optimizer, train_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    noise_multiplier=noise_multiplier,
    max_grad_norm=clipping_bound,
    clipping="flat",
)

# After a forward/backward pass:
for name, param in model.named_parameters():
    if param.grad is None:
        print(name)               # only the layers replaced by ModuleValidator.fix()
        print(param.grad_sample)  # None
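For comparison, here is a minimal self-contained sketch of the ordering that normally leaves both .grad and .grad_sample populated: fix the model, build the optimizer from the fixed model's parameters, run one forward/backward step, and only then inspect the parameters. The toy model, hyperparameters, and poisson_sampling=False are illustrative choices, not taken from this report.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from opacus import PrivacyEngine
from opacus.validators import ModuleValidator

# Toy model containing a BatchNorm layer that fix() will replace with GroupNorm.
model = nn.Sequential(
    nn.Conv2d(1, 8, kernel_size=3),  # (N, 1, 8, 8) -> (N, 8, 6, 6)
    nn.BatchNorm2d(8),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(8 * 6 * 6, 2),
)
model = ModuleValidator.fix(model)  # BatchNorm2d -> GroupNorm
# Build the optimizer AFTER fix(), so it holds the new GroupNorm parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

dataset = TensorDataset(torch.randn(32, 1, 8, 8), torch.randint(0, 2, (32,)))
train_loader = DataLoader(dataset, batch_size=8)

privacy_engine = PrivacyEngine()
model, optimizer, train_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    noise_multiplier=1.0,
    max_grad_norm=1.0,
    poisson_sampling=False,  # deterministic batches, just for this sketch
)

criterion = nn.CrossEntropyLoss()
x, y = next(iter(train_loader))
loss = criterion(model(x), y)
loss.backward()  # populates both .grad and .grad_sample

for name, param in model.named_parameters():
    print(name, param.grad is None, param.grad_sample is None)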

Here is the model.

from typing import Optional

import torch.nn as nn
import torch.nn.functional as F

# Encoder, Decoder, EncodingBlock and ConvolutionalBlock are custom classes
# defined elsewhere in the reporter's codebase (not shown in this issue).

class Baseline(nn.Module):
    def __init__(
        self,
        in_channels: int = 1,
        out_classes: int = 2,
        dimensions: int = 3,
        num_encoding_blocks: int = 3,
        out_channels_first_layer: int = 8,
        normalization: Optional[str] = "batch",
        pooling_type: str = "max",
        upsampling_type: str = "linear",
        preactivation: bool = False,
        residual: bool = False,
        padding: int = 1,
        padding_mode: str = "zeros",
        activation: Optional[str] = "PReLU",
        initial_dilation: Optional[int] = None,
        dropout: float = 0,
        monte_carlo_dropout: float = 0,
    ):
        super().__init__()
        self.CHANNELS_DIMENSION = 1
        depth = num_encoding_blocks - 1

        # Force padding if residual blocks
        if residual:
            padding = 1

        # Encoder
        self.encoder = Encoder(
            in_channels,
            out_channels_first_layer,
            dimensions,
            pooling_type,
            depth,
            normalization,
            preactivation=preactivation,
            residual=residual,
            padding=padding,
            padding_mode=padding_mode,
            activation=activation,
            initial_dilation=initial_dilation,
            dropout=dropout,
        )

        # Bottom (last encoding block)
        in_channels = self.encoder.out_channels
        if dimensions == 2:
            out_channels_first = 2 * in_channels
        else:
            out_channels_first = in_channels

        self.bottom_block = EncodingBlock(
            in_channels,
            out_channels_first,
            dimensions,
            normalization,
            pooling_type=None,
            preactivation=preactivation,
            residual=residual,
            padding=padding,
            padding_mode=padding_mode,
            activation=activation,
            dilation=self.encoder.dilation,
            dropout=dropout,
        )

        # Decoder
        if dimensions == 2:
            power = depth - 1
        elif dimensions == 3:
            power = depth
        in_channels = self.bottom_block.out_channels
        in_channels_skip_connection = out_channels_first_layer * 2**power
        num_decoding_blocks = depth
        self.decoder = Decoder(
            in_channels_skip_connection,
            dimensions,
            upsampling_type,
            num_decoding_blocks,
            normalization=normalization,
            preactivation=preactivation,
            residual=residual,
            padding=padding,
            padding_mode=padding_mode,
            activation=activation,
            initial_dilation=self.encoder.dilation,
            dropout=dropout,
        )

        # Monte Carlo dropout
        self.monte_carlo_layer = None
        if monte_carlo_dropout:
            dropout_class = getattr(nn, "Dropout{}d".format(dimensions))
            self.monte_carlo_layer = dropout_class(p=monte_carlo_dropout)

        # Classifier
        if dimensions == 2:
            in_channels = out_channels_first_layer
        elif dimensions == 3:
            in_channels = 2 * out_channels_first_layer
        self.classifier = ConvolutionalBlock(
            dimensions, in_channels, out_classes, kernel_size=1, activation=None
        )

    def forward(self, x):
        skip_connections, encoding = self.encoder(x)
        encoding = self.bottom_block(encoding)
        x = self.decoder(skip_connections, encoding)
        if self.monte_carlo_layer is not None:
            x = self.monte_carlo_layer(x)
        x = self.classifier(x)
        x = F.softmax(x, dim=self.CHANNELS_DIMENSION)
        return x
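As a quick diagnostic, a sketch along these lines (assuming Baseline() can be constructed with its defaults, which requires the unshown helper classes) shows which modules fix() actually replaced and whether Opacus still flags anything in the fixed model:

from opacus.validators import ModuleValidator

model = ModuleValidator.fix(Baseline())
print(ModuleValidator.validate(model, strict=False))  # [] if fully compatible

for name, module in model.named_modules():
    if isinstance(module, nn.GroupNorm):
        print(name, module)  # the layers fix() swapped in for BatchNorm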

Please reproduce using our [template Colab]() and post the link here.

To Reproduce

:warning: We cannot help you without reproducible code; please do not skip this part :) Steps to reproduce the behavior:

Expected behavior

Environment

Please copy and paste the output from our environment collection script (or fill out the checklist below manually).

You can get the script and run it with:

wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
# For security purposes, please check the contents of collect_env.py before running it.
python collect_env.py

Additional context

HuanyuZhang commented 3 months ago

Thanks for raising the issue. I need more information on the other classes (Encoder, Decoder, ...), or at least the structure of the modules around the failing GroupNorm layer.
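One way to collect that module information (illustrative; it assumes a forward/backward pass has already run on the fixed, privatized model) is to print the owning module of every parameter whose gradient is missing:

for mod_name, module in model.named_modules():
    for p_name, param in module.named_parameters(recurse=False):
        if param.grad is None:
            print(f"{mod_name}.{p_name} -> {module}")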