InterDigitalInc / CompressAI

A PyTorch library and evaluation platform for end-to-end compression research
https://interdigitalinc.github.io/CompressAI/
BSD 3-Clause Clear License

Estimated bpp is much lower than actual bpp by GaussianConditional Model? #68

Closed · mmSir closed this 3 years ago

mmSir commented 3 years ago

I created a new entropy model that inherits from CompressionModel, similar to MeanScaleHyperprior: it contains an EntropyBottleneck for modeling the side information z and a GaussianConditional for modeling the image latent feature y. I also trained a MeanScaleHyperprior model at quality 4 and use it only to generate the image latent feature y for my new entropy model.

When I evaluate the MeanScaleHyperprior model, I get almost the same bpp whether I use model.forward() or model.compress(). But I get different results when evaluating my new entropy model.
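For reference, this is roughly how I run that baseline check with the mbt2018-mean model from compressai.zoo (the MeanScaleHyperprior architecture); the random input below is only a placeholder, quality 4 matches the model mentioned above:

import math
import torch
from compressai.zoo import mbt2018_mean

# Load the pretrained mean-scale hyperprior model at quality 4
net = mbt2018_mean(quality=4, pretrained=True).eval()
net.update()  # builds the CDF tables if they are not already in the checkpoint

x = torch.rand(1, 3, 256, 256)  # placeholder input; use a real image crop for meaningful numbers
num_pixels = x.size(0) * x.size(2) * x.size(3)

with torch.no_grad():
    # Estimated bpp from the likelihoods returned by forward()
    out = net(x)
    est_bpp = sum(
        (torch.log(l).sum() / (-math.log(2) * num_pixels)) for l in out["likelihoods"].values()
    ).item()

    # Actual bpp from the byte strings produced by compress()
    enc = net.compress(x)
    act_bpp = sum(len(s[0]) for s in enc["strings"]) * 8.0 / num_pixels

print(est_bpp, act_bpp)  # for real images these two come out almost identical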

Here is the code.

import math

import torch
import torch.nn as nn

from compressai.entropy_models import GaussianConditional
from compressai.models import CompressionModel
from compressai.models.utils import update_registered_buffers


class SpatialTemporalPriorModel(CompressionModel):
    def __init__(self, entropy_bottleneck_channels=256, in_channels=192):
        super().__init__(entropy_bottleneck_channels=entropy_bottleneck_channels)
        self.TPM = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=256, kernel_size=5, padding=5 // 2, stride=1),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=256, out_channels=320, kernel_size=5, padding=5 // 2, stride=1),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=320, out_channels=in_channels * 2, kernel_size=5, padding=5 // 2, stride=1),
        )
        self.HE = nn.Sequential(
            nn.Conv2d(in_channels=in_channels * 2, out_channels=256, kernel_size=3, padding=3 // 2, stride=1),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=5, padding=5 // 2, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=256, out_channels=entropy_bottleneck_channels, kernel_size=5, padding=5 // 2, stride=2),
        )
        self.HD = nn.Sequential(
            nn.ConvTranspose2d(in_channels=entropy_bottleneck_channels, out_channels=256, kernel_size=5,
                               padding=5 // 2, stride=2, output_padding=1),
            nn.LeakyReLU(),
            nn.ConvTranspose2d(in_channels=256, out_channels=256, kernel_size=5, padding=5 // 2,
                               stride=2, output_padding=1),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=256, out_channels=in_channels * 2, kernel_size=3, padding=3 // 2, stride=1),
        )
        self.EPM = nn.Sequential(
            nn.Conv2d(in_channels=in_channels * 2 * 2, out_channels=768, kernel_size=1, padding=1 // 2, stride=1),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=768, out_channels=576, kernel_size=1, padding=1 // 2, stride=1),
            nn.LeakyReLU(),
            nn.Conv2d(in_channels=576, out_channels=in_channels * 2, kernel_size=1, padding=1 // 2, stride=1),
        )
        self.gaussian_conditional = GaussianConditional(None)

    def forward(self, y, y_hat, last_y_hat):
        z = self.HE(torch.cat([y_hat, last_y_hat], 1))
        z_hat, z_likelihoods = self.entropy_bottleneck(z)
        hyperprior_params = self.HD(z_hat)

        temporalprior_params = self.TPM(last_y_hat)

        gaussian_params = self.EPM(torch.cat([temporalprior_params, hyperprior_params], 1))  # Fuse
        scales_hat, means_hat = gaussian_params.chunk(2, 1)

        _, y_likelihoods = self.gaussian_conditional(y, scales_hat, means=means_hat)

        return {
            "likelihoods": {"y": y_likelihoods, "z": z_likelihoods},
        }

    def compress(self, y, conditioned_y):
        y_quantized = self.gaussian_conditional.quantize(y, "symbols")
        # y_quantized = y
        z = self.HE(torch.cat([y_quantized, conditioned_y], 1).type(torch.cuda.FloatTensor))

        z_strings = self.entropy_bottleneck.compress(z)
        z_hat = self.entropy_bottleneck.decompress(z_strings, z.size()[-2:])
        hyperprior_params = self.HD(z_hat)

        temporalprior_params = self.TPM(conditioned_y.type(torch.cuda.FloatTensor))

        gaussian_params = self.EPM(torch.cat([temporalprior_params, hyperprior_params], 1))
        scales_hat, means_hat = gaussian_params.chunk(2, 1)
        indexes = self.gaussian_conditional.build_indexes(scales_hat)
        y_strings = self.gaussian_conditional.compress(y, indexes, means=means_hat)
        return {"strings": [y_strings, z_strings], "shape": z.size()[-2:]}

    def decompress(self, strings, shape, conditioned_y):
        assert isinstance(strings, list) and len(strings) == 2
        z_hat = self.entropy_bottleneck.decompress(strings[1], shape)
        hyperprior_params = self.HD(z_hat)

        temporalprior_params = self.TPM(conditioned_y)
        gaussian_params = self.EPM(torch.cat([temporalprior_params, hyperprior_params], 1))
        scales_hat, means_hat = gaussian_params.chunk(2, 1)
        indexes = self.gaussian_conditional.build_indexes(scales_hat)
        y_hat = self.gaussian_conditional.decompress(
            strings[0], indexes, means=means_hat
        )
        return y_hat

    def load_state_dict(self, state_dict):
        update_registered_buffers(
            self.gaussian_conditional,
            "gaussian_conditional",
            ["_quantized_cdf", "_offset", "_cdf_length", "scale_table"],
            state_dict,
        )
        super().load_state_dict(state_dict)

# ---------------------------Eval Code-----------------------------
stpm = SpatialTemporalPriorModel()

# load_state_dict ..... update ...... else ......

out_forward = stpm(y, y_quantized, conditioned_y)
estimate_bpp = sum( (torch.log(likelihoods).sum() / (-math.log(2) * num_pixels))  for likelihoods in out_forward["likelihoods"].values() ).item()
estimate_y_bpp = (torch.log(out_forward["likelihoods"]["y"]).sum() / (-math.log(2) * num_pixels)).item()
estimate_z_bpp = (torch.log(out_forward["likelihoods"]["z"]).sum() / (-math.log(2) * num_pixels)).item()

out_enc = stpm.compress(y_quantized, conditioned_y)
bpp = sum(len(s[0]) for s in out_enc["strings"]) * 8.0 / num_pixels
actual_y_bpp = len(out_enc["strings"][0][0]) * 8.0 / num_pixels
actual_z_bpp = len(out_enc["strings"][1][0]) * 8.0 / num_pixels

I used some frames from the HEVC Class B standard test sequences for testing and found that estimate_z_bpp is quite close to actual_z_bpp, while estimate_y_bpp is about half of actual_y_bpp. So the EntropyBottleneck seems fine, but something is wrong with the GaussianConditional, and I have no idea what.
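One additional check I could run to narrow this down (a rough sketch, reusing stpm, y_quantized, conditioned_y, num_pixels and actual_y_bpp from the snippet above, with the model in eval mode and update() already called): re-estimate the y bpp on the rounded latent, i.e. the same tensor that compress() feeds to the hyper-encoder, and see whether that estimate moves towards the actual bpp.

with torch.no_grad():
    # compress() feeds rounded values to HE ("symbols" mode is plain rounding),
    # whereas the forward() call above used y for the likelihoods and y_quantized for HE.
    y_rounded = torch.round(y_quantized)
    out_check = stpm(y_rounded, y_rounded, conditioned_y)
    check_y_bpp = (torch.log(out_check["likelihoods"]["y"]).sum() / (-math.log(2) * num_pixels)).item()
    print(check_y_bpp, actual_y_bpp)

If check_y_bpp were close to actual_y_bpp, that would suggest the gap comes from forward() and compress() seeing different versions of the latent rather than from the GaussianConditional itself.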


Looking forward to your help! Thanks!

mmSir commented 3 years ago

@jbegaint Sorry for the delay in providing more detailed information. I found that the problem is in the forward function: if I feed the quantized latent y to the hyperprior network, I get different bpp from the forward() estimation and from the actual entropy coding in compress(). Here is the code that reproduces the issue I described.

def forward(self, x):
    y = self.g_a(x)
    y = self.gaussian_conditional.quantize(y, "noise") # quantized latent y
    z = self.h_a(y)
    z_hat, z_likelihoods = self.entropy_bottleneck(z)
    gaussian_params = self.h_s(z_hat)
    scales_hat, means_hat = gaussian_params.chunk(2, 1)
    y_hat, y_likelihoods = self.gaussian_conditional(y, scales_hat, means=means_hat)
    x_hat = self.g_s(y_hat)

    return {
        "y": y,
        "y_hat": y_hat,
        "x_hat": x_hat,
        "likelihoods": {"y": y_likelihoods, "z": z_likelihoods},
    }

def compress(self, x):
    y = self.g_a(x)
    y = self.gaussian_conditional.quantize(y, "symbols")  # quantized latent y
    z = self.h_a(y)

    z_strings = self.entropy_bottleneck.compress(z)
    z_hat = self.entropy_bottleneck.decompress(z_strings, z.size()[-2:])

    gaussian_params = self.h_s(z_hat)
    scales_hat, means_hat = gaussian_params.chunk(2, 1)
    indexes = self.gaussian_conditional.build_indexes(scales_hat)
    y_strings = self.gaussian_conditional.compress(y, indexes, means=means_hat)  # y_strings is a list containing a single element
    return {"strings": [y_strings, z_strings], "shape": z.size()[-2:]}

Indeed, Ballé does not do this in the paper (Variational Image Compression with a Scale Hyperprior), but some papers do feed the quantized latent to the hyperprior network, for example Asymmetric Gained Deep Image Compression With Continuous Rate Adaptation (CVPR 2021) and SpatioTemporal Entropy Model is All You Need for LVC. In my understanding, this detail should not affect the final result much, so I just wonder why it affects the bpp estimation.
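To make the difference between the two paths concrete, here is a toy sketch (illustrative values only, no trained weights involved) of what the three quantize modes of GaussianConditional return. The modified forward() above feeds the "noise" version to h_a, even at evaluation time, while compress() feeds the "symbols" version:

import torch
from compressai.entropy_models import GaussianConditional

gc = GaussianConditional(None)
y = torch.tensor([[[[0.2, 0.7, -1.3, 2.4]]]])

y_noise = gc.quantize(y, "noise")       # y + U(-0.5, 0.5): what the modified forward() passes to h_a
y_round = gc.quantize(y, "symbols")     # round(y) as int32 symbols: what compress() passes to h_a
y_deq   = gc.quantize(y, "dequantize")  # round(y) as floats (means are subtracted/added back when given)

print(y_round)   # tensor([[[[ 0,  1, -1,  2]]]], dtype=torch.int32)
print(y_deq)     # tensor([[[[ 0.,  1., -1.,  2.]]]])
print(y_noise)   # random; differs from round(y) by up to about 1.0

I am not sure this alone explains the factor-of-two gap, but it does mean that h_a, and therefore scales_hat and means_hat, see different inputs in the two paths.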

jbegaint commented 3 years ago

Hi, you probably have a bug somewhere in your entropy modelling, possibly a mix-up between the quantized/noisy/original tensors. If you can pinpoint an issue with the GaussianConditional implementation, I can take a look.
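For reference, here is a minimal, self-contained sketch of the kind of check that isolates the GaussianConditional (the scale table below just rebuilds the default 0.11-256 range inline; the tensor shapes are arbitrary). When the likelihood estimate and the range coder are given exactly the same y, scales and means, the estimated and measured bits agree closely:

import math
import torch
from compressai.entropy_models import GaussianConditional

torch.manual_seed(0)
gc = GaussianConditional(None)
gc.update_scale_table(torch.exp(torch.linspace(math.log(0.11), math.log(256), 64)))
gc.update()  # builds the quantized CDF tables used by compress()
gc.eval()

scales = torch.full((1, 8, 32, 32), 1.0)
means = torch.zeros_like(scales)
y = torch.randn_like(scales) * scales + means

# Estimated bits from the likelihoods (eval mode, so "dequantize" quantization)
_, likelihoods = gc(y, scales, means=means)
est_bits = (-torch.log2(likelihoods)).sum().item()

# Actual bits from the range coder, using the same y, scales and means
indexes = gc.build_indexes(scales)
strings = gc.compress(y, indexes, means=means)
act_bits = sum(len(s) for s in strings) * 8

print(est_bits, act_bits)  # close, up to a small range-coder overhead

If a standalone test like this showed a 2x gap, that would point at GaussianConditional; otherwise the mismatch is in what the two code paths feed to it.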

mmSir commented 3 years ago

Thanks for the answer! I'll check my code. :)