KarenUllrich / Tutorial_BayesianCompressionForDL

A tutorial on "Bayesian Compression for Deep Learning" published at NIPS (2017).
MIT License

Anyone encounter loss nan? #11

Open kuanzi opened 5 years ago

kuanzi commented 5 years ago

It works fine for small-scale models like an MLP or LeNet-5. However, with VGG16/ResNet18 the loss always becomes NaN. The model structure configuration is below:

```python
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG_CIFAR10_BAY(nn.Module):
    def __init__(self, vgg_name):
        super(VGG_CIFAR10_BAY, self).__init__()
        # collected Bayesian layers, used for the KL term and the pruning masks
        self.kl_list = []
        self.features = self._make_layers(cfg[vgg_name])
        linear_index = BayesianLayer.LinearGroupNJ(512, 10, clip_var=0.04, cuda=True)
        self.classifier = linear_index
        self.kl_list.append(linear_index)

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg):
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv_index = BayesianLayer.Conv2dGroupNJ(in_channels, x, kernel_size=3,
                                                         padding=1, clip_var=0.04, cuda=True)
                layers += [conv_index, nn.BatchNorm2d(x), nn.ReLU(inplace=True)]
                self.kl_list.append(conv_index)
                in_channels = x
        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
        return nn.Sequential(*layers)

    def get_masks(self, thresholds):
        weight_masks = []
        mask = None
        layers = self.kl_list
        for i, (layer, threshold) in enumerate(zip(layers, thresholds)):
            # compute dropout mask
            if len(layer.weight_mu.shape) > 2:
                if mask is None:
                    mask = [True] * layer.in_channels
                else:
                    mask = np.copy(next_mask)

                log_alpha = layers[i].get_log_dropout_rates().cpu().data.numpy()
                next_mask = log_alpha <= thresholds[i]

                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
                weight_mask = weight_mask[:, :, None, None]
            else:
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                elif len(weight_mask.shape) > 2:
                    temp = next_mask.repeat(layer.in_features // next_mask.shape[0])
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                    # mask = mask | temp  # upper bound for number of weights at first fully connected layer
                    mask = mask & temp    # lower bound for number of weights at fully connected layer
                else:
                    mask = np.copy(next_mask)

                try:
                    log_alpha = layers[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha <= thresholds[i + 1]
                except IndexError:
                    # must be the last mask
                    next_mask = np.ones(10)

                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
            weight_masks.append(weight_mask.astype(float))
        return weight_masks

    def model_kl_div(self):
        KLD = 0
        for layer in self.kl_list:
            KLD += layer.layer_kl_div()
        return KLD
```

Is it caused by high variance? I have already tried clipping the variance (clip_var=0.04), but it doesn't help...
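In case it helps to narrow this down, here is a rough diagnostic sketch, not a fix. It only relies on the `kl_list` / `layer_kl_div()` / `get_log_dropout_rates()` interface from the snippet above; `x`, `y`, and `criterion` are hypothetical placeholders for one batch and the data-loss function:

```python
import torch

def locate_nan(model, x, y, criterion):
    # Rough check: does the NaN first show up in the data term or in one layer's KL term?
    torch.autograd.set_detect_anomaly(True)  # makes backward report the op that produced NaN/Inf

    out = model(x)
    data_loss = criterion(out, y)
    print("data loss finite:", torch.isfinite(data_loss).item())

    for i, layer in enumerate(model.kl_list):
        kl = layer.layer_kl_div()
        log_alpha = layer.get_log_dropout_rates()
        print("layer %2d | KL finite: %s | max log alpha: %.2f"
              % (i, torch.isfinite(kl).item(), log_alpha.max().item()))
```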

kuanzi commented 5 years ago

Sorry, I could not get the formatting right... Has anyone run into a similar NaN situation?

gullalc commented 5 years ago

Unfortunately, with a larger network like VGG the loss becomes NaN because of the vanishing-gradients problem. I believe only the authors can help in this case.
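Not a confirmed fix, but two things that are often worth trying before giving up: warming up the KL term over the first few epochs and clipping the gradient norm. A minimal training-loop sketch, assuming the total loss is the data term plus `model.model_kl_div()` as in the snippet above; `model`, `optimizer`, `criterion`, `train_loader`, and `num_epochs` are hypothetical names defined elsewhere:

```python
import torch

N = len(train_loader.dataset)   # the KL term is typically scaled by 1/N
warmup_epochs = 10

for epoch in range(num_epochs):
    kl_weight = min(1.0, (epoch + 1) / warmup_epochs)   # linear KL warm-up
    for x, y in train_loader:
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y) + kl_weight * model.model_kl_div() / N
        loss.backward()
        # guard against exploding gradients in the deeper VGG layers
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
```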