MadryLab / robustness

A library for experimenting with, training and evaluating neural networks, with a focus on adversarial robustness.

Training on Custom Dataset triggers error: CUDA error: device-side assert triggered #97

Open · elcronos opened this issue 3 years ago

I'm using your Python package to adversarially train a ResNet50 network on the CelebA dataset. My code looks something like this:

import torch
torch.manual_seed(42)
import torchvision
from torchvision import transforms
from robustness.datasets import CelebA
from robustness.model_utils import make_and_restore_model
from robustness import train, defaults
from cox.utils import Parameters
from cox import store

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.RandomErasing()
])

celeba_train = torchvision.datasets.CelebA('./celeba', split='train', target_type='identity', transform=transform)
celeba_val = torchvision.datasets.CelebA('./celeba', split='valid', target_type='identity', transform=transform)

train_loader = torch.utils.data.DataLoader(celeba_train, batch_size=16, shuffle=True, num_workers=4)
val_loader = torch.utils.data.DataLoader(celeba_val, batch_size=16, shuffle=True, num_workers=4)

ds = CelebA('./celeba')
m, _ = make_and_restore_model(arch='resnet50', pytorch_pretrained=False,  dataset=ds)
# Create a cox store for logging
OUT_DIR = './outputs'
out_store = store.Store(OUT_DIR)

train_kwargs = {
    'out_dir': "train_out",
    'adv_train': 1,
    'constraint': '2',
    'eps': 0.01,
    'attack_lr': 0.005,
    'attack_steps': 40,
    'epochs': 100
}

train_args = Parameters(train_kwargs)

# Fill whatever parameters are missing from the defaults
train_args = defaults.check_and_fill_args(train_args,
                        defaults.TRAINING_ARGS, CelebA)
train_args = defaults.check_and_fill_args(train_args,
                        defaults.PGD_ARGS, CelebA)

# Train a model
train.train_model(train_args, m, (train_loader, val_loader), store=out_store)

My definition of CelebA in robustness/datasets.py is:

class CelebA(DataSet):
    def __init__(self, data_path,**kwargs):
        self.num_classes = 10177

        ds_kwargs = {
            'num_classes': self.num_classes,
            'mean': torch.tensor([0.5061, 0.4254, 0.3828]),
            'std': torch.tensor([0.2658, 0.2452, 0.2413]),
            'custom_class': datasets.CelebA,
            'transform_train': da.TRAIN_TRANSFORMS_IMAGENET,
            'transform_test': da.TEST_TRANSFORMS_IMAGENET
        }
        super(CelebA, self).__init__('celeba', data_path, **ds_kwargs)

    def get_model(self, arch, pretrained=False):
        return imagenet_models.__dict__[arch](num_classes=self.num_classes,
                                              pretrained=pretrained)
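
For reference, the per-channel mean/std in ds_kwargs were computed from the CelebA training images; a sketch of the computation (the exact script I used may have differed slightly):

# Sketch: compute per-channel mean/std over the training split.
# CelebA images all have the same size after ToTensor, so averaging
# per-image channel means gives the global channel mean.
import torch
import torchvision
from torchvision import transforms

ds = torchvision.datasets.CelebA('./celeba', split='train', target_type='identity',
                                 transform=transforms.ToTensor())
loader = torch.utils.data.DataLoader(ds, batch_size=256, num_workers=4)
n, mean, sq = 0, torch.zeros(3), torch.zeros(3)
for imgs, _ in loader:
    flat = imgs.view(imgs.size(0), 3, -1)
    mean += flat.mean(dim=2).sum(dim=0)
    sq += (flat ** 2).mean(dim=2).sum(dim=0)
    n += imgs.size(0)
mean /= n
std = (sq / n - mean ** 2).sqrt()  # Var[X] = E[X^2] - E[X]^2
print(mean, std)                   # should roughly match the tensors above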

Once I run my code, it starts training as expected, but after a while I get this error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-cade5f159ff1> in <module>
     18 
     19 # Train a model
---> 20 train.train_model(train_args, m, (train_loader, val_loader), store=out_store)

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/train.py in train_model(args, model, loaders, checkpoint, dp_device_ids, store, update_params, disable_no_grad)
    309     for epoch in range(start_epoch, args.epochs):
    310         # train for one epoch
--> 311         train_prec1, train_loss = _model_loop(args, 'train', train_loader, 
    312                 model, opt, epoch, args.adv_train, writer)
    313         last_epoch = (epoch == (args.epochs - 1))

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/train.py in _model_loop(args, loop_type, loader, model, opt, epoch, adv, writer)
    445        # measure data loading time
    446         target = target.cuda(non_blocking=True)
--> 447         output, final_inp = model(inp, target=target, make_adv=adv,
    448                                   **attack_kwargs)
    449         loss = train_criterion(output, target)

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
    163 
    164         if len(self.device_ids) == 1:
--> 165             return self.module(*inputs[0], **kwargs[0])
    166         replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
    167         outputs = self.parallel_apply(replicas, inputs, kwargs)

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/attacker.py in forward(self, inp, target, make_adv, with_latent, fake_relu, no_relu, with_image, **attacker_kwargs)
    311             prev_training = bool(self.training)
    312             self.eval()
--> 313             adv = self.attacker(inp, target, **attacker_kwargs)
    314             if prev_training:
    315                 self.train()

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/attacker.py in forward(self, x, target, constraint, eps, step_size, iterations, random_start, random_restarts, do_tqdm, targeted, custom_loss, should_normalize, orig_input, use_best, return_image, est_grad, mixed_precision, *_)
    247             adv_ret = to_ret
    248         else:
--> 249             adv_ret = get_adv_examples(x)
    250 
    251         return adv_ret

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/attacker.py in get_adv_examples(x)
    202                         x.grad.zero_()
    203                     elif (est_grad is None):
--> 204                         grad, = ch.autograd.grad(m * loss, [x])
    205                     else:
    206                         f = lambda _x, _y: m * calc_loss(step.to_image(_x), _y)[0]

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/autograd/__init__.py in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused)
    221         retain_graph = create_graph
    222 
--> 223     return Variable._execution_engine.run_backward(
    224         outputs, grad_outputs_, retain_graph, create_graph,
    225         inputs, allow_unused, accumulate_grad=False)

RuntimeError: CUDA error: device-side assert triggered
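
Since device-side asserts are raised asynchronously, I understand the Python traceback above may not point at the operation that actually failed. One way to localize it is to force synchronous kernel launches before CUDA is initialized (sketch):

# Sketch: make CUDA launches synchronous so the assert surfaces at the real call site.
# This must run before torch touches the GPU, e.g. at the very top of the script.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"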

In addition, when I try to change the batch_size, I get a different error (possibly a follow-on failure once the first assert has corrupted the CUDA context):

RuntimeError: cuDNN error: CUDNN_STATUS_MAPPING_ERROR

Any insight into why this is happening and how to solve it?
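
One thing I haven't ruled out is the label range: CrossEntropyLoss asserts on the device when a target falls outside [0, num_classes - 1], and torchvision's CelebA identity labels appear to be 1-indexed (1..10177), which would put the largest label out of range for num_classes = 10177. A quick check (sketch, using the dataset's identity attribute):

# Sketch: verify the identity labels fit in [0, num_classes - 1].
import torchvision

celeba = torchvision.datasets.CelebA('./celeba', split='train', target_type='identity')
identities = celeba.identity.squeeze()
print(identities.min().item(), identities.max().item())  # 1-indexed labels would print: 1 10177

If they are indeed 1-indexed, I assume shifting them into range with a target_transform would be the fix (untested):

celeba_train = torchvision.datasets.CelebA(
    './celeba', split='train', target_type='identity', transform=transform,
    target_transform=lambda t: t - 1)  # map 1..10177 to 0..10176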