microsoft / nni

An open source AutoML toolkit for automating the machine learning lifecycle, including feature engineering, neural architecture search, model compression and hyper-parameter tuning.
https://nni.readthedocs.io
MIT License
13.99k stars 1.81k forks source link

torch.cuda.OutOfMemoryError occurred when running to ModelSpeedup. #5436

Closed 587687525 closed 1 year ago

587687525 commented 1 year ago

Describe the bug: torch.cuda.OutOfMemoryError is raised when execution reaches ModelSpeedup.

Environment:

Reproduce the problem

import argparse
import os.path as osp
import random
import sys
import time
from glob import glob

import nni
import numpy as np
import torch
import tqdm
from PIL import Image
from nni.algorithms.compression.v2.pytorch import TorchEvaluator
from nni.algorithms.compression.v2.pytorch.pruning import AutoCompressPruner
from nni.compression.pytorch.speedup import ModelSpeedup
from torch.nn import ParameterList
from torch.nn import functional as F
from torch.optim import Adam
from torchvision import transforms

from lib.RhNet import RhNet_SwinB
from utils.eval import evaluate_acc
from utils.misc import load_config
# Reported as unused imports by the linter (false positive); presumably they
# are looked up by name at runtime -- confirm against the transform config.
from lib.transforms import dynamic_resize, tonumpy, normalize, totensor

# Append the parent of this script's directory to sys.path
# (presumably the repository root, so project packages can be imported).
filepath = osp.split(osp.abspath(__file__))[0]
repopath = osp.split(filepath)[0]
sys.path.append(repopath)

# Disable TF32 for both cuBLAS matmuls and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

def criterion(input, target):
    """Return the ``'loss'`` entry of ``input``.

    Args:
        input: mapping that is indexed with the key ``'loss'``.
        target: accepted for signature compatibility; never read.

    Returns:
        Whatever is stored under ``input['loss']``.
    """
    loss = input['loss']
    return loss

def training_func(model, optimizer, criterion, lr_schedulers=None, max_steps=None, max_epochs=None):
    """Print a timestamped "Starting Training." banner.

    As shown here the function only logs the start of training; none of its
    arguments are read.

    Returns:
        None.
    """
    now = time.localtime(time.time())
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', now)
    print(f"[{stamp}] Starting Training.")

def evaluating_func(model):
    """Evaluate ``model`` on 500 randomly sampled images and return the mean accuracy.

    Reads the module-level ``args`` (``original_path``, ``label_path``) and
    ``opt`` (``Infer.transforms``) objects, so those must be defined before
    this function is called.

    Args:
        model: callable invoked as ``model(inputs)`` with a dict of inputs;
            the result is indexed with ``'pred'``.

    Returns:
        The accuracy averaged over all sampled images, i.e. the same value
        that is printed. (Previously the accuracy of only the last image was
        returned.)
    """
    print(f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}] Starting evaluation.")
    model.eval()

    img_list = random.sample(glob(f"{args.original_path}\\*"), 500)

    # Build the preprocessing pipeline from the config: each key is evaluated
    # to a callable, each value supplies its keyword arguments (or is None).
    # NOTE(review): eval() executes arbitrary expressions taken from the
    # config; only load trusted configs, or switch to an explicit
    # name -> callable table.
    tfs = opt.Infer.transforms
    comp = []
    for key, value in tfs.items():
        factory = eval(key)
        comp.append(factory(**value) if value is not None else factory())
    transform = transforms.Compose(comp)

    time_sum = 0
    acc_list = []
    for img_path in tqdm.tqdm(img_list):
        with open(img_path, 'rb') as f:
            original = Image.open(f).convert('RGB')
            # PIL reports (width, height); reverse it to (height, width).
            shape = original.size[::-1]
            # Strip the last four characters of the file name (the extension).
            name = osp.basename(img_path)[:-4]
            label = np.array(Image.open(args.label_path + '\\' + name + '.png').convert('1'))

            inputs = {'image': original, 'name': name, 'shape': shape, 'original': original}
            inputs = transform(inputs)
            inputs['image'] = inputs['image'].unsqueeze(0)
            if 'image_resized' in inputs:
                inputs['image_resized'] = inputs['image_resized'].unsqueeze(0)

            # Move every tensor entry to the GPU; non-tensor entries stay as-is.
            for key, value in inputs.items():
                if isinstance(value, torch.Tensor):
                    inputs[key] = value.cuda()

            # Only the forward pass is timed.
            with torch.no_grad():
                time_start = time.time()
                out = model(inputs)
                time_sum += time.time() - time_start

            # Resize the prediction to the size stored under inputs['shape'].
            pred = F.interpolate(out['pred'], inputs['shape'], mode='bilinear', align_corners=True)
            pred = pred.data.cpu().numpy().squeeze()

            pred = (pred * 255).astype(np.uint8)
            acc_list.append(evaluate_acc(label, pred))

    mean_acc = sum(acc_list) / len(acc_list)
    print(f"Accuracy: {mean_acc} ; Time: {time_sum}")
    return mean_acc

def main(opt, args):
    """Prune a pretrained ``RhNet_SwinB`` with ``AutoCompressPruner`` and save the best model.

    Args:
        opt: configuration object; ``opt.Model``, ``opt.Train.Optimizer.lr``,
            ``opt.Train.Optimizer.weight_decay`` and
            ``opt.Train.Checkpoint.checkpoint_dir`` are read.
        args: parsed command-line arguments; ``args.weights`` is read.

    Returns:
        None. The best model's ``state_dict`` is written to ``compressed.pth``
        inside the checkpoint directory.
    """
    # Load the weights on the CPU first, then move the whole network to the GPU.
    net = RhNet_SwinB(**opt.Model)
    state = torch.load(args.weights, map_location=torch.device('cpu'))
    net.load_state_dict(state, strict=True)
    net = net.cuda()

    # Split parameters into two optimizer groups by name.
    backbone_params = ParameterList()
    decoder_params = ParameterList()
    for param_name, param in net.named_parameters():
        group = backbone_params if 'backbone' in param_name else decoder_params
        group.append(param)

    # Non-backbone parameters train with 10x the base learning rate.
    base_lr = opt.Train.Optimizer.lr
    params_list = [
        {'params': backbone_params},
        {'params': decoder_params, 'lr': base_lr * 10},
    ]

    # Prune 25% of every Conv2d/Linear layer, except the patch-embedding projection.
    config_list = [
        {'sparsity_per_layer': 0.25, 'op_types': ['Conv2d', 'Linear']},
        {'exclude': True, 'op_names': ['backbone.patch_embed.proj']},
    ]

    traced_optimizer = nni.trace(Adam)(
        params_list,
        base_lr,
        weight_decay=opt.Train.Optimizer.weight_decay,
    )

    dummy_input = torch.randn(4, 3, 512, 512).to(torch.device('cuda'))

    evaluator = TorchEvaluator(
        training_func,
        optimizers=traced_optimizer,
        criterion=criterion,
        dummy_input=dummy_input,
        evaluating_func=evaluating_func,
    )

    admm_params = {'evaluator': evaluator, 'iterations': 5, 'training_epochs': 2}
    sa_params = {'evaluator': evaluator}

    # NOTE: per the traceback in this report, the CUDA out-of-memory error is
    # raised inside compress(), while ModelSpeedup traces the model with the
    # evaluator's dummy input.
    pruner = AutoCompressPruner(
        model=net,
        config_list=config_list,
        total_iteration=3,
        admm_params=admm_params,
        sa_params=sa_params,
        log_dir='./log',
        keep_intermediate_result=True,
        evaluator=evaluator,
        speedup=True,
    )
    pruner.compress()

    _, best_model, masks, _, _ = pruner.get_best_result()

    out_path = osp.join(opt.Train.Checkpoint.checkpoint_dir, 'compressed.pth')
    torch.save(best_model.state_dict(), out_path)

# Script entry point. ``args`` and ``opt`` are bound at module level on
# purpose: evaluating_func reads them as globals.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default=r"./weights/RhineSOD.pth", help="weights path")
    # NOTE(review): with action='store_true' and default=True this flag is
    # always True; kept unchanged for command-line compatibility.
    parser.add_argument('--gpu', '-g', action='store_true', default=True)
    parser.add_argument('--config', '-c', type=str, default='configs/RhineSOD.yaml')
    parser.add_argument('--imgsize', type=int, default=320, help='input image size')
    parser.add_argument('--thres', type=int, default=50)
    parser.add_argument('--original_path', type=str, default=r"G:\ML-Dataset\DUTS-TR\images",
                        help="input image path")
    # Help text corrected: it previously duplicated the --original_path text.
    parser.add_argument('--label_path', type=str, default=r"G:\ML-Dataset\DUTS-TR\masks",
                        help="label mask path")
    parser.add_argument('--mask_path', type=str, default="./outputs/mask", help="output masked path")
    args = parser.parse_args()

    opt = load_config(args.config)
    main(opt, args)

```
Traceback (most recent call last):
  File "G:\Project\PC\CV\Rhine-SOD\compress.py", line 199, in <module>
    main(opt, args)
  File "G:\Project\PC\CV\Rhine-SOD\compress.py", line 180, in main
    pruner.compress()
  File "G:\Environment\Anaconda\lib\site-packages\nni\algorithms\compression\v2\pytorch\base\scheduler.py", line 194, in compress
    task_result = self.pruning_one_step(task)
  File "G:\Environment\Anaconda\lib\site-packages\nni\algorithms\compression\v2\pytorch\pruning\basic_scheduler.py", line 283, in pruning_one_step
    result = self.pruning_one_step_normal(task)
  File "G:\Environment\Anaconda\lib\site-packages\nni\algorithms\compression\v2\pytorch\pruning\basic_scheduler.py", line 151, in pruning_one_step_normal
    ModelSpeedup(compact_model, self.evaluator.get_dummy_input(), pruner_generated_masks).speedup_model()
  File "G:\Environment\Anaconda\lib\site-packages\nni\compression\pytorch\speedup\compressor.py", line 73, in __init__
    self.torch_graph = build_module_graph(model, self.dummy_input)
  File "G:\Environment\Anaconda\lib\site-packages\nni\common\graph_utils.py", line 25, in build_module_graph
    return TorchModuleGraph(model, dummy_input)
  File "G:\Environment\Anaconda\lib\site-packages\nni\common\graph_utils.py", line 265, in __init__
    super().__init__(model, dummy_input, traced_model)
  File "G:\Environment\Anaconda\lib\site-packages\nni\common\graph_utils.py", line 67, in __init__
    self._trace(model, dummy_input)
  File "G:\Environment\Anaconda\lib\site-packages\nni\common\graph_utils.py", line 91, in _trace
    self.trace = torch.jit.trace(model, dummy_input, **kw_args)
  File "G:\Environment\Anaconda\lib\site-packages\torch\jit\_trace.py", line 759, in trace
    return trace_module(
  File "G:\Environment\Anaconda\lib\site-packages\torch\jit\_trace.py", line 976, in trace_module
    module._c._create_method_from_trace(
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1182, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "G:\Project\PC\CV\Rhine-SOD\lib\RhNet.py", line 152, in forward_inference
    out = self.forward_inspyre(sample)
  File "G:\Project\PC\CV\Rhine-SOD\lib\RhNet.py", line 83, in forward_inspyre
    x1 = self.context1(x1)  # 4
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1182, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "G:\Project\PC\CV\Rhine-SOD\lib\modules\context_module.py", line 48, in forward
    x2 = self.branch2(x)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1182, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "G:\Project\PC\CV\Rhine-SOD\lib\modules\context_module.py", line 20, in forward
    x = self.conv2(x)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1182, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "G:\Project\PC\CV\Rhine-SOD\lib\modules\layers.py", line 127, in forward
    x = self.conv(x)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\module.py", line 1182, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\conv.py", line 463, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "G:\Environment\Anaconda\lib\site-packages\torch\nn\modules\conv.py", line 459, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 0; 15.99 GiB total capacity; 14.52 GiB already allocated; 0 bytes free; 15.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```

J-shang commented 1 year ago

please have a try with:

    pruner = AutoCompressPruner(model=model, config_list=config_list, total_iteration=3, admm_params=admm_params,
                                sa_params=sa_params, log_dir='./log', keep_intermediate_result=True,
                                evaluator=evaluator, speedup=False)

Disable the speedup during pruning, and run the speedup manually after `_, model, masks, _, _ = pruner.get_best_result()`.

587687525 commented 1 year ago

please have a try with:

    pruner = AutoCompressPruner(model=model, config_list=config_list, total_iteration=3, admm_params=admm_params,
                                sa_params=sa_params, log_dir='./log', keep_intermediate_result=True,
                                evaluator=evaluator, speedup=False)

ban the speedup during pruning, and add the speedup manually after _, model, masks, _, _ = pruner.get_best_result().

@J-shang I have changed the code, but the error still appears.

pruner = AutoCompressPruner(model=model, config_list=config_list, total_iteration=3, admm_params=admm_params,
                                sa_params=sa_params, log_dir='./log', keep_intermediate_result=True,
                                evaluator=evaluator, speedup=False)
    # pruner = LevelPruner(model, config_list)

    pruner.compress()
    _, model, masks, _, _ = pruner.get_best_result()
    ModelSpeedup(model, torch.randn(4, 3, 512, 512).to(torch.device('cuda')),
                 masks).speedup_model()

    torch.save(model.state_dict(), osp.join(opt.Train.Checkpoint.checkpoint_dir, 'compressed.pth'))
J-shang commented 1 year ago

Hello @587687525, I think you already got the solution in the WeChat group: either put the model on the CPU or pass a smaller `confidence` value to `ModelSpeedup`; both work. 😃

Lijiaoa commented 1 year ago

Could you close this issue if you don't have any other problems? Thanks, @587687525.

donjuanpond commented 1 month ago

I'm having the same issue when putting to speedup TensorRT - can someone explain the solution???