Closed AlecGuerin closed 1 year ago
After further tests, the issue can be reproduced with different devices. It seems to be triggered by the model and not by specific devices. Since the exception is raised by 'aihwkit/nn/modules/conv.py', should I open a new issue for it?
Hi @AlecGuerin, thanks for raising this issue. Can you give a (minimal) code listing that reproduces the error? It could be related to a memory issue. I'm not sure how large the model you are using is.
Many thanks!
Hi @maljoras, thank you for your answer. Here is the code to reproduce the issue:
import os
import time
import pickle
from typing import Dict
import urllib.request

import numpy as np

from aihwkit.nn.conversion import (convert_to_analog)
from aihwkit.simulator.configs.devices import (PowStepDevice, PulsedDevice)
from aihwkit.simulator.rpu_base import cuda
from aihwkit.simulator.configs.configs import (SingleRPUConfig, InferenceRPUConfig)
from aihwkit.nn import (AnalogLinear, AnalogConv1d, AnalogConv2d, AnalogConv3d)

import torch
from torchvision.io import read_image
from torchvision.models.detection.mask_rcnn import maskrcnn_resnet50_fpn, MaskRCNN_ResNet50_FPN_Weights
import torchvision.transforms.functional as F

# Check device
USE_CUDA = 0
if cuda.is_compiled():
    USE_CUDA = 1
DEVICE = torch.device('cuda' if USE_CUDA else 'cpu')

# Map of analog modules
ANALOG_MAP = {AnalogLinear, AnalogConv1d, AnalogConv2d, AnalogConv3d}  # AnalogSequential
def model_exists(model_name, root_dir):
    """Check if the model dump file exists in the root_dir directory.

    Args:
        model_name (string): Name of the model to check without .dmp at the end.
        root_dir (string): Directory to check for the model.
    Returns:
        (bool): Exists?
    """
    return os.path.isfile(os.path.join(root_dir, f'{model_name}.dmp'))
def load_model(model_name, root_dir):
    """Load a dumped model saved with the save_model method from the root_dir directory.

    Args:
        model_name (string): Name of the model to load without .dmp at the end.
        root_dir (string): Directory to check for the model.
    Returns:
        (nn.Module): The loaded object or None if not found.
    """
    model = None
    # Get the dump file
    file_name = os.path.join(root_dir, f'{model_name}.dmp')
    if not os.path.isfile(file_name):
        print(f'File \'{file_name}\' not found.')
    else:
        print(f'Loading model "{file_name}"...')
        file = open(file_name, 'rb')
        model = pickle.load(file)
        file.close()
        print(f'Model successfully loaded from "{file_name}"')
    return model
def save_model(model, model_name, root_dir):
    """Save the provided model dump to the root_dir directory.

    Args:
        model (nn.Module): Model to save.
        model_name (str): Model name to generate the dump file (ex: 'digital_model').
        root_dir (string): Directory to use for saving the model.
    """
    file_name = os.path.join(root_dir, '{}.dmp'.format(model_name))
    # Make the cache directory if it doesn't exist
    os.makedirs(root_dir, exist_ok=True)
    # Dump the model object
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
    print(f'Model successfully saved to "{file_name}"')
def program_tile_weights(analog_tile, use_cuda=False, max_iter=1000, early_stopping=0.01):
    """Program a single tile realistically using SGD."""
    # Just get the exactly converted weight from convert_to_analog
    # (assumes digital_bias = True)
    target_weight = analog_tile.get_weights(apply_weight_scaling=True)[0]
    x = torch.eye(analog_tile.in_size)
    target_weight = torch.transpose(target_weight, 1, 0)
    if use_cuda:
        x = x.cuda()
        target_weight = target_weight.cuda()
    w_max = target_weight.abs().max().item()
    analog_tile.tile.set_weights_uniform_random(-0.01, 0.01)
    analog_tile.tile.set_learning_rate(0.1)
    for i in range(max_iter):
        y = analog_tile.forward(x)
        loss = y - target_weight
        if (loss.abs().mean().item() / w_max) < early_stopping:
            break
        analog_tile.update(x, loss)
    return analog_tile
def program_analog_tiles_realistically(model, use_cuda=False):
    """Program the analog tiles of the provided analog model.

    Args:
        model (nn.Module): Analog model to program.
        use_cuda (bool): Set the analog model to cuda?
    Returns:
        (nn.Module): Generated analog model.
    Notes:
        Method provided from discussion (https://github.com/IBM/aihwkit/discussions/473)
    """
    # Set the model weights realistically.
    for module in model.modules():
        for analog_tile in module.analog_tiles():
            analog_tile = program_tile_weights(analog_tile, use_cuda=use_cuda)
    return model
def set_analog_model_weights(model, use_cuda=False):
    """Set the weights of the analog modules in the provided model realistically.

    Notes:
        Based on the aihwkit.nn.conversion.convert_to_analog function.
    Args:
        model (nn.Module): Model to convert.
        use_cuda (bool): Set the analog model to cuda?
    Returns:
        (nn.Module): The converted model with weights set realistically.
    """
    # Convert parent.
    if model.__class__ in ANALOG_MAP:
        model = program_analog_tiles_realistically(model, use_cuda=use_cuda)
    # Convert children:
    for name, mod in model.named_children():
        n_grand_children = len(list(mod.named_children()))
        # Check for recursion:
        if n_grand_children > 0:
            new_mod = set_analog_model_weights(mod, use_cuda=use_cuda)
        # Check if it is an analog module
        elif mod.__class__ in ANALOG_MAP:
            new_mod = program_analog_tiles_realistically(mod, use_cuda=use_cuda)
        else:
            continue
    return model
def make_analog_maskrcnn_resnet50_fpn(rpu_config, use_cuda=False, use_dmp=True):
    """Generate an analog-converted model of the Mask R-CNN provided by torchvision
    and set its analog weights realistically.

    Args:
        rpu_config: RPU config to use for the analog model.
        use_cuda (bool): Set the analog model to cuda?
        use_dmp (bool): Use the model dump if it exists?
    Returns:
        (nn.Module): Generated model
    """
    # Get the cache directory and the experiment dump file name
    root_dir = 'cache'
    file = '_analog_maskrcnn_resnet50_fpn'
    # Run or load the inference
    print('Make the model...')
    t = time.time()
    if model_exists(file, root_dir) and use_dmp:
        model = load_model(file, root_dir)
    else:
        print('Conversion...')
        # Get the digital model
        digital_model = maskrcnn_resnet50_fpn(weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1)
        if use_cuda:
            digital_model = digital_model.cuda()
        # Convert the model to analog
        model = convert_to_analog(digital_model, rpu_config)
        print(f'Conversion time: {time.time()-t:.3f} s')
        t = time.time()
        print('Set weights realistically...')
        # Set the weights realistically
        model = set_analog_model_weights(model, use_cuda=use_cuda)
        print(f'Weights setting time: {time.time()-t:.3f} s')
        t = time.time()
        # Save the model dump if needed
        if use_dmp:
            print('Model saving...')
            save_model(model, file, root_dir)
            print(f'Model saving time: {time.time()-t:.3f} s')
    print('Model generated')
    return model
if __name__ == '__main__':
    # Run Mask R-CNN (download a test image if needed)
    fname = 'test_mask_rcnn.jpg'
    if not os.path.isfile(fname):
        urllib.request.urlretrieve("https://unsplash.com/photos/lEwc9W5eLH0/download?force=true&w=1920", fname)
    # Get the image data
    im = read_image(fname)
    im_input = torch.unsqueeze(im, 0)
    im_input = F.convert_image_dtype(im_input, dtype=torch.float)
    if USE_CUDA:
        im_input = im_input.to(DEVICE)
    # Make the RPU configuration
    device = PulsedDevice()
    rpu_config = InferenceRPUConfig()
    rpu_config.device = device
    # Add a lot of noise to trigger the issue
    rpu_config.forward.inp_noise = 0.5
    # Get the analog model
    model = make_analog_maskrcnn_resnet50_fpn(rpu_config, use_cuda=USE_CUDA, use_dmp=False)
    # Test the model
    print('Inference...')
    t = time.time()
    model.eval()
    model(im_input)  # This should generate the issue
    print(f'Inference time: {time.time()-t:.3f} s')
Thanks for the example.
That does not seem to have anything to do with AIHWKIT. There is an issue with mask_features being empty:
    803         if self.mask_roi_pool is not None:
    804             mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
--> 805             mask_features = self.mask_head(mask_features)
    806             mask_logits = self.mask_predictor(mask_features)
    807         else:

ipdb> mask_features
tensor([], device='cuda:0', size=(0, 256, 14, 14), grad_fn=<IndexPutBackward0>)
Using an empty vector as an input to AnalogConv is not supposed to work. You could just use all-zeros instead, for instance, but it does not really make sense in any case, I would say.
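For illustration only, here is a minimal sketch (not part of the thread's code) of that all-zeros workaround: check for a zero-sized batch before it reaches the analog mask head and substitute a single all-zeros sample. The helper name forward_mask_head_safely is hypothetical, not a torchvision or aihwkit API.

import torch

def forward_mask_head_safely(mask_head, mask_features):
    """Hypothetical guard against feeding a zero-sized batch into analog layers."""
    if mask_features.size(0) == 0:
        # No region proposals survived, so the batch dimension is zero.
        # Substitute a single all-zeros sample so the analog convolutions
        # receive a non-empty input (its output can then be discarded).
        mask_features = torch.zeros(1, *mask_features.shape[1:],
                                    device=mask_features.device)
    return mask_head(mask_features)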
Thank you for your investigation @maljoras !
Description
I’m currently trying to run Mask R-CNN as an analog model, but I’m experiencing a division-by-zero exception for some device parameters. I’m running the model with an InferenceRPUConfig and a PowStepDevice with different ‘dw_min’ values. The issue seems to occur only for dw_min > 0.01.
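For reference, a minimal sketch of the configuration described above, mirroring the assignment pattern used in the reproduction script posted earlier in the thread; the dw_min value of 0.02 is only an example above the reported 0.01 threshold, not taken from the original script.

from aihwkit.simulator.configs.configs import InferenceRPUConfig
from aihwkit.simulator.configs.devices import PowStepDevice

# Sketch only: reported to fail once dw_min exceeds 0.01.
rpu_config = InferenceRPUConfig()
rpu_config.device = PowStepDevice(dw_min=0.02)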
How to reproduce
Other information
Error log
Traceback (most recent call last):
  File "examples/FTJ_segmentation/maskrcnn.py", line 334, in <module>
    output = inference(experiment_name, fname, rpu_config, i)
  File "examples/FTJ_segmentation/maskrcnn.py", line 268, in inference
    output = model(im_input)[0]
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/models/detection/generalized_rcnn.py", line 105, in forward
    detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/models/detection/roi_heads.py", line 805, in forward
    mask_features = self.mask_head(mask_features)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/container.py", line 204, in forward
    input = module(input)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/container.py", line 204, in forward
    input = module(input)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/aihwkit-0.6.2-py3.8-linux-x86_64.egg/aihwkit/nn/modules/conv.py", line 195, in forward
    out = self._forward_indexed(x_input)
  File "/usr/local/lib/python3.8/dist-packages/aihwkit-0.6.2-py3.8-linux-x86_64.egg/aihwkit/nn/modules/conv.py", line 164, in _forward_indexed
    input_size = x_input.numel() / x_input.size(0)
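For context, the failing expression divides by the batch dimension of the input, so an empty batch leads to 0 / 0. A minimal sketch that should reproduce the same error in isolation, assuming a CUDA-enabled aihwkit 0.6.2 build where the indexed convolution path shown in the traceback is taken:

import torch
from aihwkit.nn import AnalogConv2d

# Sketch only: feed a zero-sized batch, mimicking the empty mask_features
# tensor of shape (0, 256, 14, 14) seen in the thread.
layer = AnalogConv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1).cuda()
empty_input = torch.empty(0, 256, 14, 14).cuda()
layer(empty_input)  # expected to fail at input_size = x_input.numel() / x_input.size(0)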