IntelLabs / FP8-Emulation-Toolkit

PyTorch extension for emulating FP8 data formats on standard FP32 Xeon/GPU hardware.
BSD 3-Clause "New" or "Revised" License
100 stars 10 forks source link

TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not NoneType #12

Open Siris-Li opened 11 months ago

Siris-Li commented 11 months ago

Hello, I integrated your FP8 emulator into the training process of my LeNet (2 conv layers, 3 fc layers).

When I set list_exempt_layers = ["conv1"], everything works well. However, when I set list_exempt_layers = ["fc1"] (i.e., no conv layer is in the exempt list), the code raises the following error: TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not NoneType. It seems I must include at least one conv layer in list_exempt_layers for the code to run correctly.

My environment is Python 3.9, torch=2.1, cuda=12.3

Code is here:

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mpemu import mpt_emu
try:
    from apex import amp
except ImportError:
    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")

# ----------------------- 1. Load and normalize CIFAR10 ---------------------- #

# Preprocessing applied to every image: convert to a tensor, then normalize
# each of the three channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Mini-batch size shared by the train and test loaders below.
batch_size = 4

# Training split; downloaded into ./data if it is not already there.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
# Training batches are shuffled and loaded by 2 worker processes.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

# Test split, using the same preprocessing as the training split.
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
# Test batches keep their original order (no shuffling).
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

# Human-readable names of the 10 classes; not referenced by the rest of the
# visible script.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# ----------------- 2. Define a Convolutional Neural Network ----------------- #

class LeNet(nn.Module):
    """LeNet-style CNN: two convolutions followed by three linear layers.

    The network consumes 3-channel images and emits 10 scores per sample.
    ``fc1`` expects 16*5*5 flattened features from the second pooling stage.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: two 5x5 convolutions (3 -> 6 -> 16 channels).
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Classifier head: 400 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the forward pass and return the raw (un-normalized) scores."""
        # Each conv stage is conv -> ReLU -> 2x2 max-pooling.
        stage1 = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), 2)
        # Collapse every non-batch dimension before the linear layers.
        flat = stage2.view(-1, self.num_flat_features(stage2))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the number of elements per sample (all dims except batch)."""
        # Slicing a torch.Size yields a torch.Size; numel() is the product
        # of its entries (1 when there are no trailing dimensions).
        return x.size()[1:].numel()

# Select the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)

# Build the network and move its parameters to the selected device.
model = LeNet()
model.to(device)

# ------------------ 3. Define a Loss function and optimizer ----------------- #

# Cross-entropy loss on the 10 output scores, optimized by SGD with momentum.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# --------------------------- 4. Train the network --------------------------- #

# layers exempt from FP8 conversion
# NOTE(review): per the accompanying report, exempting only "fc1" (no conv
# layer in the list) is the configuration that raises the zeros_like()
# TypeError; with ["conv1"] the script runs.
list_exempt_layers = ["fc1"]

# fused layers will be exempt from converting output tensor to FP8, the following layer will read from FP32 buffer.
list_layers_output_fused = None

# Wrap the model and optimizer with apex AMP first (opt_level "O2").
model, optimizer = amp.initialize(model, optimizer,
                                    opt_level="O2",
                                    keep_batchnorm_fp32=True
                                    )
# Then wrap the AMP model with the FP8 emulator. The 'hybrid' training method
# is used here (options: direct, hybrid). Note that device is hard-coded to
# "cuda" rather than taken from the `device` selected earlier in the script.
model, emulator = mpt_emu.initialize(model, optimizer, training_algo="hybrid",
                              list_exempt_layers=list_exempt_layers, list_layers_output_fused=list_layers_output_fused,
                              device="cuda", verbose=True)

# Train for two passes over the training set, then save the weights.
for epoch in range(2):  # loop over the dataset multiple times

    # Tell the emulator how many mini-batches were processed before this epoch.
    emulator.update_global_steps(epoch*len(trainloader))
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device) # move the batch to the selected device

        # zero the parameter gradients
        optimizer.zero_grad()
        # forward pass and loss
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # backward on the AMP-scaled loss, then step through the emulator
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        emulator.optimizer_step(optimizer)

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            # Report the mean loss over the last 2000 mini-batches, then reset.
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')
# Save the trained parameters to the current working directory.
PATH = './cifar_net.pth'
torch.save(model.state_dict(), PATH)

# ------------------- 5. Test the network on the test data ------------------- #
# Running counts of correctly classified and total test samples.
correct = 0
total = 0
# Gradients are not needed for evaluation.
with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        outputs = model(images)
        # The predicted class is the index of the largest score in each row.
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted==labels).sum().item()

# %d truncates the percentage to an integer.
print('Accuracy of the network on the 10000 test images: %d %%' %
      (100*correct/total))
Siris-Li commented 11 months ago

I have just read your arXiv'19 paper, "Mixed Precision Training With 8-bit Floating Point". In Section 4, "Experiments and Results", there is a sentence saying: "For these convolution networks, the first convolution and the last fully-connected (FC) layers are maintained at a higher precision (16-bit) to maintain the model accuracy." Is that the reason for the NoneType error?

wzzll123 commented 1 week ago

I ran into the same issue and found that the cause is that the first layer's grad_input is always None:

import torch
import torch.nn as nn
import torch.optim as optim

# Define a simple DNN
class SimpleDNN(nn.Module):
    """Three-layer fully connected network (10 -> 20 -> 10 -> 1)."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 1)

    def forward(self, x):
        """Apply fc1 and fc2 with ReLU activations, then fc3 without one."""
        hidden1 = torch.relu(self.fc1(x))
        hidden2 = torch.relu(self.fc2(hidden1))
        return self.fc3(hidden2)

# Function to check whether the gradient input is None
def check_gradients(module, grad_input, grad_output):
    """Backward hook that reports which entries of ``grad_input`` are None.

    Prints one line holding the module's class name and a list with one
    boolean per ``grad_input`` entry (True where the entry is None).
    ``grad_output`` is accepted to match the hook signature but is unused.
    Returns None.
    """
    layer_type = module.__class__.__name__
    missing = [entry is None for entry in grad_input]
    print(f"{layer_type} grad_input contains None: {missing}")

# Initialize the model, loss function, and optimizer
model = SimpleDNN()
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Register backward hooks for each layer
# (only nn.Linear submodules get the hook; `name` is not used)
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        module.register_full_backward_hook(check_gradients)

# Dummy input and target
# NOTE(review): `inputs` is created without requires_grad, so presumably
# autograd computes no gradient with respect to it. That would explain why
# the first layer (fc1) reports a None grad_input. Confirm against the
# register_full_backward_hook documentation.
inputs = torch.randn(5, 10)  # Batch of 5, input size 10
targets = torch.randn(5, 1)  # Batch of 5, target size 1

# Forward pass
outputs = model(inputs)
loss = criterion(outputs, targets)

# Backward pass
# (the registered hooks print during loss.backward())
optimizer.zero_grad()
loss.backward()
optimizer.step()

And the output is:

Linear grad_input contains None: [False]
Linear grad_input contains None: [False]
Linear grad_input contains None: [True]