warner-benjamin / optimi

Fast, Modern, Memory Efficient, and Low Precision PyTorch Optimizers
https://optimi.benjaminwarner.dev
MIT License

Increasing VRAM consumption when using Accelerator with the gradient release hook #5

Open sdbds opened 2 months ago

sdbds commented 2 months ago

Here is my implementation: https://github.com/kohya-ss/sd-scripts/pull/1381

I'm not sure if it is correct: https://github.com/kohya-ss/sd-scripts/blob/ed99b2180148258cde955106ce988781eca03006/sdxl_train.py#L502-L510
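
For context, a simplified sketch of the setup (not the exact code from the PR above; in particular, the ordering of `prepare_for_gradient_release` relative to `accelerator.prepare` and the use of `accelerator.unwrap_model` are assumptions for illustration):

```python
import torch
import torch.nn as nn
from accelerate import Accelerator
from optimi import AdamW
from optimi.gradientrelease import prepare_for_gradient_release

accelerator = Accelerator()
model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(inplace=True), nn.Linear(512, 10))
optimizer = AdamW(model.parameters(), lr=1e-3, gradient_release=True)

# Assumption: let Accelerate wrap the model and optimizer first, then
# register the per-parameter gradient release hooks on the underlying model.
model, optimizer = accelerator.prepare(model, optimizer)
prepare_for_gradient_release(accelerator.unwrap_model(model), optimizer)

criterion = nn.CrossEntropyLoss()
for step in range(200):
    x = torch.randn(64, 512, device=accelerator.device)
    y = torch.randint(0, 10, (64,), device=accelerator.device)
    loss = criterion(model(x), y)
    # With gradient release, the optimizer steps inside the backward pass
    # via the registered hooks, so there is no explicit optimizer.step().
    accelerator.backward(loss)
    if step % 50 == 0:
        print(f"step {step}: {torch.cuda.max_memory_allocated() / 2**20:.1f} MiB allocated")
```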

warner-benjamin commented 2 months ago

@sdbds Can you create a minimal reproduction of the memory leak? When I test with just PyTorch and optimi, memory usage is steady. See the example below.

Minimal gradient release training example:

```python
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from optimi import AdamW
from optimi.gradientrelease import prepare_for_gradient_release, remove_gradient_release

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper-parameters
num_epochs = 100
batch_size = 256
learning_rate = 0.001

# Image preprocessing modules
transform = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ]
)

# CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)

# Data loader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Convolutional neural network (VGG-like)
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(4096, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(512, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

model = ConvNet().to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate, gradient_release=True)
prepare_for_gradient_release(model, optimizer)

# Train the model
for epoch in range(num_epochs):
    epoch_max_memory = 0
    epoch_max_reserved_memory = 0
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimize with gradient release
        loss.backward()

        # Track maximum memory usage
        epoch_max_memory = max(epoch_max_memory, torch.cuda.max_memory_allocated())
        epoch_max_reserved_memory = max(epoch_max_reserved_memory, torch.cuda.max_memory_reserved())

    # Convert bytes to megabytes
    epoch_max_memory = epoch_max_memory / (1024 * 1024)
    epoch_max_reserved_memory = epoch_max_reserved_memory / (1024 * 1024)
    print(
        f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Max Allocated Memory: {epoch_max_memory:.2f} MB, Max Reserved Memory: {epoch_max_reserved_memory:.2f} MB"
    )
```
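
One caveat on the measurement itself: `torch.cuda.max_memory_allocated()` and `torch.cuda.max_memory_reserved()` report the peak since the start of the process (or the last reset), so the per-epoch maximums above can never decrease. To make epochs directly comparable, the peak counters can be reset at the top of each epoch with PyTorch's standard API:

```python
import torch

num_epochs = 100
for epoch in range(num_epochs):
    # Reset the cumulative peak counters so each epoch reports its own
    # peak rather than the running maximum since process start.
    torch.cuda.reset_peak_memory_stats()
    # ... rest of the training loop from the example above ...
```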
sdbds commented 2 months ago

> @sdbds Can you create a minimal reproduction of the memory leak? When I test with just PyTorch and optimi, memory usage is steady. See the example below.

Thanks for the reply. I will put together a minimal reproduction of the issue as soon as possible.