sdbds opened this issue 2 months ago
@sdbds Can you create a minimal reproduction of the memory leak? When I test with just PyTorch and optimi, memory usage is steady. See the example below.
Minimal gradient release training example
```python
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from optimi import AdamW
from optimi.gradientrelease import prepare_for_gradient_release, remove_gradient_release

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper-parameters
num_epochs = 100
batch_size = 256
learning_rate = 0.001

# Image preprocessing modules
transform = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ]
)

# CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)

# Data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Convolutional neural network (VGG-like)
class ConvNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(4096, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(512, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

model = ConvNet().to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate, gradient_release=True)
prepare_for_gradient_release(model, optimizer)

# Train the model
for epoch in range(num_epochs):
    # Reset the peak trackers so each epoch reports its own maximum instead
    # of the cumulative maximum since process start
    torch.cuda.reset_peak_memory_stats()
    epoch_max_memory = 0
    epoch_max_reserved_memory = 0
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass: with gradient release, the optimizer steps and frees
        # each gradient during backward, so there is no explicit
        # optimizer.step() or optimizer.zero_grad() here
        loss.backward()

        # Track maximum memory usage
        epoch_max_memory = max(epoch_max_memory, torch.cuda.max_memory_allocated())
        epoch_max_reserved_memory = max(epoch_max_reserved_memory, torch.cuda.max_memory_reserved())

    # Convert bytes to megabytes
    epoch_max_memory = epoch_max_memory / (1024 * 1024)
    epoch_max_reserved_memory = epoch_max_reserved_memory / (1024 * 1024)

    print(
        f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, "
        f"Max Allocated Memory: {epoch_max_memory:.2f} MB, "
        f"Max Reserved Memory: {epoch_max_reserved_memory:.2f} MB"
    )
```
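One note on the example: because `gradient_release=True` fuses the optimizer step into `loss.backward()`, the training loop never holds all gradients at once, which is the memory saving being measured. Once training is finished, the hooks can be torn down with the cleanup function the example already imports (a minimal sketch of the teardown):

```python
# Remove the gradient release hooks so the model can be trained or
# fine-tuned again with a standard optimizer.step() loop.
remove_gradient_release(model)
```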
Thanks for the reply. I will minimize the code to reproduce the issue as soon as possible.
Here is my implementation: https://github.com/kohya-ss/sd-scripts/pull/1381
I'm not sure if it is correct; the core pattern is sketched below. https://github.com/kohya-ss/sd-scripts/blob/ed99b2180148258cde955106ce988781eca03006/sdxl_train.py#L502-L510
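The idea is to step the optimizer per parameter as soon as that parameter's gradient is accumulated, roughly like the following. This is a simplified, illustrative sketch, not the actual sd-scripts code: the function and variable names are made up, it assumes PyTorch >= 2.1 for `register_post_accumulate_grad_hook`, and it uses plain `torch.optim.AdamW` just to keep the sketch self-contained.

```python
import torch

def attach_per_parameter_step(model: torch.nn.Module, lr: float = 1e-4):
    # One single-parameter optimizer per trainable tensor (hypothetical
    # helper for illustration only).
    optimizers = {}
    for param in model.parameters():
        if not param.requires_grad:
            continue
        optimizers[param] = torch.optim.AdamW([param], lr=lr)

        def hook(p: torch.Tensor) -> None:
            # Fires once p.grad is fully accumulated during backward:
            # step immediately, then free the gradient so the full set of
            # gradients never has to be resident at the same time.
            optimizers[p].step()
            optimizers[p].zero_grad(set_to_none=True)

        param.register_post_accumulate_grad_hook(hook)
    return optimizers
```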