Open f2010126 opened 1 year ago
Add Link

The tutorial linked here, which combines PyTorch DataParallel (DP) and Ray Tune, does not work when gpus_per_trial > 1. Link: https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html

Describe the bug

Using the code from the tutorial without any changes:
```python
from functools import partial
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.air import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler


def load_data(data_dir="./data"):
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform
    )

    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform
    )

    return trainset, testset


class Net(nn.Module):
    def __init__(self, l1=120, l2=84):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def train_cifar(config, data_dir=None):
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    checkpoint = session.get_checkpoint()

    if checkpoint:
        checkpoint_state = checkpoint.to_dict()
        start_epoch = checkpoint_state["epoch"]
        net.load_state_dict(checkpoint_state["net_state_dict"])
        optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0

    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs]
    )

    trainloader = torch.utils.data.DataLoader(
        train_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
    )
    valloader = torch.utils.data.DataLoader(
        val_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
    )

    print('Starting training')
    for epoch in range(start_epoch, 1):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            print(f'Stepped')

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print(
                    "[%d, %5d] loss: %.3f"
                    % (epoch + 1, i + 1, running_loss / epoch_steps)
                )
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        checkpoint_data = {
            "epoch": epoch,
            "net_state_dict": net.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
        }
        checkpoint = Checkpoint.from_dict(checkpoint_data)

        session.report(
            {"loss": val_loss / val_steps, "accuracy": correct / total},
            checkpoint=checkpoint,
        )
    print("Finished Training")


def test_accuracy(net, device="cpu"):
    trainset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2
    )

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total


def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    load_data(data_dir)
    config = {
        "l1": tune.choice([2**i for i in range(9)]),
        "l2": tune.choice([2**i for i in range(9)]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16]),
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2,
    )
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
    print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint = best_trial.checkpoint.to_air_checkpoint()
    best_checkpoint_data = best_checkpoint.to_dict()

    best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"])
    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    main(num_samples=3, max_num_epochs=10, gpus_per_trial=2)
```
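For context on the hang described below, here is a small sketch (my addition, not part of the tutorial) of what I would call at the top of train_cifar to confirm what the trial process actually sees. As far as I understand, Ray restricts each trial to its assigned GPUs via CUDA_VISIBLE_DEVICES, so torch.cuda.device_count() inside the trial should report 2 when gpus_per_trial=2.

```python
# Sanity-check sketch (my addition, not part of the tutorial):
# print what the Ray trial process actually sees before DataParallel is set up.
import os
import torch


def report_gpu_visibility():
    # Ray is expected to expose the trial's GPUs through CUDA_VISIBLE_DEVICES.
    print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
    print("torch.cuda.is_available():", torch.cuda.is_available())
    print("torch.cuda.device_count():", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"  device {i}: {torch.cuda.get_device_name(i)}")
```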
Expected: The run works using 2 GPUs per trial.

Observed: The code starts and the data loaders are created, but everything hangs when the distributed process group is being created.
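To get more detail on where it stops, I would enable verbose NCCL and torch.distributed logging (again my own addition; these are standard debug environment variables). I am not sure whether variables set in the driver propagate to the Ray trial workers, so setting them at the top of train_cifar, before any multi-GPU communication happens, seems safest.

```python
# Debug-logging sketch (my addition): set before any multi-GPU communication,
# e.g. at the top of train_cifar, so the hang location shows up in the trial logs.
import os

os.environ.setdefault("NCCL_DEBUG", "INFO")                 # verbose NCCL output
os.environ.setdefault("NCCL_DEBUG_SUBSYS", "INIT,COLL")     # focus on init/collectives
os.environ.setdefault("TORCH_DISTRIBUTED_DEBUG", "DETAIL")  # extra c10d diagnostics
```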
Describe your environment

OS: Ubuntu 22.04 on a SLURM cluster
Python: 3.10
torch==2.0.1
ray==2.6.3

nvcc -V output:

Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0

nvidia-smi output:

NVIDIA-SMI 530.30.02    Driver Version: 530.30.02    CUDA Version: 12.1
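To help narrow this down, this is the kind of minimal standalone check I would run on the same node, outside Ray but inside the same SLURM allocation (my own sketch, not part of the tutorial). It exercises nn.DataParallel across both GPUs directly; if this also hangs, the problem is likely in the multi-GPU path on the node rather than in Ray Tune.

```python
# Standalone DataParallel check (my sketch, independent of Ray Tune).
# Run directly on the node with both GPUs visible.
import torch
import torch.nn as nn


def main():
    assert torch.cuda.device_count() >= 2, "needs at least 2 visible GPUs"
    # Wrap a trivial model; its parameters must live on device_ids[0] (cuda:0 by default).
    model = nn.DataParallel(nn.Linear(32, 10)).to("cuda:0")
    x = torch.randn(64, 32, device="cuda:0")
    y = model(x)        # scatter inputs, replicate the module, gather outputs
    print("forward OK:", y.shape)
    y.sum().backward()  # gradients are reduced back to the default device
    print("backward OK")


if __name__ == "__main__":
    main()
```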
cc @sekyondaMeta @svekars @carljparker @NicolasHug @kit1980 @subramen
/assigntome
This issue has been unassigned due to inactivity. If you are working on this issue, assign it to yourself and send a PR ASAP.