adap / flower

Flower: A Friendly Federated Learning Framework
https://flower.ai
Apache License 2.0
4.59k stars 801 forks source link

Fatal Python error: Aborted #3165

Open GBX-Engineer opened 4 months ago

GBX-Engineer commented 4 months ago

Describe the bug

After successfully installing Flower, I ran the example code below and got this error. (Versions: python=1.10.0, pytorch=2.2.1, Flower=1.7.0.) Error picture

Steps/Code to Reproduce

from collections import OrderedDict
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import CIFAR10

import flwr as fl

# Device on which the client models are trained and evaluated.
DEVICE = torch.device("cpu")  # Try "cuda" to train on GPU
print(
    f"Training on {DEVICE} using PyTorch {torch.__version__} and Flower {fl.__version__}"
)

# Number of simulated clients; the training set is split into this many partitions.
NUM_CLIENTS = 10

def load_datasets(num_clients: int):
    """Download CIFAR-10 and build per-client train/validation loaders.

    The training set is split into ``num_clients`` partitions with a fixed
    seed (42); each partition is split again into 90 % train / 10 %
    validation.

    Args:
        num_clients: Number of partitions (simulated clients) to create.

    Returns:
        A tuple ``(trainloaders, valloaders, testloader)`` where the first two
        are lists with one ``DataLoader`` per client and ``testloader`` wraps
        the full CIFAR-10 test set.
    """
    # Download and transform CIFAR-10 (train and test)
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )
    trainset = CIFAR10("./dataset", train=True, download=True, transform=transform)
    testset = CIFAR10("./dataset", train=False, download=True, transform=transform)

    # Split training set into `num_clients` partitions to simulate different
    # local datasets. `random_split` requires the lengths to sum to
    # len(trainset), so any remainder is spread over the first partitions
    # instead of being silently dropped (which would raise).
    base_size, remainder = divmod(len(trainset), num_clients)
    partition_lengths = [
        base_size + (1 if idx < remainder else 0) for idx in range(num_clients)
    ]
    datasets = random_split(
        trainset, partition_lengths, torch.Generator().manual_seed(42)
    )

    # Split each partition into train/val and create DataLoader
    trainloaders = []
    valloaders = []
    for ds in datasets:
        len_val = len(ds) // 10  # 10 % validation set
        len_train = len(ds) - len_val
        ds_train, ds_val = random_split(
            ds, [len_train, len_val], torch.Generator().manual_seed(42)
        )
        trainloaders.append(DataLoader(ds_train, batch_size=32, shuffle=True))
        valloaders.append(DataLoader(ds_val, batch_size=32))
    testloader = DataLoader(testset, batch_size=32)
    return trainloaders, valloaders, testloader

# Build the per-client train/validation loaders and the test loader once, up front.
trainloaders, valloaders, testloader = load_datasets(NUM_CLIENTS)

class Net(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    The flatten step in ``forward`` hard-codes 16 * 5 * 5 features, which is
    what the two 5x5 convolutions and 2x2 poolings produce for 3x32x32 inputs.
    """

    def __init__(self) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the 10 raw class scores (logits) for each input image."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten the feature maps before the fully connected layers.
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def get_parameters(net) -> List[np.ndarray]:
    """Return every entry of ``net.state_dict()`` as a NumPy array, in order."""
    return [val.cpu().numpy() for val in net.state_dict().values()]

def set_parameters(net, parameters: List[np.ndarray]):
    """Load ``parameters`` into ``net``, pairing them with its state-dict keys.

    Args:
        net: Module whose state dict is overwritten in place.
        parameters: Arrays in the same order as ``net.state_dict().keys()``.

    Raises:
        RuntimeError: From ``load_state_dict(strict=True)`` when keys or
            shapes do not match.
    """
    params_dict = zip(net.state_dict().keys(), parameters)
    # torch.tensor (not torch.Tensor) so 0-d arrays, e.g. BatchNorm's
    # num_batches_tracked, are treated as values rather than as a size.
    state_dict = OrderedDict((k, torch.tensor(v)) for k, v in params_dict)
    net.load_state_dict(state_dict, strict=True)

def train(net, trainloader, epochs: int):
    """Train the network on the training set.

    Runs ``epochs`` passes over ``trainloader`` with Adam and cross-entropy
    loss, printing the loss and accuracy after each epoch.

    Args:
        net: Model to train; updated in place.
        trainloader: Loader yielding ``(images, labels)`` batches.
        epochs: Number of passes over ``trainloader``.
    """
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters())
    net.train()
    for epoch in range(epochs):
        correct, total, epoch_loss = 0, 0, 0.0
        for images, labels in trainloader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = net(images)
            # Reuse the forward pass above instead of running the model twice.
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Metrics
            # .item() keeps the running loss a plain float rather than a tensor.
            epoch_loss += loss.item()
            total += labels.size(0)
            correct += (torch.max(outputs.data, 1)[1] == labels).sum().item()
        epoch_loss /= len(trainloader.dataset)
        epoch_acc = correct / total
        print(f"Epoch {epoch+1}: train loss {epoch_loss}, accuracy {epoch_acc}")

def test(net, testloader):
    """Evaluate the network on the entire test set.

    Args:
        net: Model to evaluate; switched to eval mode.
        testloader: Loader yielding ``(images, labels)`` batches.

    Returns:
        ``(loss, accuracy)``: the summed per-batch cross-entropy loss divided
        by the dataset size, and the fraction of correctly predicted labels.
    """
    criterion = torch.nn.CrossEntropyLoss()
    correct, total, loss = 0, 0, 0.0
    net.eval()
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = net(images)
            loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    loss /= len(testloader.dataset)
    accuracy = correct / total
    return loss, accuracy

class FlowerClient(fl.client.NumPyClient):
    """Flower client that trains and evaluates one model on one data partition."""

    def __init__(self, cid, net, trainloader, valloader):
        self.cid = cid
        self.net = net
        self.trainloader = trainloader
        self.valloader = valloader

    def get_parameters(self, config):
        """Return the current model parameters as a list of NumPy arrays."""
        print(f"[Client {self.cid}] get_parameters")
        return get_parameters(self.net)

    def fit(self, parameters, config):
        """Load ``parameters``, train for one epoch and return the new parameters."""
        print(f"[Client {self.cid}] fit, config: {config}")
        set_parameters(self.net, parameters)
        train(self.net, self.trainloader, epochs=1)
        # NOTE(review): len(DataLoader) is the number of batches, not of
        # examples -- confirm this is the count the strategy should weight by.
        return get_parameters(self.net), len(self.trainloader), {}

    def evaluate(self, parameters, config):
        """Load ``parameters`` and return loss, size and accuracy on the validation loader."""
        print(f"[Client {self.cid}] evaluate, config: {config}")
        set_parameters(self.net, parameters)
        loss, accuracy = test(self.net, self.valloader)
        return float(loss), len(self.valloader), {"accuracy": float(accuracy)}

def client_fn(cid) -> FlowerClient:
    """Create the client for partition ``cid`` with a fresh model on ``DEVICE``.

    ``cid`` is converted with ``int()`` and used as an index into the
    module-level ``trainloaders`` and ``valloaders`` lists.
    """
    net = Net().to(DEVICE)
    trainloader = trainloaders[int(cid)]
    valloader = valloaders[int(cid)]
    return FlowerClient(cid, net, trainloader, valloader)

# Create FedAvg strategy
strategy = fl.server.strategy.FedAvg(
    fraction_fit=1.0,  # Sample 100% of available clients for training
    fraction_evaluate=0.5,  # Sample 50% of available clients for evaluation
    min_fit_clients=10,  # Never sample less than 10 clients for training
    min_evaluate_clients=5,  # Never sample less than 5 clients for evaluation
    min_available_clients=10,  # Wait until all 10 clients are available
)

# Specify the resources each of your clients need. By default, each
# client will be allocated 1x CPU and 0x GPUs
# (This single definition replaces an earlier `client_resources = None`
# assignment that was unconditionally overwritten here.)
client_resources = {"num_cpus": 1, "num_gpus": 0.0}
if DEVICE.type == "cuda":
    # here we are assigning an entire GPU for each client.
    client_resources = {"num_cpus": 1, "num_gpus": 1.0}
    # Refer to our documentation for more details about Flower Simulations
    # and how to setup these `client_resources`.

# Start simulation
fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=NUM_CLIENTS,
    config=fl.server.ServerConfig(num_rounds=5),
    strategy=strategy,
    client_resources=client_resources,
)

Expected Results

The simulation should run to completion without a fatal error. Please help me solve this problem.

Actual Results

None

jafermarq commented 4 months ago

@GBX-Engineer, what example were you following?