very slow in multi-gpu - Githubissues

YoungDav commented 4 years ago

Environment (requirements): numpy==1.15.4 graphviz==0.8.4 torch==1.0.0 torchvision==0.2.1 tensorboard==1.13.0 tensorboardX==1.6

When I use 4 GPUs (GPU-Util about 4-21%), training is much, much slower than when I use only 1 GPU (GPU-Util about 30-90%). Why?

JiyueWang commented 4 years ago

I have the same problem. Have you solved it?

YoungDav commented 4 years ago

I have the same problem. Have you solved it?

no

mk-minchul commented 4 years ago

I faced the same problem and found a solution by using DistributedDataParallel from Nvidia apex.

luzai commented 4 years ago

Hi @mk-minchul, could you share your DistributedDataParallel solution? It would be very helpful for increasing the batch size and speeding up the search process with multiple GPUs. Thank you very much!

innovatedmonster commented 12 months ago

I faced the same problem. I don't know why the original method using DataParallel runs slower than the single-GPU one, but it does run faster on multiple GPUs when DistributedDataParallel is used.

(1) The following code block is not from the original repository; (2) it has 3 main parts: preparing the environment, the main method run on each GPU, and the multi-process launcher; (3) I strongly recommend this reference: https://blog.csdn.net/weixin_41978699/article/details/121412128

# 1.prepare the env
import torch.distributed as dist
import torch.utils.data.distributed
from torch.multiprocessing import Process

# Rendezvous settings for torch.distributed: all workers run on this machine
# and meet on a fixed local port.
os.environ.update({
    'MASTER_ADDR': 'localhost',
    'MASTER_PORT': '12335',
})
# Module-level default device (first GPU).
device = torch.device("cuda", 0)

#2.main method for each gpu
def main(rank):
    """Run one DistributedDataParallel worker bound to GPU ``rank``.

    Builds the dataset split, the search model, both optimizers, the
    distributed data loaders and the LR scheduler, then wraps the model in
    ``DistributedDataParallel``. The training loop itself is omitted from
    this excerpt.

    Args:
        rank: Index of this process in the process group. It is also used as
            the CUDA device index (``cuda:<rank>``) for this worker.
    """
    # Bind this process to its own GPU. Deriving the device from `rank` here
    # (instead of reading a module-level `device`) keeps every worker on the
    # right GPU regardless of the multiprocessing start method.
    device = torch.device("cuda:" + str(rank))
    torch.cuda.set_device(device)

    # Data parallelism: join the process group, one process per GPU.
    # config.gpus is a list such as [0, 1, 2, ...]; each number is one GPU.
    dist.init_process_group("nccl", rank=rank, world_size=len(config.gpus))

    # Prepare the dataset and the model.
    input_size, input_channels, n_classes, train_data = utils.get_data(
        config.dataset, config.data_path, cutout_length=0, validation=False)

    net_crit = nn.CrossEntropyLoss().to(device)
    # NOTE(review): every worker receives the full GPU list here; if
    # SearchCNNController uses device_ids for its own multi-GPU scatter this
    # probably should be [rank] - confirm against the controller.
    model = SearchCNNController(input_channels, config.init_channels, n_classes, config.layers,
                                net_crit, device_ids=config.gpus)

    # weights optimizer
    w_optim = torch.optim.SGD(model.weights(), config.w_lr, momentum=config.w_momentum,
                              weight_decay=config.w_weight_decay)
    # alphas optimizer
    alpha_optim = torch.optim.Adam(model.alphas(), config.alpha_lr, betas=(0.5, 0.999),
                                   weight_decay=config.alpha_weight_decay)

    # split data to train/validation (first half / second half)
    n_train = len(train_data)
    split = n_train // 2
    indices = list(range(n_train))

    train_dataset = torch.utils.data.Subset(train_data, indices[:split])
    val_dataset = torch.utils.data.Subset(train_data, indices[split:])
    # NOTE(review): the omitted training loop should call
    # train_sampler.set_epoch(epoch) each epoch so shuffling differs per epoch.
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    valid_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)

    # Data parallelism, data handling part 2: without these samplers the batch
    # is not divided between processes, see
    # https://blog.csdn.net/weixin_41978699/article/details/121412128
    # Pitfall: once batch_sampler is given, batch_size, shuffle, sampler and
    # drop_last must not be passed to DataLoader.
    # Important: BatchSampler fixes the batch size run on EACH GPU, so the
    # effective total batch size is config.batch_size * number of GPUs.
    train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, config.batch_size, drop_last=True)
    valid_batch_sampler = torch.utils.data.BatchSampler(valid_sampler, config.batch_size, drop_last=True)

    # batch_size= and sampler= are intentionally absent (see pitfall above).
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_sampler=train_batch_sampler,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(val_dataset,
                                               batch_sampler=valid_batch_sampler,
                                               num_workers=config.workers,
                                               pin_memory=True)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_lr_min)

    model = model.to(device)
    # New-style data parallelism. Mind the position of the wrapping statement:
    # it comes after the optimizers are built and the model is on its device.
    if torch.cuda.device_count() > 1:
        print(f'Using {torch.cuda.device_count()} GPUs.')
        model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])

    # NOTE(review): when wrapped, `model` is the DistributedDataParallel
    # object; custom methods of the search model live on `model.module` -
    # confirm which one Architect expects.
    architect = Architect(model, config.w_momentum, config.w_weight_decay)
    # omitted...

#3.multi-process to run on multi-gpu
if __name__ == "__main__":
    # Launch one worker process per configured GPU, then wait for all of them.
    size = len(config.gpus)
    processes = []
    for rank in range(size):
        # Rebind the module-level `device` before starting each worker;
        # presumably so that a forked child sees its own GPU through the
        # global - confirm against the start method in use.
        device = torch.device(f"cuda:{rank}")
        worker = Process(target=main, args=(rank,))
        worker.start()
        processes.append(worker)

    for worker in processes:
        worker.join()

Notice: (1) after model = nn.parallel.DistributedDataParallel(model, device_ids=[rank]), model should be replaced with model.module wherever the original model's own attributes or methods are needed, because the model is now wrapped inside the DistributedDataParallel object. (2) For data parallelism to work, we need to specify how the batch is divided among the GPUs, so it is necessary to use train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) and train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, config.batch_size, drop_last=True), and:

 train_loader = torch.utils.data.DataLoader(
     train_dataset,
     # batch_size= and sampler= are deliberately left out: they must not be
     # passed together with batch_sampler.
     batch_sampler=train_batch_sampler,
     num_workers=config.workers,
     pin_memory=True,
 )

khanrc / pt.darts

very slow in multi-gpu #29