Open HarryXD2018 opened 6 months ago
Update: I figured out a solution by calling:
dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
in the main() function. Then, I ran into the error below:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/xxx/miniconda3/envs/a2p_env/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/home/xxx/code/audio2photoreal/train/train_diffusion.py", line 77, in main
TrainLoop(
File "/home/xxx/code/audio2photoreal/train/training_loop.py", line 151, in run_loop
self.run_step(motion, cond)
File "/home/xxx/code/audio2photoreal/train/training_loop.py", line 175, in run_step
self.forward_backward(batch, cond)
File "/home/xxx/code/audio2photoreal/train/training_loop.py", line 201, in forward_backward
losses = compute_losses()
File "/home/xxx/code/audio2photoreal/diffusion/respace.py", line 110, in training_losses
return super().training_losses(self._wrap_model(model), *args, **kwargs)
File "/home/xxx/code/audio2photoreal/diffusion/respace.py", line 121, in _wrap_model
return _WrappedModel(
File "/home/xxx/code/audio2photoreal/diffusion/respace.py", line 135, in __init__
self.add_frame_cond = model.add_frame_cond
File "/home/xxx/miniconda3/envs/a2p_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1614, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'DistributedDataParallel' object has no attribute 'add_frame_cond'
My modified train_diffusion.py is shown below for your reference:
"""
Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.
This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import json
import os
import torch
import torch.multiprocessing as mp
from data_loaders.get_data import get_dataset_loader, load_local_data
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter
from train.train_platforms import ClearmlPlatform, NoPlatform, TensorboardPlatform
from train.training_loop import TrainLoop
from utils.diff_parser_utils import train_args
from utils.misc import cleanup, fixseed, setup_dist
from utils.model_util import create_model_and_diffusion
def main(rank: int, world_size: int) -> None:
    """Run diffusion-model training in one process of a (possibly) multi-GPU job.

    Args:
        rank: Index of this process. It is also passed to ``model.to`` and to
            DDP as the device index.
        world_size: Total number of processes taking part in training. The
            model is wrapped in DDP only when this is greater than 1.

    Raises:
        ValueError: If ``args.train_platform_type`` does not name one of the
            known training platform classes.
        FileNotFoundError: On rank 0, if ``args.save_dir`` is ``None``.
        FileExistsError: On rank 0, if ``args.save_dir`` already exists and
            ``args.overwrite`` is not set.
    """
    args = train_args()
    fixseed(args.seed)

    # Resolve the platform class through an explicit table instead of eval():
    # the name comes from the parsed arguments, and eval() would execute any
    # expression placed there.
    train_platforms = {
        "ClearmlPlatform": ClearmlPlatform,
        "NoPlatform": NoPlatform,
        "TensorboardPlatform": TensorboardPlatform,
    }
    try:
        train_platform_type = train_platforms[args.train_platform_type]
    except KeyError:
        raise ValueError(
            f"Unknown train_platform_type [{args.train_platform_type}]; "
            f"expected one of {sorted(train_platforms)}."
        ) from None
    train_platform = train_platform_type(args.save_dir)
    train_platform.report_args(args, name="Args")

    setup_dist(args.device)

    # Rendezvous address read by init_method='env://'.
    # NOTE(review): these values are hard-coded and overwrite anything already
    # set in the environment -- confirm whether they should be configurable.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '9000'
    # Initialize the distributed environment
    dist.init_process_group(
        backend='nccl', init_method='env://', world_size=world_size, rank=rank
    )

    # Only rank 0 validates/creates the output directory and dumps the args.
    if rank == 0:
        if args.save_dir is None:
            raise FileNotFoundError("save_dir was not specified.")
        if os.path.exists(args.save_dir) and not args.overwrite:
            raise FileExistsError(f"save_dir [{args.save_dir}] already exists.")
        os.makedirs(args.save_dir, exist_ok=True)
        args_path = os.path.join(args.save_dir, "args.json")
        with open(args_path, "w") as fw:
            json.dump(vars(args), fw, indent=4, sort_keys=True)

    # If data_root is missing, retry with "/home/" replaced by "/derived/".
    if not os.path.exists(args.data_root):
        args.data_root = args.data_root.replace("/home/", "/derived/")

    data_dict = load_local_data(args.data_root, audio_per_frame=1600)
    print("creating data loader...")
    data = get_dataset_loader(args=args, data_dict=data_dict)

    print("creating logger...")
    writer = SummaryWriter(args.save_dir)

    print("creating model and diffusion...")
    model, diffusion = create_model_and_diffusion(args, split_type="train")
    model.to(rank)

    if world_size > 1:
        model = DDP(
            model, device_ids=[rank], output_device=rank, find_unused_parameters=True
        )

    # DDP keeps the wrapped network under ``.module``; model-specific methods
    # such as parameters_w_grad() have to be reached through it.
    params = (
        model.module.parameters_w_grad()
        if world_size > 1
        else model.parameters_w_grad()
    )
    total_params = sum(p.numel() for p in params)
    print(f"Total params: {total_params / 1000000.0:.2f}M")

    print("Training...")
    TrainLoop(
        args, train_platform, model, diffusion, data, writer, rank, world_size
    ).run_loop()
    train_platform.close()
    cleanup()
if __name__ == "__main__":
    # Launch one training process per CUDA device reported by torch.
    world_size = torch.cuda.device_count()
    print(f"using {world_size} gpus")
    if world_size <= 1:
        # Zero or one device: run directly in this process as rank 0.
        main(rank=0, world_size=1)
    else:
        # mp.spawn passes the process index as the first argument (rank).
        mp.spawn(main, args=(world_size,), nprocs=world_size, join=True)
Update: now I finally successfully executed the code, with the last modification as:
self.add_frame_cond = False
Note that this is only a solution for training tasks with data_format face.
One more hint for those who are training on multiple GPUs: you may change the checkpoint-loading call to
cp = torch.load(cp_path, map_location='cpu')
This will save a lot of memory on GPU #0.
:)
Hi!! Sorry for the delay, and thanks so much for debugging and finding the solutions to this issue! I really appreciate the active effort on this! :) Please let me know if there's anything else I can help with.
One more hint for those who are training on multiple GPUs: you may change the checkpoint-loading call to
cp = torch.load(cp_path, map_location='cpu')
This will save a lot of memory on GPU #0.
:)
What is the total training time and how many GPUs are used? What GPUs are these?
Thanks
Did you encounter NaN errors when training the model for face?
well, you can just change like this
Hi, when I was trying to train the model (train.train_diffusion.py) with multiple GPUs (tested on V100s and 2080Tis), I ran into the error below. My training command is:
Do you have any idea? Many thanks!