Closed ChenyiZhang007 closed 1 month ago
In fact, 120 epochs is enough for training. I wrote 300 epochs casually when summarizing the code. However, single-card training is very slow. I can give the code for single-machine multi-card distributed training.
ok, looking forward to the update
import argparse
import os
import random
import warnings

import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torch.utils.data.distributed

import dataset
import laploss
import model
# CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 train_dist.py
# Command-line interface of the distributed training script.
parser = argparse.ArgumentParser(description='AEMatter Training')
parser.add_argument('-j',
                    '--workers',
                    default=4,
                    type=int,
                    metavar='N',
                    help='number of data loading workers (default: 4)')
# ``torch.distributed.launch`` passes ``--local_rank`` before PyTorch 2.0 and
# ``--local-rank`` from 2.0 onwards, while ``torchrun`` passes neither and only
# exports the LOCAL_RANK environment variable.  Accept both spellings and fall
# back to the environment so the script starts under any of the launchers.
parser.add_argument('--local_rank', '--local-rank',
                    dest='local_rank',
                    default=int(os.environ.get('LOCAL_RANK', -1)),
                    type=int,
                    help='node rank for distributed training')
parser.add_argument('--seed', default=None, type=int, help='seed for initializing training. ')

# Module-level placeholder kept for backward compatibility; the training loop
# in this file never updates it.
best_acc1 = 0
# Substrings that exempt a parameter from weight decay when they occur
# anywhere in its (dotted) name.
_NO_DECAY_KEYWORDS = ('start_conv', 'bias', 'convo', 'conv5', 'conv4', 'conv3')


def weighted_loss(pd, gt, wl=0.9, epsilon=1e-12, tri=None):
    """Return a smoothed absolute error restricted to the ``tri == 1`` region.

    The per-pixel term is ``sqrt(diff**2 + epsilon)`` where ``diff`` is zeroed
    wherever ``tri != 1``; masked-out pixels therefore still contribute
    ``sqrt(epsilon)`` each.  The sum is divided by the number of ``tri == 1``
    pixels plus 50, which keeps the result finite when the region is empty.

    Args:
        pd: Prediction of shape ``(batch, channels, height, width)``.
        gt: Ground truth, viewable as ``(batch, 1, height, width)``.
        wl: Unused; kept so existing callers keep working.
        epsilon: Constant added under the square root.
        tri: Region map, viewable as ``(batch, 1, height, width)``.  Required
            despite the ``None`` default (``None`` raises ``AttributeError``).

    Returns:
        A scalar tensor holding the normalised loss.
    """
    bs, _, h, w = pd.shape
    alpha_gt = gt.view(bs, 1, h, w)
    region = tri.view(bs, 1, h, w) == 1
    diff = (pd - alpha_gt).float() * region
    per_pixel = torch.sqrt(diff * diff + epsilon)
    return per_pixel.sum() / (torch.sum(region) + 50.)


def get_param(model):
    """Split the parameters of ``model`` into two optimizer parameter groups.

    Args:
        model: Any object exposing ``named_parameters()``.

    Returns:
        ``[nodecay, decay]``: two dicts in the format accepted by
        ``torch.optim`` optimizers.  Parameters whose name contains one of
        ``_NO_DECAY_KEYWORDS`` get ``weight_decay=0``; all others get
        ``weight_decay=1e-6``.
    """
    nodecay = {'params': [], 'weight_decay': 0}
    decay = {'params': [], 'weight_decay': 1e-6}
    for name, param in model.named_parameters():
        exempt = any(keyword in name for keyword in _NO_DECAY_KEYWORDS)
        (nodecay if exempt else decay)['params'].append(param)
    return [nodecay, decay]


def focalc(outputs, targets):
    """Return the mean focal loss (alpha=1, gamma=2) of ``outputs`` vs ``targets``.

    Not called by ``main_worker``; kept because it was part of the script.

    Args:
        outputs: Logits accepted by ``torch.nn.functional.cross_entropy``.
        targets: Matching class targets.
    """
    alpha = 1
    gamma = 2
    ce_loss = torch.nn.functional.cross_entropy(outputs, targets,
                                                reduction='none')
    pt = torch.exp(-ce_loss)
    return (alpha * (1 - pt) ** gamma * ce_loss).mean()


def main_worker(gpu, ngpus_per_node, args=None):
    """Run the AEMatter training loop in one distributed (NCCL) process.

    Args:
        gpu: CUDA device index used by this process for the model, the batch
            tensors and the DDP ``device_ids``.
        ngpus_per_node: Unused; kept for interface compatibility.
        args: Parsed command-line namespace.  Unused here (the device comes
            from ``gpu``); kept for interface compatibility.

    Side effects:
        Initialises the default process group, prints running losses every
        100 iterations and, on ``gpu == 0`` for epochs above 25, writes
        ``./ckpt/<epoch>_<gpu>aem.ckpt``.
    """
    torch.backends.cudnn.benchmark = True
    # Bind the process to its device before NCCL starts so collectives and
    # default CUDA allocations do not all land on device 0.
    torch.cuda.set_device(gpu)
    dist.init_process_group(backend='nccl')

    # NOTE(review): 1024 is presumably the training crop size - confirm
    # against dataset.BasicData.
    h_dataset = dataset.BasicData(1024)
    h_trainsampler = torch.utils.data.distributed.DistributedSampler(h_dataset)
    # NOTE(review): num_workers is fixed at 2 here; the --workers flag does
    # not reach the loader.
    train_loader = torch.utils.data.DataLoader(
        h_dataset, batch_size=1, num_workers=2, shuffle=None,
        drop_last=True, sampler=h_trainsampler)

    segmodel = model.AEMatter()
    segmodel = segmodel.cuda(gpu)
    # Use ``gpu`` rather than ``args.local_rank``: the entry point passes the
    # same value for both, and this also works when ``args`` is None.
    segmodel = torch.nn.parallel.DistributedDataParallel(
        segmodel, device_ids=[gpu], find_unused_parameters=True)
    segmodel.train()

    optim_g = torch.optim.RAdam(get_param(segmodel), 2.0 * 1e-5,
                                betas=(0.5, 0.999))
    # Cosine schedule with T_max=120 and eta_min=1e-7, stepped once per epoch.
    # The loop below runs 125 epochs, i.e. 5 epochs past T_max.
    sl = torch.optim.lr_scheduler.CosineAnnealingLR(optim_g, 120, 1e-7)
    l1loss = nn.L1Loss().cuda(gpu)
    mloss = laploss.lap_loss().cuda(gpu)
    scaler = torch.cuda.amp.GradScaler()

    if gpu == 0:
        # torch.save does not create missing directories.
        os.makedirs('./ckpt', exist_ok=True)

    for epoch in range(125):
        # Without this the DistributedSampler reuses the same shuffle order
        # in every epoch.
        h_trainsampler.set_epoch(epoch)
        print('PreTrain_Start', epoch)
        step = 0
        L = 0
        L_alpha1 = 0
        L_alpha2 = 0
        for data in train_loader:
            # Each batch is a 6-tuple; only items 1, 3 and 4 are used.
            _, mgt, _, Tfseg, Talpha, _ = data
            mgt = mgt.cuda(gpu, non_blocking=True)
            Talpha = Talpha.cuda(gpu, non_blocking=True)
            Tfseg = Tfseg.cuda(gpu, non_blocking=True)
            optim_g.zero_grad()
            # NOTE(review): the pasted source lost its indentation; the loss
            # terms are assumed to sit inside the autocast region - confirm.
            with torch.cuda.amp.autocast():
                lastpred = segmodel(mgt, Tfseg)
                # Channel 0 of the prediction is gated by channel 1 of Tfseg
                # and offset by channel 2 (presumably the unknown and
                # foreground trimap regions - TODO confirm).
                alpha = lastpred[:, 0:1] * Tfseg[:, 1:2] + Tfseg[:, 2:3]
                lossm = mloss(alpha, Talpha)
                loss_alpha = l1loss(alpha, Talpha)
                loss_i = weighted_loss(alpha, Talpha, tri=Tfseg[:, 1:2])
                loss = loss_alpha * 0.5 + loss_i * 0.5 + lossm * 0.5
            scaler.scale(loss).backward()
            # Unscale first so the clipping threshold applies to true gradients.
            scaler.unscale_(optim_g)
            torch.nn.utils.clip_grad_norm_(segmodel.parameters(), 10.)
            scaler.step(optim_g)
            scaler.update()

            step += 1
            L += loss.item()
            L_alpha1 += loss_alpha.item()
            L_alpha2 += lossm.item()
            if step % 100 == 0:
                print('Epoch', epoch, 'Total_Los', L / 100.,'Alpha1Loss',L_alpha1/100,'Alpha2Loss',L_alpha2/100)
                step = 0
                L = 0
                L_alpha1 = 0
                L_alpha2 = 0
        # NOTE(review): checkpointing and the scheduler step are assumed to run
        # once per epoch (indentation was lost in the paste) - confirm.
        if gpu == 0 and epoch > 25:
            torch.save(segmodel.module.state_dict(),
                       './ckpt/' + str(epoch) + '_' + str(gpu) + 'aem.ckpt')
        sl.step()
if __name__ == '__main__':
    # Script entry point: read the command-line flags, optionally pin the
    # random seeds, then start training in this process.
    args = parser.parse_args()

    if args.seed is not None:
        # Seeding also switches cuDNN to deterministic mode; the warning
        # below tells the user about the resulting slowdown.
        cudnn.deterministic = True
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        seed_notice = ('You have chosen to seed training. '
                       'This will turn on the CUDNN deterministic setting, '
                       'which can slow down your training considerably! '
                       'You may see unexpected behavior when restarting '
                       'from checkpoints.')
        warnings.warn(seed_notice)

    # The --workers value is forwarded as the second positional argument.
    main_worker(args.local_rank, args.workers, args)
Thanks for your response! I wonder whether the learning rate should be adjusted according to the batch size. Also, what batch size and how many GPUs were used in the main paper?
As far as I remember, I multiplied the initial learning rate by the number of GPUs.
thanks a lot
Following the official settings, it takes me 45 days to train 300 epochs. Is something wrong?