devzhk / cgds-package

Package for CGD and ACGD optimizers
https://pypi.org/project/CGDs
MIT License

bug, #3

Open zr-bee opened 1 year ago

zr-bee commented 1 year ago

Hi, I'm running into a problem.

[screenshot of the error]

Could you help me? Here is my code.

This is my optimizer definition. I'm not sure whether there is a problem with it.

```python
class Network(nn.Module):

    def __init__(self, criterion, cfg, **kwargs):
        super(Network, self).__init__()
        self.HRNet = HighResolutionNet(cfg, **kwargs)
        # Initialize from pretrained weights for testing (already trained);
        # comment this out to train from scratch.
        self.HRNet.init_weights('/mnt/mountA/dzr/segcgd/pretrain/hrnet_w48_pascal_context_cls59_480x480.pth')
        self.CoutHR = self.HRNet.last_inp_channels

        self.dishsi = DisHSI()

        self.respG = RespG()   # response function; the returned response spectrum is already multiplied with the input MSI
        self.hsiG = HSIG()     # reconstruction network; these two form the generator and I want to merge them into one
        # self.RgbG = RGBG()
        self.iter = 1

        self.extractor = Fea_extra(self.CoutHR + 31, cfg.DATASET.NUM_CLASSES)  # feature extractor; not relevant to the CGD optimizer

        # Merge the generators to make training easier
        self.generator = nn.Sequential(
            self.respG,
            self.hsiG,
        )

        # The CGD optimizer; it treats everything as a whole, which is a bit inconvenient
        rank = torch.distributed.get_rank()
        self.g = DDP(self.generator.cuda(), device_ids=[rank], broadcast_buffers=False)
        self.d = DDP(self.dishsi.cuda(), device_ids=[rank], broadcast_buffers=False)
        g_reducer = self.g.reducer
        d_reducer = self.d.reducer

        self.cgd_optimizer = CGDs.ACGD(max_params=self.g.parameters(),
                                       min_params=self.d.parameters(),
                                       lr_max=1e-3, lr_min=1e-3,
                                       max_reducer=g_reducer, min_reducer=d_reducer,
                                       tol=1e-4, atol=1e-8)
        # self.cgd_optimizer = CGDs.ACGD(max_params=itertools.chain(self.hsiG.parameters(), self.respG.parameters()),
        #                                min_params=self.dishsi.parameters(),
        #                                lr_max=1e-3, lr_min=1e-3,
        #                                # max_reducer=g_reducer, min_reducer=d_reducer,
        #                                tol=1e-4, atol=1e-8)

        # Separately defined optimizers
        # self.gen_optimizer = torch.optim.Adam(itertools.chain(self.hsiG.parameters(), self.respG.parameters(), self.extractor.parameters()), lr=1e-4)
        # self.gen_optimizer = torch.optim.Adam(itertools.chain(self.hsiG.parameters(), self.respG.parameters()), lr=1e-4)
        # self.gen_optimizer = torch.optim.Adam(itertools.chain(self.hsiG.parameters(), self.respG.parameters()), lr=1e-4)   # the generator is too weak (option 1)
        # # self.dis_optimizer = torch.optim.Adam(itertools.chain(self.dishsi.parameters(), self.dishsi_line.parameters()), lr=1e-3)
        # self.dis_optimizer = torch.optim.Adam(itertools.chain(self.dishsi.parameters()), lr=1e-6)   # the discriminator is too strong, lr = 5e-4 (option 2)

        self.criterion_class = criterion
        # self.criterion = nn.BCELoss()
        self.criterion = nn.BCEWithLogitsLoss()  # changed
        self.mean = np.array([0.485, 0.456, 0.406])
        self.std = np.array([0.229, 0.224, 0.225])

        self.gradnorm = GradNorm().apply
```
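For comparison, my understanding of the plain single-process setup (no DDP, no reducer arguments; `G` and `D` below are just placeholder toy modules, not my real networks) is roughly:

```python
# Placeholder toy modules standing in for the merged generator and the
# discriminator; single-process, no DDP, no reducer arguments.
import torch.nn as nn
import CGDs

G = nn.Linear(8, 8)
D = nn.Linear(8, 1)

cgd_optimizer = CGDs.ACGD(max_params=G.parameters(),
                          min_params=D.parameters(),
                          lr_max=1e-3, lr_min=1e-3,
                          tol=1e-4, atol=1e-8)
```

What I'm mainly unsure about is whether wrapping the networks in DDP and passing `max_reducer`/`min_reducer` as above is the right way to use ACGD with DistributedDataParallel.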

This is the code for the training loop.

```python
def train(config, epoch, num_epoch, epoch_iters, base_lr, num_iters,
          trainloader, optimizer, model, writer_dict, device, Logger=None):
    # Training
    model.train()
    # model.HRNet.eval()
    # model.hsiG.eval()
    # model.respG.eval()

    batch_time = AverageMeter()
    ave_loss = AverageMeter()
    tic = time.time()
    cur_iters = epoch * epoch_iters
    writer = writer_dict['writer']
    global_steps = writer_dict['train_global_steps']
    rank = get_rank()
    world_size = get_world_size()

    # # Merge the generators to make training easier
    # generator = nn.Sequential(
    #     model.module.respG,
    #     model.module.hsiG,
    # )
    # cgd_optimizer = CGDs.ACGD(max_params=generator.parameters(),
    #                           min_params=model.module.dishsi.parameters(),
    #                           lr_max=1e-3, lr_min=1e-3,
    #                           # max_reducer=g_reducer, min_reducer=d_reducer,
    #                           tol=1e-4, atol=1e-8)

    loss_g = []
    loss_d = []
    GP_List = []
    smooth_List = []
    res_List = []
    # counters
    gen_train_count = 0
    dis_train_count = 0
    for i_iter, batch in enumerate(trainloader):
        images, labels, _, _, MSI, HSI = batch  # unpack the items in one batch
        # images, labels, _, _, MSI = batch
        model.zero_grad()                       # zero the gradients
        model.module.cgd_optimizer.zero_grad()
        # cgd_optimizer.zero_grad()

        images = images.to(device).float()   # move the (RGB) images to the GPU and cast to float
        MSI = MSI.to(device).float()         # move the multispectral images to the GPU and cast to float
        labels = labels.long().to(device)    # move the labels to the GPU and cast to long
        HSI = HSI.to(device).float()         # likewise for the hyperspectral data

        loss, _, _ = model(images, MSI, labels)
        loss_d_, GP_loss = model.module.update_discriminator(MSI=MSI, HSI=HSI, rank=0)  # multi-GPU, so .module was added; update the discriminator
        loss_g_, smooth_loss, res_loss, gen_loss = model.module.update_generator(MSI=MSI, HSI=HSI, img=images, rank=0, seg_label=labels)  # update the generator loss
        # loss = np.mean(np.array(loss_g_))

        model.module.cgd_optimizer.step(loss_d_)   # take an update step
        # cgd_optimizer.step(loss_d_)

        loss_d_ = loss_d_.item()
        loss_g.append(loss_g_.item() - gen_loss.item())
        loss_d.append(loss_d_)
        GP_List.append(GP_loss)
        smooth_List.append(smooth_loss)
        res_List.append(res_loss)

        optimizer.step()  # update step of the SGD optimizer

        # measure elapsed time
        batch_time.update(time.time() - tic)
        tic = time.time()

        # update average loss
        ave_loss.update(loss_g_.item() - gen_loss.item())

        # gen_lr = adjust_learning_rate(model.gen_optimizer, 1e-4, num_iters, 0)

        dis_lr = adjust_learning_rate(optimizer,
                                      base_lr,
                                      num_iters,
                                      i_iter + cur_iters)
```
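As I understand it, ACGD expects a single scalar objective per step (the `min_params` side minimizes it and the `max_params` side maximizes it), so a bare-bones iteration would look roughly like this (tiny placeholder modules and a made-up objective, not my real losses):

```python
# Bare-bones iteration as I understand it: one scalar loss per step,
# minimized by min_params (D) and maximized by max_params (G).
import torch
import torch.nn as nn
import CGDs

G, D = nn.Linear(4, 4), nn.Linear(4, 1)
opt = CGDs.ACGD(max_params=G.parameters(), min_params=D.parameters(),
                lr_max=1e-3, lr_min=1e-3, tol=1e-4, atol=1e-8)

x = torch.randn(2, 4)
opt.zero_grad()
loss = D(G(x)).mean()   # single made-up min-max objective
opt.step(loss)
```

In my loop above I compute separate generator and discriminator losses, pass only `loss_d_` to `cgd_optimizer.step()`, and additionally call the SGD `optimizer.step()`, so I may be mixing things up here.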

zr-bee commented 1 year ago

I suspect there is something wrong with these two loss functions, but I'm not sure.

```python
def Cal_generatorloss(self, MSI, HSI, rank):
    real_label1 = 1.0
    fake_label1 = 0.0
    one = torch.FloatTensor([1]).to(MSI.device)
    mone = one * -1
    mone.to(one.device)
    b_size, c, h, w = HSI.shape   # get the dimensions
    # length =
    real_label = torch.full((b_size,), real_label1, dtype=torch.float, device=HSI.device)
    fake_label = torch.full((b_size,), fake_label1, dtype=torch.float, device=HSI.device)
    real_label2 = torch.full((b_size * h * w,), real_label1, dtype=torch.float, device=HSI.device)
    fake_label2 = torch.full((b_size * h * w,), fake_label1, dtype=torch.float, device=HSI.device)

    # MSI --> Resp
    resp_msi = self.respG(MSI)   # multispectral data passed through the response function

    # MSI + resp --> HSI
    fake_HSI, res_1 = self.hsiG(MSI, resp_msi)  # combined with the reconstruction network; returns res_1 (a residual) and the fake hyperspectral image

    res_loss = torch.mean(res_1**2)
    # res_loss = torch.mean(torch.abs(res_1))

    # domain loss (2) HSI loss:
    [B, C, H, W] = fake_HSI.shape

    fea = fake_HSI.reshape([B, C, H * W])
    pos = torch.abs(fake_HSI) - fake_HSI
    pos = torch.sum(pos, dim=[1, 2, 3]).mean()
    smooth = self.first_order(fea)
    smooth = torch.mean(smooth**2)
    # smooth = torch.mean(torch.abs(smooth))
    # max_value = torch.amax(fake_HSI, dim=[1, 2, 3], keepdim=True)
    mean_value = torch.mean(torch.abs(fake_HSI), dim=[1, 2, 3], keepdim=True)
    mean_value1 = mean_value.detach() + 1e-6
    HSI_pred = self.dishsi(self.gradnorm(fake_HSI, torch.ones(1, device=fake_HSI.device)) / mean_value1)
    HSI_pred = torch.squeeze(HSI_pred)
    HSI_dis_loss = self.criterion(HSI_pred, real_label)

    loss = HSI_dis_loss + res_loss * 1e2 + smooth * 1e1  # + pos*1e-2

    if self.iter % 20 == 0:
        if rank == 0:
            logging.info('[iter: {}/371][Gen loss:{:.4f}], [HSI adversarial loss:{:.4f}], [res loss:{:.4f}] [smooth loss:{:.4f}] [pos loss:{:.4f}]'.format(
                self.iter % 371, loss.item(), HSI_dis_loss.mean().item(), res_loss.mean().item(), smooth.mean().item(), pos.item()))
    return loss, fake_HSI, smooth.item(), res_loss.item()  # , MSI_pred, HSI_pred.mean().reshape(1)


def Cal_discriminatorloss(self, MSI, HSI, rank):
    real_label1 = 1.0
    fake_label1 = 0.0
    one = torch.FloatTensor([1]).to(MSI.device)
    mone = one * -1
    mone.to(one.device)
    b_size = HSI.size(0)

    # MSI --> Resp
    resp_msi = self.respG(MSI)

    # MSI + resp --> HSI
    fake_HSI, _ = self.hsiG(MSI, resp_msi)

    [B, C, H, W] = fake_HSI.shape
    length = int(B * W * H)
    real_label = torch.full((B,), real_label1, dtype=torch.float, device=HSI.device)
    fake_label = torch.full((B,), fake_label1, dtype=torch.float, device=HSI.device)
    real_label1 = torch.full((length,), real_label1, dtype=torch.float, device=HSI.device)
    fake_label1 = torch.full((length,), fake_label1, dtype=torch.float, device=HSI.device)

    # # domain loss (2) HSI loss:
    # max_value = torch.amax(fake_HSI, dim=[1, 2, 3], keepdim=True)
    # print('shape of the fakeHSI:{}'.format(fake_HSI.shape))
    mean_value = torch.mean(torch.abs(fake_HSI), dim=[1, 2, 3], keepdim=True)
    mean_value1 = mean_value.detach() + 1e-6
    fake_HSI_pred = self.dishsi(self.gradnorm(fake_HSI, torch.ones(1, device=fake_HSI.device)) / mean_value1)
    fake_HSI_pred = torch.squeeze(fake_HSI_pred)
    fake_HSI_loss = self.criterion(fake_HSI_pred, fake_label)

    # max_value = torch.amax(HSI, dim=[1, 2, 3], keepdim=True)
    mean_value = torch.mean(HSI, dim=[1, 2, 3], keepdim=True)
    mean_value2 = mean_value.detach() + 1e-6
    # print('shape of the real:{}'.format(HSI.shape))
    real_HSI_pred = self.dishsi(self.gradnorm(HSI, torch.ones(1, device=HSI.device)) / mean_value2)
    real_HSI_pred = torch.squeeze(real_HSI_pred)
    real_HSI_loss = self.criterion(real_HSI_pred, real_label)

    # GP_loss = self.calc_gradient_penalty(self.dishsi, HSI.detach()/mean_value2, fake_HSI.detach()/mean_value1, center=0, alpha=None, LAMBDA=10, device=real_HSI_pred.device)
    GP_loss = fake_HSI_loss * 0

    # GP_loss = 0
    # loss = fake_HSI_loss + real_HSI_loss  # + GP_loss  # changed
    loss = fake_HSI_loss + real_HSI_loss + GP_loss
    # loss = loss_MSI + fake_MSI_loss + real_MSI_loss + fake_HSIU_pred + HSIU_pred + loss_HSIU + fake_HSI_loss + real_HSI_loss
    self.iter += 1
    if self.iter % 20 == 0:
        if rank == 0:
            logging.info('[iter: {}/371][Dis loss:{:.4f}], [HSI fake loss:{:.2f}, real loss:{:.2f}, GP loss:{:.2f}]'.format(
                self.iter % 371, loss.item(), fake_HSI_loss.item(), real_HSI_loss.item(), GP_loss.item()))
    return loss, GP_loss   # changed


def update_generator(self, MSI, HSI, img, rank, seg_label):
    MSI1 = torch.clone(MSI)   # make a copy
    # img = self.MSI2img(MSI)
    [B, C, H, W] = HSI.shape  # resize to match HSI

    # spectral feature generation
    # self.gen_optimizer.zero_grad()    # zero the gradients
    MSI1 = torch.nn.functional.interpolate(MSI, size=(int(H), int(W)))   # resize the tensor
    gen_loss, fake_HSI, smooth, res_loss = self.Cal_generatorloss(MSI1, HSI, rank)  # generator loss, generated HSI, smoothness loss and residual loss

    # spatial feature generation
    with torch.no_grad():        # no gradients needed; computes the spatial features
        _, fea = self.HRNet(img)  # HRNet features of the RGB image for semantic segmentation
    [B, C, H, W] = fea.shape

    # spectral feature generation; gradnorm scales the gradient norm; resized to match fea
    fake_HSI = torch.nn.functional.interpolate(self.gradnorm(fake_HSI, torch.ones(1, device=fake_HSI.device) * 2), size=(int(H), int(W)))

    pred = self.extractor(Afea=fea, Efea=fake_HSI)    # fuse the two features to get an intermediate prediction

    loss = self.criterion_class(pred, seg_label)      # segmentation loss against the labels

    # every 400 iterations, save the current results to a .mat file
    if self.iter % 400 == 0:
        scio.savemat('./savefile/train_iter{}.mat'.format(self.iter), {'RGB': MSI.detach().cpu().numpy(),
                                                                       'GenHSI': fake_HSI.detach().cpu().numpy(),
                                                                       'HSI': HSI.detach().cpu().numpy()})

    return gen_loss + loss, smooth, res_loss, gen_loss


def update_discriminator(self, MSI, HSI, rank):
    # self.dis_optimizer.zero_grad()   # zero the gradients
    MSI = torch.nn.functional.interpolate(MSI, size=(int(128), int(128)))  # resize MSI to (128, 128)
    dis_loss, GP_loss = self.Cal_discriminatorloss(MSI, HSI, rank)  # discriminator loss and gradient-penalty loss
    # dis_loss.backward()    # backward pass for the discriminator loss
    # dis_loss1 = dis_loss.item()   # changed because CGD is used instead

    # self.dis_optimizer.step()
    return dis_loss, GP_loss.item()
```
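One thing I'm also double-checking is the shapes going into `nn.BCEWithLogitsLoss`, since it requires the prediction and the target to have the same shape; a tiny sanity check with toy shapes:

```python
# Toy sanity check for nn.BCEWithLogitsLoss: input and target must have
# the same shape, otherwise PyTorch raises a size-mismatch error.
import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()

pred = torch.randn(4)                 # e.g. one logit per sample after squeeze()
real_label = torch.full((4,), 1.0)
print(criterion(pred, real_label).item())   # OK: shapes match

per_pixel = torch.randn(4, 8, 8)      # e.g. a per-pixel discriminator output
# criterion(per_pixel, real_label)    # would fail: (4, 8, 8) vs (4,)
```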

devzhk commented 1 year ago

Hi! I've looked at the code above. Right now all sorts of things are mixed together, which makes it very hard to debug. Would it be possible to strip out the unnecessary parts and find the smallest code snippet that reproduces the error?
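Something along these lines would already be enough, built from toy modules and random data (placeholders only; adapt the constructor arguments to whatever you actually pass):

```python
# Minimal skeleton of a reproducer: toy modules and random data only.
import torch
import torch.nn as nn
import CGDs

device = 'cuda' if torch.cuda.is_available() else 'cpu'
G = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 16)).to(device)
D = nn.Linear(16, 1).to(device)

optimizer = CGDs.ACGD(max_params=G.parameters(), min_params=D.parameters(),
                      lr_max=1e-3, lr_min=1e-3, tol=1e-4, atol=1e-8)

criterion = nn.BCEWithLogitsLoss()
x = torch.randn(8, 16, device=device)
real = torch.randn(8, 16, device=device)

for step in range(3):
    optimizer.zero_grad()
    fake = G(x)
    # single scalar objective: D tries to tell real from fake, G opposes it
    loss = criterion(D(real), torch.ones(8, 1, device=device)) + \
           criterion(D(fake), torch.zeros(8, 1, device=device))
    optimizer.step(loss)
    print(step, loss.item())
```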

zr-bee commented 1 year ago

Is this optimizer definition correct? I'll try stripping out the unnecessary parts.

zr-bee commented 1 year ago

> Hi! I've looked at the code above. Right now all sorts of things are mixed together, which makes it very hard to debug. Would it be possible to strip out the unnecessary parts and find the smallest code snippet that reproduces the error?

Could I add you on WeChat?

devzhk commented 1 year ago

Sure. Posting a WeChat ID directly here isn't ideal; send it to my email, devzhk@gmail.com.