dr-aheydari / SoftAdapt

Implementation of the SoftAdapt paper (techniques for adaptive loss balancing of multi-tasking neural networks)
MIT License

OverflowError: int too large to convert to float #7

Open myluckytimes opened 5 months ago

myluckytimes commented 5 months ago
def train(loader, model, optimizer, total_epochs):
    model.train()
    global bce_loss_lst, iou_loss_lst, ssim_loss_lst, adapt_weights
    step = 0
    for current_epoch in range(total_epochs):
        pbar = tqdm(loader, desc=f"Epoch {current_epoch + 1}/{total_epochs}", unit="batch")
        for images, gts in pbar:
            optimizer.zero_grad()
            images = images.to(device)
            gts = gts.to(device)

            d0, d1, d2, d3, d4, d5, d6, d7 = model(images)
            outputs = [d7, d6, d5, d4, d3, d2, d1, d0]

            total_bce_loss = 0
            total_iou_loss = 0
            total_ssim_loss = 0

            for output in outputs:
                bce_loss_value = bce_loss(output, gts)
                iou_loss_value = iou_loss(output, gts)
                ssim_loss_value = ssim_loss(output, gts)

                total_bce_loss += bce_loss_value
                total_iou_loss += iou_loss_value
                total_ssim_loss += ssim_loss_value

            # Record the accumulated losses (appended every batch)
            bce_loss_lst.append(total_bce_loss.item())
            iou_loss_lst.append(total_iou_loss.item())
            ssim_loss_lst.append(total_ssim_loss.item())

            # Make sure the number of loss points is at least accuracy_order
            if current_epoch % epochs_to_make_updates == 0 and current_epoch != 0:
                adapt_weights = softadapt.get_component_weights(
                    torch.tensor(bce_loss_lst),
                    torch.tensor(iou_loss_lst),
                    torch.tensor(ssim_loss_lst),
                    verbose=False
                ).to(device)

                # Reset the loss lists
                bce_loss_lst = []
                iou_loss_lst = []
                ssim_loss_lst = []

            # Linearly combine the losses
            total_loss = (adapt_weights[0] * total_bce_loss +
                          adapt_weights[1] * total_iou_loss +
                          adapt_weights[2] * total_ssim_loss)

            wandb.log({
                "bce_weight": adapt_weights[0],
                "iou_weight": adapt_weights[1],
                "ssim_weight": adapt_weights[2],
                "loss": total_loss
            })

            total_loss.backward()
            optimizer.step()
            pbar.update(1)
            pbar.set_postfix(Loss=total_loss.item())
            step += 1

        save_path = './saved_models/'
        os.makedirs(save_path, exist_ok=True)
        if (current_epoch + 1) % 5 == 0:
            torch.save(model.state_dict(), save_path + f'{opt.dataset}_BASNet_softadapt2.pth.{current_epoch + 1}')

Traceback (most recent call last):
  File "/home/lsa/Shared/Code/BASNet/train_softadapt_1.py", line 144, in <module>
    train(train_loader, model, optimizer, opt.epoch)
  File "/home/lsa/Shared/Code/BASNet/train_softadapt_1.py", line 92, in train
    adapt_weights = softadapt.get_component_weights(
  File "/home/lsa/.conda/envs/py310_torch222/lib/python3.10/site-packages/softadapt/algorithms/loss_weighted_variant.py", line 65, in get_component_weights
    self._compute_rates_of_change(loss_points,
  File "/home/lsa/.conda/envs/py310_torch222/lib/python3.10/site-packages/softadapt/base/_softadapt_base_class.py", line 81, in _compute_rates_of_change
    return _get_finite_difference(input_array = input_tensor.numpy(),
  File "/home/lsa/.conda/envs/py310_torch222/lib/python3.10/site-packages/softadapt/utilities/_finite_difference.py", line 70, in _get_finite_difference
    constants = coefficients(deriv=1, acc=order)["forward"]["coefficients"]
  File "/home/lsa/.conda/envs/py310_torch222/lib/python3.10/site-packages/findiff/coefs.py", line 72, in coefficients
    ret["center"] = calc_coefs(deriv, offsets, symbolic, analytic_inv)
  File "/home/lsa/.conda/envs/py310_torch222/lib/python3.10/site-packages/findiff/coefs.py", line 146, in calc_coefs
    matrix = _build_matrix(offsets, symbolic)
  File "/home/lsa/.conda/envs/py310_torch222/lib/python3.10/site-packages/findiff/coefs.py", line 237, in _build_matrix
    return np.array(A,dtype='float')
OverflowError: int too large to convert to float

Could you help me to solve this matter?
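
In case it is useful: my current guess (not yet confirmed against the library code) is that the problem comes from how long my loss lists get. They are appended every batch but only cleared every `epochs_to_make_updates` epochs, and the traceback ends in findiff building finite-difference coefficients, so if the accuracy order scales with the number of loss points passed in, findiff would be asked for an enormous stencil whose integer entries no longer fit in a float. Below is a minimal sketch of the workaround I am trying, dropped into the training loop above in place of the existing `get_component_weights` call; the `N_POINTS` cap is my own addition, not part of SoftAdapt's API:

                # Sketch: pass only a short, fixed-size tail of the loss history
                # to SoftAdapt so the finite-difference order stays small.
                # N_POINTS is a hypothetical cap, not a SoftAdapt parameter.
                N_POINTS = 10
                adapt_weights = softadapt.get_component_weights(
                    torch.tensor(bce_loss_lst[-N_POINTS:]),
                    torch.tensor(iou_loss_lst[-N_POINTS:]),
                    torch.tensor(ssim_loss_lst[-N_POINTS:]),
                    verbose=False
                ).to(device)

With a window this small, the coefficient matrix findiff builds stays tiny, so if the guess above is right, the OverflowError should not be triggered.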

madarax64 commented 4 months ago

Hi @dr-aheydari, I have the exact same error with the same call stack in findiff...

viet2411 commented 3 weeks ago

Hi @dr-aheydari @myluckytimes, has anyone found a way to fix this error yet?