kaylode / theseus

General template for most PyTorch projects

Classification losses error #26

Closed · lannguyen0910 closed this 2 years ago

lannguyen0910 commented 2 years ago

I encountered these errors when testing with SmoothCELoss and FocalLoss; the logs are below:

1. ```FocalLoss```
    
    [Errno 2] No such file or directory: 'main'
    /content/main
    2022-03-27 11:07:21 | DEBUG    | stdout_logger.py:log_text:34 - Overriding configuration...
    2022-03-27 11:07:21 | INFO     | stdout_logger.py:log_text:28 - {
    "global": {
        "debug": true,
        "cfg_transform": "configs/classification/transform.yaml",
        "save_dir": "/content/main/runs",
        "device": "cuda:0",
        "use_fp16": true,
        "pretrained": null,
        "resume": null
    },
    "trainer": {
        "name": "SupervisedTrainer",
        "args": {
            "num_iterations": 3000,
            "clip_grad": 10.0,
            "evaluate_interval": 1,
            "print_interval": 20,
            "save_interval": 500
        }
    },
    "model": {
        "name": "BaseTimmModel",
        "args": {
            "name": "convnext_small",
            "from_pretrained": true,
            "num_classes": 180
        }
    },
    "loss": {
        "name": "FocalLoss"
    },
    "callbacks": [
        {
            "name": "LoggerCallbacks",
            "args": null
        },
        {
            "name": "CheckpointCallbacks",
            "args": {
                "best_key": "bl_acc"
            }
        },
        {
            "name": "VisualizerCallbacks",
            "args": null
        },
        {
            "name": "TensorboardCallbacks",
            "args": null
        }
    ],
    "metrics": [
        {
            "name": "Accuracy",
            "args": null
        },
        {
            "name": "BalancedAccuracyMetric",
            "args": null
        },
        {
            "name": "F1ScoreMetric",
            "args": {
                "average": "weighted"
            }
        },
        {
            "name": "ConfusionMatrix",
            "args": null
        },
        {
            "name": "ErrorCases",
            "args": null
        }
    ],
    "optimizer": {
        "name": "AdamW",
        "args": {
            "lr": 0.001,
            "weight_decay": 0.0005,
            "betas": [
                0.937,
                0.999
            ]
        }
    },
    "scheduler": {
        "name": "SchedulerWrapper",
        "args": {
            "scheduler_name": "cosine2",
            "t_initial": 7,
            "t_mul": 0.9,
            "eta_mul": 0.9,
            "eta_min": 1e-06
        }
    },
    "data": {
        "dataset": {
            "train": {
                "name": "ImageFolderDataset",
                "args": {
                    "image_dir": "/content/main/data/food-classification/train",
                    "txt_classnames": "configs/classification/classes.txt"
                }
            },
            "val": {
                "name": "ImageFolderDataset",
                "args": {
                    "image_dir": "/content/main/data/food-classification/val",
                    "txt_classnames": "configs/classification/classes.txt"
                }
            }
        },
        "dataloader": {
            "train": {
                "name": "DataLoaderWithCollator",
                "args": {
                    "batch_size": 32,
                    "drop_last": true,
                    "shuffle": false,
                    "collate_fn": {
                        "name": "MixupCutmixCollator",
                        "args": {
                            "mixup_alpha": 0.4,
                            "cutmix_alpha": 1.0,
                            "weight": [
                                0.2,
                                0.2
                            ]
                        }
                    },
                    "sampler": {
                        "name": "BalanceSampler",
                        "args": null
                    }
                }
            },
            "val": {
                "name": "DataLoaderWithCollator",
                "args": {
                    "batch_size": 32,
                    "drop_last": false,
                    "shuffle": true
                }
            }
        }
    }
    }
    2022-03-27 11:07:21 | DEBUG    | stdout_logger.py:log_text:34 - Loading config from configs/classification/transform.yaml...
    2022-03-27 11:07:21 | DEBUG    | stdout_logger.py:log_text:34 - Calculating class distribution...
    Downloading: "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth" to /root/.cache/torch/hub/checkpoints/convnext_small_1k_224_ema.pth
    2022-03-27 11:07:46 | INFO     | stdout_logger.py:log_text:28 - Number of trainable parameters: 49,593,108
    2022-03-27 11:07:46 | INFO     | stdout_logger.py:log_text:28 - Using CUDA:0 (Tesla T4, 15109.75MB)

    2022-03-27 11:07:46 | INFO     | stdout_logger.py:log_text:28 - Number of training samples: 88814
    2022-03-27 11:07:46 | INFO     | stdout_logger.py:log_text:28 - Number of validation samples: 21775
    2022-03-27 11:07:46 | INFO     | stdout_logger.py:log_text:28 - Number of training iterations each epoch: 2775
    2022-03-27 11:07:46 | INFO     | stdout_logger.py:log_text:28 - Number of validation iterations each epoch: 681
    2022-03-27 11:07:46 | INFO     | stdout_logger.py:log_text:28 - Everything will be saved to /content/main/runs/2022-03-27_11-07-21
    2022-03-27 11:07:46 | DEBUG    | stdout_logger.py:log_text:34 - Saving config to /content/main/runs/2022-03-27_11-07-21/pipeline.yaml...
    2022-03-27 11:07:46 | DEBUG    | stdout_logger.py:log_text:34 - Saving config to /content/main/runs/2022-03-27_11-07-21/transform.yaml...
    2022-03-27 11:07:46 | DEBUG    | stdout_logger.py:log_text:34 - Start sanity checks
    2022-03-27 11:07:47 | DEBUG    | stdout_logger.py:log_text:34 - Visualizing architecture...
    2022-03-27 11:07:50 | INFO     | stdout_logger.py:log_text:28 - =============================EVALUATION===================================
    100% 681/681 [04:04<00:00,  2.78it/s]
    2022-03-27 11:11:56 | INFO     | stdout_logger.py:log_text:28 - [0|3000] || L: 0.13242 || Time:     2.7617 (it/s)
    2022-03-27 11:11:56 | INFO     | stdout_logger.py:log_text:28 - acc: 0.00455 | bl_acc: 0.00411 | weighted-f1: 0.00332 |

    2022-03-27 11:11:56 | INFO     | stdout_logger.py:log_text:28 - ==========================================================================
    2022-03-27 11:11:57 | DEBUG    | stdout_logger.py:log_text:34 - Visualizing model predictions...
    2022-03-27 11:11:59 | DEBUG    | stdout_logger.py:log_text:34 - Visualizing dataset...
    2022-03-27 11:12:01 | DEBUG    | stdout_logger.py:log_text:34 - Analyzing datasets...
    100% 88814/88814 [12:01<00:00, 123.05it/s]
    100% 21775/21775 [02:12<00:00, 163.82it/s]
    2022-03-27 11:26:17 | INFO     | stdout_logger.py:log_text:28 - ===========================START TRAINING=================================
    Traceback (most recent call last):
      File "/content/main/configs/classification/train.py", line 10, in <module>
        train_pipeline.fit()
      File "/content/main/theseus/classification/pipeline.py", line 171, in fit
        self.trainer.fit()
      File "/content/main/theseus/base/trainer/base_trainer.py", line 65, in fit
        self.training_epoch()
      File "/content/main/theseus/base/trainer/supervised_trainer.py", line 68, in training_epoch
        outputs = self.model.training_step(batch)
      File "/content/main/theseus/classification/models/wrapper.py", line 34, in training_step
        return self.forward(batch)
      File "/content/main/theseus/classification/models/wrapper.py", line 22, in forward
        loss, loss_dict = self.criterion(outputs, batch, self.device)
      File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
        return forward_call(*input, **kwargs)
      File "/content/main/theseus/classification/losses/focal_loss.py", line 21, in forward
        targets = nn.functional.one_hot(targets, num_classes=num_classes)
    RuntimeError: one_hot is only applicable to index tensor.


2. ```SmoothCELoss```
```python
[Errno 2] No such file or directory: 'main'
/content/main
2022-03-27 11:48:37 | DEBUG    | stdout_logger.py:log_text:34 - Overriding configuration...
2022-03-27 11:48:37 | INFO     | stdout_logger.py:log_text:28 - {
    "global": {
        "debug": true,
        "cfg_transform": "configs/classification/transform.yaml",
        "save_dir": "/content/main/runs",
        "device": "cuda:0",
        "use_fp16": true,
        "pretrained": null,
        "resume": null
    },
    "trainer": {
        "name": "SupervisedTrainer",
        "args": {
            "num_iterations": 3000,
            "clip_grad": 10.0,
            "evaluate_interval": 1,
            "print_interval": 20,
            "save_interval": 500
        }
    },
    "model": {
        "name": "BaseTimmModel",
        "args": {
            "name": "convnext_small",
            "from_pretrained": true,
            "num_classes": 180
        }
    },
    "loss": {
        "name": "SmoothCELoss"
    },
    "callbacks": [
        {
            "name": "LoggerCallbacks",
            "args": null
        },
        {
            "name": "CheckpointCallbacks",
            "args": {
                "best_key": "bl_acc"
            }
        },
        {
            "name": "VisualizerCallbacks",
            "args": null
        },
        {
            "name": "TensorboardCallbacks",
            "args": null
        }
    ],
    "metrics": [
        {
            "name": "Accuracy",
            "args": null
        },
        {
            "name": "BalancedAccuracyMetric",
            "args": null
        },
        {
            "name": "F1ScoreMetric",
            "args": {
                "average": "weighted"
            }
        },
        {
            "name": "ConfusionMatrix",
            "args": null
        },
        {
            "name": "ErrorCases",
            "args": null
        }
    ],
    "optimizer": {
        "name": "AdamW",
        "args": {
            "lr": 0.001,
            "weight_decay": 0.0005,
            "betas": [
                0.937,
                0.999
            ]
        }
    },
    "scheduler": {
        "name": "SchedulerWrapper",
        "args": {
            "scheduler_name": "cosine2",
            "t_initial": 7,
            "t_mul": 0.9,
            "eta_mul": 0.9,
            "eta_min": 1e-06
        }
    },
    "data": {
        "dataset": {
            "train": {
                "name": "ImageFolderDataset",
                "args": {
                    "image_dir": "/content/main/data/food-classification/train",
                    "txt_classnames": "configs/classification/classes.txt"
                }
            },
            "val": {
                "name": "ImageFolderDataset",
                "args": {
                    "image_dir": "/content/main/data/food-classification/val",
                    "txt_classnames": "configs/classification/classes.txt"
                }
            }
        },
        "dataloader": {
            "train": {
                "name": "DataLoaderWithCollator",
                "args": {
                    "batch_size": 32,
                    "drop_last": true,
                    "shuffle": false,
                    "collate_fn": {
                        "name": "MixupCutmixCollator",
                        "args": {
                            "mixup_alpha": 0.4,
                            "cutmix_alpha": 1.0,
                            "weight": [
                                0.2,
                                0.2
                            ]
                        }
                    },
                    "sampler": {
                        "name": "BalanceSampler",
                        "args": null
                    }
                }
            },
            "val": {
                "name": "DataLoaderWithCollator",
                "args": {
                    "batch_size": 32,
                    "drop_last": false,
                    "shuffle": true
                }
            }
        }
    }
}
2022-03-27 11:48:37 | DEBUG    | stdout_logger.py:log_text:34 - Loading config from configs/classification/transform.yaml...
2022-03-27 11:48:37 | DEBUG    | stdout_logger.py:log_text:34 - Calculating class distribution...
2022-03-27 11:48:43 | INFO     | stdout_logger.py:log_text:28 - Number of trainable parameters: 49,593,108
2022-03-27 11:48:43 | INFO     | stdout_logger.py:log_text:28 - Using CUDA:0 (Tesla T4, 15109.75MB)

2022-03-27 11:48:43 | INFO     | stdout_logger.py:log_text:28 - Number of training samples: 88814
2022-03-27 11:48:43 | INFO     | stdout_logger.py:log_text:28 - Number of validation samples: 21775
2022-03-27 11:48:43 | INFO     | stdout_logger.py:log_text:28 - Number of training iterations each epoch: 2775
2022-03-27 11:48:43 | INFO     | stdout_logger.py:log_text:28 - Number of validation iterations each epoch: 681
2022-03-27 11:48:43 | INFO     | stdout_logger.py:log_text:28 - Everything will be saved to /content/main/runs/2022-03-27_11-48-37
2022-03-27 11:48:43 | DEBUG    | stdout_logger.py:log_text:34 - Saving config to /content/main/runs/2022-03-27_11-48-37/pipeline.yaml...
2022-03-27 11:48:43 | DEBUG    | stdout_logger.py:log_text:34 - Saving config to /content/main/runs/2022-03-27_11-48-37/transform.yaml...
2022-03-27 11:48:43 | DEBUG    | stdout_logger.py:log_text:34 - Start sanity checks
2022-03-27 11:48:44 | DEBUG    | stdout_logger.py:log_text:34 - Visualizing architecture...
2022-03-27 11:48:47 | INFO     | stdout_logger.py:log_text:28 - =============================EVALUATION===================================
100% 681/681 [04:04<00:00,  2.78it/s]
2022-03-27 11:52:53 | INFO     | stdout_logger.py:log_text:28 - [0|3000] || CE: 5.19444 || Time:     2.7645 (it/s)
2022-03-27 11:52:53 | INFO     | stdout_logger.py:log_text:28 - acc: 0.00822 | bl_acc: 0.00766 | weighted-f1: 0.00479 | 

2022-03-27 11:52:53 | INFO     | stdout_logger.py:log_text:28 - ==========================================================================
2022-03-27 11:52:54 | DEBUG    | stdout_logger.py:log_text:34 - Visualizing model predictions...
2022-03-27 11:52:56 | DEBUG    | stdout_logger.py:log_text:34 - Visualizing dataset...
2022-03-27 11:52:58 | DEBUG    | stdout_logger.py:log_text:34 - Analyzing datasets...
100% 88814/88814 [12:02<00:00, 122.99it/s]
100% 21775/21775 [02:13<00:00, 163.64it/s]
2022-03-27 12:07:15 | INFO     | stdout_logger.py:log_text:28 - ===========================START TRAINING=================================
Traceback (most recent call last):
  File "/content/main/configs/classification/train.py", line 10, in <module>
    train_pipeline.fit()
  File "/content/main/theseus/classification/pipeline.py", line 171, in fit
    self.trainer.fit()
  File "/content/main/theseus/base/trainer/base_trainer.py", line 65, in fit
    self.training_epoch()
  File "/content/main/theseus/base/trainer/supervised_trainer.py", line 68, in training_epoch
    outputs = self.model.training_step(batch)
  File "/content/main/theseus/classification/models/wrapper.py", line 34, in training_step
    return self.forward(batch)
  File "/content/main/theseus/classification/models/wrapper.py", line 22, in forward
    loss, loss_dict = self.criterion(outputs, batch, self.device)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/main/theseus/classification/losses/ce_loss.py", line 37, in forward
    loss = self.criterion(pred, target.view(-1).contiguous())
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/timm/loss/cross_entropy.py", line 22, in forward
    nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
RuntimeError: gather(): Expected dtype int64 for index
```

I guess it's an error from the MixupCutmix collator, something related to torch.int64.
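
For reference, both tracebacks come down to PyTorch rejecting a float tensor where it expects integer class indices. A minimal sketch reproducing the two errors outside the pipeline (shapes and the 180-class count are illustrative):

```python
import torch
import torch.nn as nn

hard = torch.tensor([3, 7])                   # index labels, dtype torch.int64
nn.functional.one_hot(hard, num_classes=180)  # OK: shape (2, 180)

soft = torch.rand(2, 180)                     # mixup-style soft targets, dtype torch.float32
logprobs = torch.log_softmax(torch.randn(2, 180), dim=-1)

# Each of these raises the corresponding error from the logs above:
nn.functional.one_hot(soft, num_classes=180)  # RuntimeError: one_hot is only applicable to index tensor.
logprobs.gather(dim=-1, index=soft)           # RuntimeError: gather(): Expected dtype int64 for index
```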

Here's the link to the notebook I've used for testing, in case you want to have a look: notebook

kaylode commented 2 years ago

OK, I've analyzed this error. It happens because MixupCutmix occasionally transforms the targets tensor into a soft one-hot encoding, whereas the current FocalLoss and SmoothCE require hard (argmax) labels.
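
To see what that means for the labels, here is a minimal sketch (the mixing ratio 0.4 and the 10-class count are arbitrary): after mixup, two integer labels become one float vector over all classes, so the target's shape matches the model output instead of being a 1-D index tensor.

```python
import torch
import torch.nn.functional as F

lam = 0.4  # illustrative mixing ratio
y_a = F.one_hot(torch.tensor([3]), num_classes=10).float()
y_b = F.one_hot(torch.tensor([7]), num_classes=10).float()

soft_target = lam * y_a + (1 - lam) * y_b  # shape (1, 10), dtype float32
# soft_target[0, 3] == 0.4 and soft_target[0, 7] == 0.6: no longer index labels,
# which is why comparing outputs.shape with targets.shape can detect this case.
```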

For FocalLoss, it can be fixed easily like this (imports and an illustrative ```__init__``` are filled in here; ```move_to``` is theseus's device-transfer utility):


```python
from typing import Any, Dict

import torch
import torch.nn as nn
from torchvision.ops import sigmoid_focal_loss  # assuming torchvision's implementation here

# move_to is theseus's utility that moves (nested) tensors onto a device; import omitted here.

class FocalLoss(nn.Module):
    def __init__(self, alpha: float = 0.25, gamma: float = 2.0, reduction: str = 'mean'):
        super().__init__()
        # defaults above are illustrative, not necessarily the repo's
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, outputs: Dict[str, Any], batch: Dict[str, Any], device: torch.device):
        outputs = outputs['outputs']
        targets = move_to(batch['targets'], device)
        num_classes = outputs.shape[-1]

        # Mixup/cutmix targets are already (soft) one-hot; only convert index labels
        if outputs.shape != targets.shape:
            targets = nn.functional.one_hot(targets, num_classes=num_classes)
            targets = targets.float().squeeze()

        loss = sigmoid_focal_loss(outputs, targets, self.alpha, self.gamma, self.reduction)
        loss_dict = {"L": loss.item()}
        return loss, loss_dict
```
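
One note on the ```squeeze()``` (my reading of the snippet): if the targets arrive with a trailing singleton dimension, ```one_hot``` appends the class axis after it, and ```squeeze()``` collapses the result back to (N, num_classes).

```python
import torch

t = torch.tensor([[3], [7]])                         # shape (2, 1) index labels
oh = torch.nn.functional.one_hot(t, num_classes=10)  # shape (2, 1, 10)
print(oh.float().squeeze().shape)                    # torch.Size([2, 10])
```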

For SmoothCE, timm has a ```SoftTargetCrossEntropy``` class which is suitable for soft one-hot targets (imports are filled in here as well):

```python
from typing import Any, Dict

import torch
import torch.nn as nn
from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy

# move_to is theseus's utility that moves tensors onto a device; import omitted here.

class SmoothCELoss(nn.Module):
    def __init__(self, smoothing: float = 0.1, **kwargs):
        super().__init__()
        # forward the smoothing factor to the criterion
        self.smooth_criterion = LabelSmoothingCrossEntropy(smoothing=smoothing)
        self.soft_criterion = SoftTargetCrossEntropy()

    def forward(self, outputs: Dict[str, Any], batch: Dict[str, Any], device: torch.device):
        pred = outputs['outputs']
        target = move_to(batch["targets"], device)

        # Soft (mixup/cutmix) targets share the prediction's shape; index labels do not
        if pred.shape == target.shape:
            loss = self.soft_criterion(pred, target)
        else:
            loss = self.smooth_criterion(pred, target.view(-1).contiguous())
        loss_dict = {"CE": loss.item()}
        return loss, loss_dict
```
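
A quick smoke test of the shape-based dispatch, with a hypothetical ```move_to``` stand-in (the real utility lives in theseus) and arbitrary shapes:

```python
def move_to(x, device):  # minimal stand-in: theseus's version also handles nested containers
    return x.to(device)

criterion = SmoothCELoss()
logits = torch.randn(4, 180)

hard = {"targets": torch.randint(0, 180, (4,))}     # index labels  -> label-smoothing branch
soft = {"targets": torch.rand(4, 180).softmax(-1)}  # soft targets  -> soft-target branch

for batch in (hard, soft):
    loss, loss_dict = criterion({"outputs": logits}, batch, torch.device("cpu"))
    print(loss_dict)  # {"CE": ...} from each branch
```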

I will make a PR and you can test it out.