Closed: daeunni closed this issue 2 years ago.
Hi, thanks a lot for your nice work! By the way, I found that when I train ERFNet for lane detection (using the erfnet_baseline_culane.sh script), the model's iteration count changes every time it is trained. Below is my tensorboard; for the two models shown, only the dropout rate differs (0.1 vs. 0.3), while the other train & model parameters in the config file are the SAME. Yet their iteration counts are quite different.
What causes this difference between them during training? (I don't think the dropout rate can affect that result.)
@daeunni If the tensorboard log is properly refreshed and loaded, I think the longer one looks like a 50-epoch schedule, which is correct, while the shorter one looks like something around 12 epochs. Are you sure they both use the ep50_poly_warmup200 schedule and train.num_epochs=50?
@voldemortX Thanks a lot for your quick reply! First, the tensorboard log is properly refreshed. Also, I apologize: I just found a difference in the lr_scheduler config.
Their learning rates differ: 0.2 and 0.01 (sky blue = 0.2, purple = 0.01). However, note that the other parameters are the SAME (e.g., epochs and warmup).
Could a change in learning rate alone cause that situation?
> Could a change in learning rate alone cause that situation?
No, lr/dropout should not cause that situation. Do you have the corresponding *_cfg.json files in your exp_dir? They record the exact config parameters for each run.
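If it helps, here is a minimal sketch for comparing two such recorded configs and printing only the parameters that differ; the file paths below are hypothetical:

import json

# Hypothetical paths; point these at the *_cfg.json files in each exp_dir.
with open("./checkpoints/run_a/run_a_cfg.json") as f:
    cfg_a = json.load(f)
with open("./checkpoints/run_b/run_b_cfg.json") as f:
    cfg_b = json.load(f)

def print_diff(a, b, prefix=""):
    # Recursively walk both dicts; nested dicts are recursed into,
    # other values (including lists) are compared whole.
    for key in sorted(set(a) | set(b)):
        va, vb = a.get(key), b.get(key)
        if isinstance(va, dict) and isinstance(vb, dict):
            print_diff(va, vb, prefix + key + ".")
        elif va != vb:
            print(f"{prefix}{key}: {va!r} vs {vb!r}")

print_diff(cfg_a, cfg_b)

Run against the two configs below, this would surface differences such as optimizer.lr and train.batch_size at a glance.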
@voldemortX Yes, sure. The problem occurs with both the TuSimple and CULane configs. Here are my CULane configs (dataset.root below is my local path).
Short iteration case
{
"dataset": {
"name": "CULaneAsSegmentation",
"image_set": "train",
"root": "/workspaces\/datasets\/CULane" # my path
},
"train_augmentation": {
"name": "Compose",
"transforms": [
{
"name": "Resize",
"size_image": [
256,
768
],
"size_label": [
256,
768
]
},
{
"name": "RandomRotation",
"degrees": 1
},
{
"name": "ToTensor"
},
{
"name": "Normalize",
"mean": [
0.485,
0.456,
0.406
],
"std": [
0.229,
0.224,
0.225
],
"normalize_target": true
}
]
},
"test_augmentation": {
"name": "Compose",
"transforms": [
{
"name": "Resize",
"size_image": [
256,
768
],
"size_label": [
256,
768
]
},
{
"name": "ToTensor"
},
{
"name": "Normalize",
"mean": [
0.485,
0.456,
0.406
],
"std": [
0.229,
0.224,
0.225
]
}
]
},
"loss": {
"name": "LaneLoss",
"existence_weight": 0.1,
"ignore_index": 255,
"weight": [
0.4,
1,
1,
1,
1
]
},
"optimizer": {
"name": "torch_optimizer",
"torch_optim_class": "SGD",
"lr": 0.01,
"momentum": 0.9,
"weight_decay": 0.0001
},
"lr_scheduler": {
"name": "poly_scheduler_with_warmup",
"epochs": 12,
"power": 0.9,
"warmup_steps": 200
},
"train": {
"exp_name": "ERF_CU_changed_input_size",
"workers": 10,
"batch_size": 80,
"checkpoint": null,
"world_size": 0,
"dist_url": "env:\/\/",
"device": "cuda",
"val_num_steps": 0,
"save_dir": ".\/checkpoints",
"input_size": [
256,
768
],
"original_size": [
590,
1640
],
"num_classes": 5,
"num_epochs": 12,
"collate_fn": null,
"seg": true,
"state": 0,
"mixed_precision": true,
"distributed": false,
"validation": false,
"exp_dir": ".\/checkpoints\/ERF_CU_changed_input_size"
},
"test": {
"exp_name": "ERF_CU_changed_input_size",
"workers": 10,
"batch_size": 80,
"checkpoint": ".\/checkpoints\/erfnet_baseline_culane\/model.pt",
"device": "cuda",
"save_dir": ".\/checkpoints",
"seg": true,
"gap": 20,
"ppl": 18,
"thresh": 0.3,
"collate_fn": null,
"input_size": [
256,
768
],
"original_size": [
590,
1640
],
"max_lane": 4,
"dataset_name": "culane"
},
"model": {
"name": "ERFNet",
"num_classes": 5,
"dropout_1": 0.1,
"dropout_2": 0.1,
"pretrained_weights": "erfnet_encoder_pretrained.pth.tar",
"lane_classifier_cfg": {
"name": "EDLaneExist",
"num_output": 4,
"flattened_size": 3840,
"dropout": 0.3,
"pool": "max"
}
}
}
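For context on the lr_scheduler block above, here is a minimal sketch of what a poly schedule with warmup typically computes, assuming linear warmup followed by polynomial decay; this reflects the common convention, not necessarily this repo's exact implementation:

# Assumed convention for "poly_scheduler_with_warmup"; illustrative only.
def poly_warmup_factor(step, total_steps, warmup_steps=200, power=0.9):
    if step < warmup_steps:
        # Linear warmup from 0 to 1.
        return step / warmup_steps
    # Polynomial decay from 1 to 0 over the remaining steps.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return (1.0 - progress) ** power

# The base lr (0.01 above) is multiplied by this factor each step, so the
# schedule's length is fixed by total_steps = iters_per_epoch * epochs.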
Long iteration case
{
"dataset": {
"name": "CULaneAsSegmentation",
"image_set": "train",
"root": "/workspaces\/datasets\/CULane"
},
"train_augmentation": {
"name": "Compose",
"transforms": [
{
"name": "Resize",
"size_image": [
256,
768
],
"size_label": [
256,
768
]
},
{
"name": "RandomRotation",
"degrees": 3
},
{
"name": "ToTensor"
},
{
"name": "Normalize",
"mean": [
0.485,
0.456,
0.406
],
"std": [
0.229,
0.224,
0.225
],
"normalize_target": true
}
]
},
"test_augmentation": {
"name": "Compose",
"transforms": [
{
"name": "Resize",
"size_image": [
256,
768
],
"size_label": [
256,
768
]
},
{
"name": "ToTensor"
},
{
"name": "Normalize",
"mean": [
0.485,
0.456,
0.406
],
"std": [
0.229,
0.224,
0.225
]
}
]
},
"loss": {
"name": "LaneLoss",
"existence_weight": 0.1,
"ignore_index": 255,
"weight": [
0.4,
1,
1,
1,
1
]
},
"optimizer": {
"name": "torch_optimizer",
"torch_optim_class": "SGD",
"lr": 0.2,
"momentum": 0.9,
"weight_decay": 0.0001
},
"lr_scheduler": {
"name": "poly_scheduler_with_warmup",
"epochs": 12,
"power": 0.9,
"warmup_steps": 200
},
"model": {
"name": "ERFNet",
"num_classes": 5,
"dropout_1": 0.1,
"dropout_2": 0.1,
"pretrained_weights": "erfnet_encoder_pretrained.pth.tar",
"lane_classifier_cfg": {
"name": "EDLaneExist",
"num_output": 4,
"flattened_size": 3840,
"dropout": 0.3,
"pool": "max"
}
},
"train": {
"exp_name": "ERF_CU_changed_input_size_froze_others",
"explain": "Just test for debugging",
"workers": 10,
"batch_size": 20,
"checkpoint": null,
"world_size": 0,
"dist_url": "env:\/\/",
"device": "cuda",
"val_num_steps": 0,
"save_dir": ".\/checkpoints",
"input_size": [
256,
768
],
"original_size": [
590,
1640
],
"num_classes": 5,
"num_epochs": 12,
"collate_fn": null,
"seg": true,
"state": 0,
"mixed_precision": false,
"distributed": false,
"validation": false,
"exp_dir": ".\/checkpoints\/ERF_CU_changed_input_size_froze_others"
},
"test": {
"exp_name": "ERF_CU_changed_input_size_froze_others",
"workers": 10,
"batch_size": 20,
"checkpoint": ".\/checkpoints\/ERF_CU_changed_input_size_froze_others\/model.pt",
"device": "cuda",
"save_dir": ".\/checkpoints",
"seg": true,
"gap": 20,
"ppl": 18,
"thresh": 0.3,
"collate_fn": null,
"input_size": [
256,
768
],
"original_size": [
590,
1640
],
"max_lane": 4,
"dataset_name": "culane"
}
}
Do you have any idea what's happening? Note that I also changed the model's input size for my use case.
@daeunni These configs have the same number of training epochs. If this is CULane (88k images), then #iterations should be around 4.4k * 12 = 53k. I'm guessing the tensorboard snapshots are from TuSimple, where a 50-epoch run at ~8k iterations is correct? Could you provide the TuSimple configs (python configs) for me to try to reproduce the problem (TuSimple runs faster)?
@voldemortX Thank you for your quick reply! This is a CULane tensorboard snapshot (the blue & purple tensorboard above is from TuSimple; you are right!). Are this snapshot and the CULane configs above okay?
@daeunni Yes, these should also be OK. The longer one looks correct, so I'll just run the master branch code for the shorter one. If it can be reproduced, this could be a bug.
@voldemortX Thanks a lot! I'll run those cases again, too.
@daeunni Just as I was copying your config, I think I found the issue. The shorter one has train.batch_size=80, which is exactly 4 times that of the longer one (20). With the same number of epochs, 4x the batch size means 1/4 the iterations per epoch, hence the shorter curve.
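A quick back-of-the-envelope check, assuming CULane's roughly 88,880 training images (the exact count may differ):

import math

# Iterations scale inversely with batch size for a fixed epoch count.
num_images = 88880   # approximate CULane training set size
num_epochs = 12
for batch_size in (20, 80):
    iters_per_epoch = math.ceil(num_images / batch_size)
    print(f"batch_size={batch_size}: {iters_per_epoch} iters/epoch, "
          f"{iters_per_epoch * num_epochs} total")

# batch_size=20: 4444 iters/epoch, 53328 total  (the longer run)
# batch_size=80: 1111 iters/epoch, 13332 total  (the shorter run)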
Oh, that might be it! Thank you for your help :)