Closed Njuod closed 4 years ago
Hi, I think you just need to change the name of the dataloader from `ade20k` to `ADE20K`.
Thanks!
I've modified the dataloader name and the split names, and it works at the beginning, but then I got another error.
The modified code: `{ "name": "SegNet", "n_gpu": 1, "use_synch_bn": true,
"arch": {
"type": "SegNet",
"args": {
"backbone": "resnet50",
"freeze_bn": false,
"freeze_backbone": false
}
},
"train_loader": {
"type": "ADE20K",
"args":{
"data_dir": "..../pytorch_segmentation/ADEChallengeData2016",
"batch_size": 4,
"base_size": 400,
"crop_size": 380,
"augment": true,
"shuffle": true,
"scale": true,
"flip": true,
"rotate": true,
"blur": false,
"split": "training",
"num_workers": 8
}
},
"val_loader": {
"type": "ADE20K",
"args":{
"data_dir": "..../pytorch_segmentation/ADEChallengeData2016",
"batch_size": 2,
"crop_size": 480,
"val": true,
"split": "validation",
"num_workers": 4
}
},
"optimizer": {
"type": "SGD",
"differential_lr": true,
"args":{
"lr": 0.01,
"weight_decay": 1e-4,
"momentum": 0.9
}
},
"loss": "CrossEntropyLoss2d",
"ignore_index": 255,
"lr_scheduler": {
"type": "Poly",
"args": {}
},
"trainer": {
"epochs": 80,
"save_dir": "saved/",
"save_period": 10,
"monitor": "max Mean_IoU",
"early_stop": 10,
"tensorboard": true,
"log_dir": "saved/runs",
"log_per_iter": 20,
"val": true,
"val_per_epochs": 5
}
} `
The result:
`
SegNet( (stage1_encoder): Sequential( (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) ) (stage2_encoder): Sequential( (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) ) (stage3_encoder): Sequential( (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) ) (stage4_encoder): Sequential( (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) ) (stage5_encoder): Sequential( (0): 
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) ) (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) (stage1_decoder): Sequential( (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) ) (stage2_decoder): Sequential( (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) ) (stage3_decoder): Sequential( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): 
ReLU(inplace=True) (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) ) (stage4_decoder): Sequential( (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) ) (stage5_decoder): Sequential( (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(64, 150, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) (unpool): MaxUnpool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0)) ) Nbr of trainable parameters: 16396246
Detected GPUs: 1 Requested: 1
0%| | 0/10105 [00:00<?, ?it/s]C:\Program Files\Python36\lib\site-packages\torch\optim\lr_scheduler.py:122: UserWarning: Detected call of lr_scheduler.step()
before optimizer.step()
. In PyTorch 1.1.0 and later, you should call them in the opposite order: optimizer.step()
before lr_scheduler.step()
. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
Traceback (most recent call last):
File "train.py", line 62, in
`
I have also reduced the batch size and I still get the same error.
Sorry, I am new to PyTorch. So, do you have any idea? Thank you for your help.
Hi, in this case you ran out of GPU memory; try reducing either the batch size or the crop size.
Hi yassouali,
Thank you for your help.
I'm trying to create my own dataloader, but I have an issue with the labels and the paletted files. Could you explain these two folders, especially (paletted.py)?
My dataset has only two classes (background, not-background)
Thanks again!
Hi,
It is better to ignore them both: the labels are just the class names for a given ID in the provided datasets, and the palette is just the colors used for each class in each dataset.
To simplify things, all you need to do is follow the VOC dataloader: use the same palette as in the VOC dataset, and only change the number of classes and how you load the images and the labels.
Good luck.
Because my dataset has no data augmentation and is similar to the ADE20K dataset, I've loaded the images and the labels in the same way. I've also used the same palette as in the VOC dataset, as you suggested. The code looks as follows:
`
class DATADataset(BaseDataSet):
def __init__(self, **kwargs):
self.num_classes = 2
self.palette = palette.get_voc_palette(self.num_classes)
super(DATADataset, self).__init__(**kwargs)
def _set_files(self):
if self.split in ["training", "validation"]:
self.image_dir = os.path.join(self.root, 'images', self.split)
self.label_dir = os.path.join(self.root, 'annotations', self.split)
self.files = [os.path.basename(path).split('.')[0] for path in glob(self.image_dir + '/*.jpg')]
else: raise ValueError(f"Invalid split name {self.split}")
def _load_data(self, index):
image_id = self.files[index]
image_path = os.path.join(self.image_dir, image_id + '.jpg')
label_path = os.path.join(self.label_dir, image_id + '.png')
image = np.asarray(Image.open(image_path).convert('RGB'), dtype=np.float32)
label = np.asarray(Image.open(label_path), dtype=np.int32) - 1 # from -1 to 149
return image, label, image_id
class DATA(BaseDataLoader):
def __init__(self, data_dir, batch_size, split, crop_size=None, base_size=None, scale=True, num_workers=1, val=False,
shuffle=False, flip=False, rotate=False, blur= False, augment=False, val_split= None, return_id=False):
self.MEAN = [0.48897059, 0.46548275, 0.4294]
self.STD = [0.22861765, 0.22948039, 0.24054667]
kwargs = {
'root': data_dir,
'split': split,
'mean': self.MEAN,
'std': self.STD,
'augment': augment,
'crop_size': crop_size,
'base_size': base_size,
'scale': scale,
'flip': flip,
'blur': blur,
'rotate': rotate,
'return_id': return_id,
'val': val
}
self.dataset = DATADataset(**kwargs)
super(DATA, self).__init__(self.dataset, batch_size, shuffle, num_workers, val_split)`
Then, I got the following error:
AttributeError: module 'dataloaders' has no attribute 'DATA'
So, I added `from .data import DATA` to `__init__.py`.
Then, I got this error:
RuntimeError: cuda runtime error (710) : device-side assert triggered at C:/w/1/s/windows/pytorch/aten/src/ATen/native/cuda/SoftMax.cu:651
Therefore, I modified the following line of code and removed the `- 1`:
label = np.asarray(Image.open(label_path), dtype=np.int32) # - 1 # from -1 to 149
and I changed the value of "ignore_index" to 255.
Is what I did right?
I'm sorry for bothering you
Here is the beginning of training:
packages\torch\optim\lr_scheduler.py:122: UserWarning: Detected call of
lr_scheduler.step()before
optimizer.step(). In PyTorch 1.1.0 and later, you should call them in the opposite order:
optimizer.step()before
lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
TRAIN (1) | Loss: 0.001 | Acc 0.70 mIoU 0.35 | B 0.60 D 0.02 |: 100%|█████████████████████████| 3063/3063 [30:44<00:00, 1.31it/s]
TRAIN (2) | Loss: 0.000 | Acc 0.70 mIoU 0.35 | B 0.61 D 0.03 |: 100%|█████████████████████████| 3063/3063 [31:04<00:00, 1.20it/s]
TRAIN (3) | Loss: 0.000 | Acc 0.70 mIoU 0.35 | B 0.64 D 0.03 |: 54%|█████████████▍ | 1641/3063 [17:32<14:10, 1.67it/s]`
I do not know what is wrong
Hi, yes, what you did is correct: the classes in your case are 0 and 1, with no ignored labels.
What is the current problem? If it is the performance, try different models.
I am still trying to run your codes on my dataset. But the result is still illogical and I think I need to make some adjustments.
If I have masks that contain only one class, meaning 0 indicates the background and 255 indicates the required class, does this mean that the result will be affected by `"ignore_index": 255`, since the target value will be ignored? I have tried to remove it, but I got an error because of that.
Can you explain more about the variable `"ignore_index": 255`?
Also, what is the difference between converting the data type of the labels to float32 or int32?
@Njuod Yes, if you only have two classes, better to use 0 and 1 as your labels.
For certain datasets, there are some regions in the image that were not annotated. For example, the object borders in PASCAL VOC were not annotated, and in the labels these regions are marked as 255. When training the model, we want to ignore these regions and not backpropagate the loss from them. This is why `ignore_index` is used: when computing the cross-entropy, we pass the correct ignore label index. So if you pass one of your classes as the ignore index, you can imagine that the model will be random in these regions, given that it was never trained on that class.
Hi, I've also been trying to run the config file for SegNet on ADE20K as a test. However, I got the same error, and my config.json is the same as @Njuod's. I also modified the following line of code to remove the -1: `label = np.asarray(Image.open(label_path), dtype=np.int32) # - 1 # from -1 to 149`. I saw that ignore_index is already 255, so I didn't change it. But I still get the following error:
/home/muyun/.local/lib/python3.6/site-packages/torch/optim/lr_scheduler.py:82: UserWarning: Detected call of lr_scheduler.step()
before optimizer.step()
. In PyTorch 1.1.0 and later, you should call them in the opposite order: optimizer.step()
before lr_scheduler.step()
. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule.See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
TRAIN (1) | Loss: 5.406 | Acc 0.03 mIoU 0.00 | B 2.85 D 1.39 |: 0%| | 5/5053 [00:14<4:42:06, 3.35s/it]/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T , T , T , long , T , int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [160,0,0] Assertion t >= 0 && t < n_classes
failed.
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T , T , T , long , T , int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [161,0,0] Assertion t >= 0 && t < n_classes
failed.
....
...
...
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T , T , T , long , T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [159,0,0] Assertion t >= 0 && t < n_classes
failed.
THCudaCheck FAIL file=/pytorch/aten/src/THC/THCCachingHostAllocator.cpp line=296 error=59 : device-side assert triggered
Traceback (most recent call last):
File "train.py", line 61, in
Could you help me, please?
Hi, I ran the UNet on a custom Dataset with its own Dataset and Dataloader (adapted from VOC) and got the above error, too. Is there any solution yet? Thanks for helping!
@Murmeltier105 seems not...
For ADE20K, the ignored class ID is -1 instead of 255. This is why there is a `- 1` in the code: it shifts the unlabeled regions (class 0) to -1. All you need to do is set the ignore index to -1 in the config file and keep the `- 1` in the code; it should then work.
Hi, I ran the UNet on a custom Dataset with its own Dataset and Dataloader (adapted from VOC) and got the above error, too. Is there any solution yet? Thanks for helping!
Hi, sorry for the late reply. Can you post the code with some info / examples about the form of your data?
The dataloader is at: https://github.com/Murmeltier105/CRC_segmentation/blob/master/dataloaders/crc.py
It's added to `dataloaders/__init__.py` as well.
For the config I used: https://github.com/Murmeltier105/CRC_segmentation/blob/master/config.json
I have file lists for train, val and test with content like: TO5849297(1,00,37500,15500,500,500)
where everything before the opening parenthesis is the folder of a tiled image with tile size 500*500. Images (jpg) and labels (png) are in the same folder.
The error output looks like: `/home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'. _np_qint8 = np.dtype([("qint8", np.int8, 1)]) /home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'. _np_quint8 = np.dtype([("quint8", np.uint8, 1)]) /home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'. _np_qint16 = np.dtype([("qint16", np.int16, 1)]) /home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'. _np_quint16 = np.dtype([("quint16", np.uint16, 1)]) /home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'. _np_qint32 = np.dtype([("qint32", np.int32, 1)]) /home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'. np_resource = np.dtype([("resource", np.ubyte, 1)]) Detected GPUs: 1 Requested: 1
TRAIN (1) | Loss: 1.747 | Acc 0.30 mIoU 0.10 | B 0.31 D 0.06 |: 1%|▏ | 22/3458 [00:06<08:10, 7.00it/s]/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T , T , T , long , T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [128,0,0] Assertion t >= 0 && t < n_classes
failed.
[...]
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T , T , T , long , T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [895,0,0] Assertion t >= 0 && t < n_classes
failed.
Traceback (most recent call last):
File "/home/dr1/Dokumente/pytorch_segmentation/train.py", line 61, in
Do I have to use the ignore_index of the loss in the config file? Because up to now there is no class I want to leave out.
Thanks for your help!
Thanks for the code. The problem is that for one of your labels the class ID is > 5 or < 0 (because you have 6 classes). What you can do is add an assert to find the root of the problem; then you can find the value of the class ID that causes the error and add it as an ignore class.
Add this after `label = np.asarray(Image.open(label_path), dtype=np.int32)` (the snippet below calls the array `img`; use the name of your loaded label array, e.g. `label`):
assert img.max() < 6 and img.min() >= 0, f"the ID cause problem is one of {img.max()}, {img.min()} of image {image_id}"
Thank you so much for your help! It works. label.max was at 6. Will there always be an excess label, or rather, where does it come from?
This is specific to your dataset; maybe you have 7 classes and not 6.
Hi,
Could you please let me know on which dataset SegNet was trained? The issue is that whatever dataset I use to train the model, the mean IoU on the validation set becomes 0.
I have also trained SegNet on the SUN-RGB dataset, which is mentioned in the original paper, but I got the following result:
` TRAIN (46) | Loss: 0.752 | Acc 0.76 mIoU 0.40 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.90it/s]
TRAIN (47) | Loss: 0.736 | Acc 0.76 mIoU 0.41 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (48) | Loss: 0.718 | Acc 0.77 mIoU 0.42 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (49) | Loss: 0.695 | Acc 0.78 mIoU 0.44 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (50) | Loss: 0.681 | Acc 0.78 mIoU 0.45 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
EVAL (50) | Loss: 9.292, PixelAcc: 0.18, Mean IoU: 0.00 |: 100%|██████████████████████████████| 1263/1263 [04:12<00:00, 5.00it/s]
## Info for epoch 50 ##
val_loss : 9.29229
Pixel_Accuracy : 0.184
Mean_IoU : 0.004999999888241291
Class_IoU : {0: 0.002, 1: 0.187, 2: 0.001, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.005, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0}
Saving a checkpoint: saved/SegNet/05-12_04-36/checkpoint-epoch50.pth ...
TRAIN (51) | Loss: 0.650 | Acc 0.79 mIoU 0.47 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (52) | Loss: 0.633 | Acc 0.79 mIoU 0.48 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (53) | Loss: 0.623 | Acc 0.80 mIoU 0.49 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (54) | Loss: 0.607 | Acc 0.80 mIoU 0.50 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (55) | Loss: 0.591 | Acc 0.81 mIoU 0.51 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
EVAL (55) | Loss: 10.134, PixelAcc: 0.19, Mean IoU: 0.00 |: 100%|█████████████████████████████| 1263/1263 [04:12<00:00, 5.00it/s]
## Info for epoch 55 ##
val_loss : 10.13353
Pixel_Accuracy : 0.187
Mean_IoU : 0.004999999888241291
Class_IoU : {0: 0.003, 1: 0.187, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0}
TRAIN (56) | Loss: 0.562 | Acc 0.82 mIoU 0.54 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (57) | Loss: 0.559 | Acc 0.82 mIoU 0.54 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (58) | Loss: 0.546 | Acc 0.82 mIoU 0.55 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (59) | Loss: 0.524 | Acc 0.83 mIoU 0.56 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (60) | Loss: 0.510 | Acc 0.83 mIoU 0.57 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:42<00:00, 3.86it/s]^[OP
EVAL (60) | Loss: 10.961, PixelAcc: 0.19, Mean IoU: 0.00 |: 100%|█████████████████████████████| 1263/1263 [04:19<00:00, 4.87it/s]
## Info for epoch 60 ##
val_loss : 10.9608
Pixel_Accuracy : 0.187
Mean_IoU : 0.004999999888241291
Class_IoU : {0: 0.004, 1: 0.187, 2: 0.001, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0}
Performance didn't improve for 10 epochs Training Stoped `
The config file is the following:
`{ "name": "SegNet", "n_gpu": 1, "use_synch_bn": false,
"arch": {
"type": "SegNet",
"args": {
"backbone": "resnet50",
"freeze_bn": false,
"freeze_backbone": false
}
},
"train_loader": {
"type": "SUN",
"args":{
"data_dir": "./SUN-RGB",
"batch_size": 4,
"base_size": 480,
"crop_size": 300,
"augment": true,
"shuffle": false,
"scale": false,
"flip": false,
"rotate": false,
"blur": false,
"split": "training",
"num_workers": 8
}
},
"val_loader": {
"type": "SUN",
"args":{
"data_dir": "./SUN-RGB",
"batch_size": 4,
"crop_size": 480,
"val": true,
"split": "validation",
"num_workers": 4
}
},`
While the data loader is:
` def init(self, kwargs): self.num_classes = 37 self.palette = palette.get_voc_palette(self.num_classes) super(SUNDataset, self).init(kwargs)
#print (x)
def _set_files(self):
if self.split in ["training", "validation"]:
self.image_dir = os.path.join(self.root, 'images', self.split)
self.label_dir = os.path.join(self.root, 'annotations', self.split)
self.files = [os.path.basename(path).split('.')[0] for path in glob(self.image_dir + '/*.jpg')]
else: raise ValueError(f"Invalid split name {self.split}")
def _load_data(self, index):
image_id = self.files[index]
image_path = os.path.join(self.image_dir, image_id + '.jpg')
label_path = os.path.join(self.label_dir, image_id + '.png')
image = np.asarray(Image.open(image_path).convert('RGB'), dtype=np.float32)
label = np.asarray(Image.open(label_path), dtype=np.int32)
return image, label, image_id`
Your advice is highly appreciated. Thank you!
Did you solve this issue?
Hi,
Thank you for building this useful library. I've been trying to run the config file for SegNet on ADE20K as a first try because I would like to run the model on my own dataset. However, I've got the following error. I will appreciate your help.
The error:
Traceback (most recent call last): File "train.py", line 61, in <module> main(config, args.resume) File "train.py", line 22, in main train_loader = get_instance(dataloaders, 'train_loader', config) File "train.py", line 16, in get_instance return getattr(module, config[name]['type'])(*args, **config[name]['args']) TypeError: 'module' object is not callable
Here is the modified config file:
` { "name": "SegNet", "n_gpu": 1, "use_synch_bn": true,
} `
I'm not sure if I modified the file correctly. Thanks!