yassouali / pytorch-segmentation

:art: Semantic segmentation models, datasets and losses implemented in PyTorch.
MIT License

Unable to run the configurations for SegNet on ADE20K #56

Closed Njuod closed 4 years ago

Njuod commented 4 years ago

Hi,

Thank you for building this useful library. I've been trying to run the config file for SegNet on ADE20K as a first try, because I would like to run the model on my own dataset. However, I've got the following error. I would appreciate your help.

The error:

```
Traceback (most recent call last):
  File "train.py", line 61, in <module>
    main(config, args.resume)
  File "train.py", line 22, in main
    train_loader = get_instance(dataloaders, 'train_loader', config)
  File "train.py", line 16, in get_instance
    return getattr(module, config[name]['type'])(*args, **config[name]['args'])
TypeError: 'module' object is not callable
```

Here is the modified config file:

```json
{
    "name": "SegNet",
    "n_gpu": 1,
    "use_synch_bn": true,

    "arch": {
        "type": "SegNet",
        "args": {
            "backbone": "resnet50",
            "freeze_bn": false,
            "freeze_backbone": false
        }
    },

    "train_loader": {
        "type": "ade20k",
        "args": {
            "data_dir": "ADEChallengeData2016/images/training/",
            "batch_size": 8,
            "base_size": 400,
            "crop_size": 380,
            "augment": true,
            "shuffle": true,
            "scale": true,
            "flip": true,
            "rotate": true,
            "blur": false,
            "split": "train_aug",
            "num_workers": 8
        }
    },

    "val_loader": {
        "type": "ade20k",
        "args": {
            "data_dir": "ADEChallengeData2016/images/validation/",
            "batch_size": 8,
            "crop_size": 480,
            "val": true,
            "split": "val",
            "num_workers": 4
        }
    },

    "optimizer": {
        "type": "SGD",
        "differential_lr": true,
        "args": {
            "lr": 0.01,
            "weight_decay": 1e-4,
            "momentum": 0.9
        }
    },

    "loss": "CrossEntropyLoss2d",
    "ignore_index": 255,
    "lr_scheduler": {
        "type": "Poly",
        "args": {}
    },

    "trainer": {
        "epochs": 80,
        "save_dir": "saved/",
        "save_period": 10,

        "monitor": "max Mean_IoU",
        "early_stop": 10,

        "tensorboard": true,
        "log_dir": "saved/runs",
        "log_per_iter": 20,

        "val": true,
        "val_per_epochs": 5
    }
}
```

I'm not sure if I modified the file correctly. Thanks!

yassouali commented 4 years ago

Hi, I think you just need to change the name of the dataloader from `ade20k` to `ADE20K`.
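From the traceback, the `"type"` value is resolved with `getattr` on the `dataloaders` package, roughly like the sketch below, so it must name the dataloader class, not the module file:

```python
# Sketch of train.py's get_instance, reconstructed from the traceback above:
# config[name]['type'] must be a callable attribute of the given package,
# i.e. the class ADE20K; ade20k is the module itself, hence the TypeError.
def get_instance(module, name, config, *args):
    return getattr(module, config[name]['type'])(*args, **config[name]['args'])
```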

Njuod commented 4 years ago

Thanks!

I've modified the loader name and the split names, and it worked at the beginning, but then I got another error.

The modified config:

```json
{
    "name": "SegNet",
    "n_gpu": 1,
    "use_synch_bn": true,

    "arch": {
        "type": "SegNet",
        "args": {
            "backbone": "resnet50",
            "freeze_bn": false,
            "freeze_backbone": false
        }
    },

    "train_loader": {
        "type": "ADE20K",
        "args": {
            "data_dir": "..../pytorch_segmentation/ADEChallengeData2016",
            "batch_size": 4,
            "base_size": 400,
            "crop_size": 380,
            "augment": true,
            "shuffle": true,
            "scale": true,
            "flip": true,
            "rotate": true,
            "blur": false,
            "split": "training",
            "num_workers": 8
        }
    },

    "val_loader": {
        "type": "ADE20K",
        "args": {
            "data_dir": "..../pytorch_segmentation/ADEChallengeData2016",
            "batch_size": 2,
            "crop_size": 480,
            "val": true,
            "split": "validation",
            "num_workers": 4
        }
    },

    "optimizer": {
        "type": "SGD",
        "differential_lr": true,
        "args": {
            "lr": 0.01,
            "weight_decay": 1e-4,
            "momentum": 0.9
        }
    },

    "loss": "CrossEntropyLoss2d",
    "ignore_index": 255,
    "lr_scheduler": {
        "type": "Poly",
        "args": {}
    },

    "trainer": {
        "epochs": 80,
        "save_dir": "saved/",
        "save_period": 10,

        "monitor": "max Mean_IoU",
        "early_stop": 10,

        "tensorboard": true,
        "log_dir": "saved/runs",
        "log_per_iter": 20,

        "val": true,
        "val_per_epochs": 5
    }
}
```

The result:

```
SegNet(
  (stage1_encoder): Sequential( (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) )
  (stage2_encoder): Sequential( (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) )
  (stage3_encoder): Sequential( (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) )
  (stage4_encoder): Sequential( (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) )
  (stage5_encoder): Sequential( (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) )
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (stage1_decoder): Sequential( (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) )
  (stage2_decoder): Sequential( (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) )
  (stage3_decoder): Sequential( (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (8): ReLU(inplace=True) )
  (stage4_decoder): Sequential( (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) )
  (stage5_decoder): Sequential( (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): ReLU(inplace=True) (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (5): ReLU(inplace=True) (6): Conv2d(64, 150, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) )
  (unpool): MaxUnpool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0))
)
Nbr of trainable parameters: 16396246

Detected GPUs: 1 Requested: 1

  0%| | 0/10105 [00:00<?, ?it/s]
C:\Program Files\Python36\lib\site-packages\torch\optim\lr_scheduler.py:122: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
  "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
Traceback (most recent call last):
  File "train.py", line 62, in <module>
    main(config, args.resume)
  File "train.py", line 43, in main
    trainer.train()
  File "C:\Users...\pytorch_segmentation\base\base_trainer.py", line 101, in train
    results = self._train_epoch(epoch)
  File "C:\Users...\pytorch_segmentation\trainer.py", line 56, in _train_epoch
    output = self.model(data)
  File "C:\Program Files\Python36\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "C:\Program Files\Python36\lib\site-packages\torch\nn\parallel\data_parallel.py", line 150, in forward
    return self.module(*inputs[0], **kwargs[0])
  File "C:\Program Files\Python36\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "C:\Users...\pytorch_segmentation\models\segnet.py", line 101, in forward
    x = self.stage5_decoder(x)
  File "C:\Program Files\Python36\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "C:\Program Files\Python36\lib\site-packages\torch\nn\modules\container.py", line 100, in forward
    input = module(input)
  File "C:\Program Files\Python36\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "C:\Users...\pytorch_segmentation\utils\sync_batchnorm\batchnorm.py", line 68, in forward
    self.training, self.momentum, self.eps)
  File "C:\Program Files\Python36\lib\site-packages\torch\nn\functional.py", line 1670, in batch_norm
    training, momentum, eps, torch.backends.cudnn.enabled)
RuntimeError: CUDA out of memory. Tried to allocate 72.00 MiB (GPU 0; 2.00 GiB total capacity; 1.21 GiB already allocated; 38.50 MiB free; 1.24 GiB reserved in total by PyTorch)
```

I have also reduced the batch size and I still get the same error.

Sorry, I am new to PyTorch. So, do you have any idea? Thank you for your help.

yassouali commented 4 years ago

Hi, in this case you ran out of GPU memory; try reducing either the batch size or the crop size.
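For example, something like this in the config above (illustrative values only; how far you need to go down depends on your GPU, here 2 GiB, and the other `args` stay as they were):

```json
"train_loader": {
    "type": "ADE20K",
    "args": {
        "batch_size": 2,
        "crop_size": 256
    }
}
```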

Njuod commented 4 years ago

Hi yassouali,

Thank you for your help.

I'm trying to create my own dataloader, but I have an issue with the labels and the palette files. Could you explain these two folders, especially `palette.py`?

My dataset has only two classes (background, not-background).

Thanks again!

yassouali commented 4 years ago

Hi,

Better to ignore them both: the labels files are just the class names for each ID of the provided datasets, and the palette is just the display color for each class of each dataset.

To simplify things, all you need to do is something similar to the VOC dataloader: use the same palette as the VOC dataset, and only change the number of classes and how you load the images and labels.

Good luck.

Njuod commented 4 years ago

Because my dataset file has no data augmentation and is similar to the ADE20K dataset file, I've loaded the images and labels in the same way. I've also used the same palette as the VOC dataset, as you suggested. The code looks as follows:

```python
# imports as in the repo's other dataloaders
import os
from glob import glob

import numpy as np
from PIL import Image

from base import BaseDataSet, BaseDataLoader
from utils import palette


class DATADataset(BaseDataSet):

    def __init__(self, **kwargs):
        self.num_classes = 2
        self.palette = palette.get_voc_palette(self.num_classes)
        super(DATADataset, self).__init__(**kwargs)

    def _set_files(self):
        if self.split in ["training", "validation"]:
            self.image_dir = os.path.join(self.root, 'images', self.split)
            self.label_dir = os.path.join(self.root, 'annotations', self.split)
            self.files = [os.path.basename(path).split('.')[0] for path in glob(self.image_dir + '/*.jpg')]
        else:
            raise ValueError(f"Invalid split name {self.split}")

    def _load_data(self, index):
        image_id = self.files[index]
        image_path = os.path.join(self.image_dir, image_id + '.jpg')
        label_path = os.path.join(self.label_dir, image_id + '.png')
        image = np.asarray(Image.open(image_path).convert('RGB'), dtype=np.float32)
        label = np.asarray(Image.open(label_path), dtype=np.int32) - 1  # from -1 to 149
        return image, label, image_id


class DATA(BaseDataLoader):

    def __init__(self, data_dir, batch_size, split, crop_size=None, base_size=None, scale=True,
                 num_workers=1, val=False, shuffle=False, flip=False, rotate=False, blur=False,
                 augment=False, val_split=None, return_id=False):

        self.MEAN = [0.48897059, 0.46548275, 0.4294]
        self.STD = [0.22861765, 0.22948039, 0.24054667]

        kwargs = {
            'root': data_dir,
            'split': split,
            'mean': self.MEAN,
            'std': self.STD,
            'augment': augment,
            'crop_size': crop_size,
            'base_size': base_size,
            'scale': scale,
            'flip': flip,
            'blur': blur,
            'rotate': rotate,
            'return_id': return_id,
            'val': val
        }

        self.dataset = DATADataset(**kwargs)
        super(DATA, self).__init__(self.dataset, batch_size, shuffle, num_workers, val_split)
```

Then, I got the following error:

```
AttributeError: module 'dataloaders' has no attribute 'DATA'
```

So, I added `from .data import DATA` to the dataloaders `__init__.py`.

Then, I got this error:

```
RuntimeError: cuda runtime error (710) : device-side assert triggered at C:/w/1/s/windows/pytorch/aten/src/ATen/native/cuda/SoftMax.cu:651
```

Therefore, I removed the `- 1` from that line, leaving `label = np.asarray(Image.open(label_path), dtype=np.int32)`, and I changed the value of `"ignore_index"` to 255.

Is what I did right?

I'm sorry for bothering you.

Njuod commented 4 years ago

Here is the beginning of training:

```
packages\torch\optim\lr_scheduler.py:122: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
  "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
TRAIN (1) | Loss: 0.001 | Acc 0.70 mIoU 0.35 | B 0.60 D 0.02 |: 100%|█████████████████████████| 3063/3063 [30:44<00:00, 1.31it/s]
TRAIN (2) | Loss: 0.000 | Acc 0.70 mIoU 0.35 | B 0.61 D 0.03 |: 100%|█████████████████████████| 3063/3063 [31:04<00:00, 1.20it/s]
TRAIN (3) | Loss: 0.000 | Acc 0.70 mIoU 0.35 | B 0.64 D 0.03 |:  54%|█████████████▍           | 1641/3063 [17:32<14:10, 1.67it/s]
```

I do not know what is wrong.

yassouali commented 4 years ago

Hi, yes, what you did is correct; the classes in your case are 0 and 1, with no ignored labels.

What is the current problem? If it is the performance, try different models.

Njuod commented 4 years ago

I am still trying to run your code on my dataset, but the results are still illogical, and I think I need to make some adjustments. My masks contain only one foreground class: 0 is the background value and 255 is the value of the required class. Does this mean the results will be affected by `"ignore_index": 255`, since those target values will be ignored? I tried removing it but got an error. Can you explain more about the `"ignore_index": 255` setting? Also, what is the difference between converting the data type of the labels to float32 versus int32?

yassouali commented 4 years ago

@Njuod Yes, if you only have two classes, it is better to use 0 and 1 as your labels.
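For example, a minimal sketch of the remapping inside `_load_data`, assuming your mask PNGs store 255 for the foreground (note `np.array` rather than `np.asarray`, so the array is a writeable copy):

```python
import numpy as np
from PIL import Image

label = np.array(Image.open(label_path), dtype=np.int32)  # copy -> writeable
label[label == 255] = 1  # foreground 255 -> class 1; background stays 0
```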

For certain datasets, some regions of the image were not annotated; for example, the object borders in PASCAL VOC are unlabeled, and in the label maps these regions are marked as 255. When training the model, we also want to ignore these regions and not backpropagate the loss from them. This is why `ignore_index` is used: when computing the cross-entropy, we pass the correct ignore label index. So if you pass one of your real classes as the ignore index, you can imagine that the model will be random in those regions, given that it was never trained on that class.
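As a small self-contained illustration of the mechanism (plain PyTorch, not this repo's loss wrapper):

```python
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss(ignore_index=255)

logits = torch.randn(1, 2, 4, 4, requires_grad=True)  # (N, C, H, W), 2 classes
target = torch.randint(0, 2, (1, 4, 4))               # valid labels: 0 and 1
target[0, 0, :] = 255                                 # one row marked "unannotated"

loss = criterion(logits, target)  # averaged over non-ignored pixels only
loss.backward()                   # ignored pixels contribute zero gradient
```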

sophiatmu commented 4 years ago

Hi, I've been trying to run the config file for SegNet on ADE20K as a try too, and I've got the same error; my config.json is the same as @Njuod's. I also modified the line and removed the `- 1`, leaving `label = np.asarray(Image.open(label_path), dtype=np.int32)`. I saw that `ignore_index` is already 255, so I didn't replace it. But I still get the following error:

```
/home/muyun/.local/lib/python3.6/site-packages/torch/optim/lr_scheduler.py:82: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
  "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
TRAIN (1) | Loss: 5.406 | Acc 0.03 mIoU 0.00 | B 2.85 D 1.39 |: 0%| | 5/5053 [00:14<4:42:06, 3.35s/it]
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [160,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [161,0,0] Assertion `t >= 0 && t < n_classes` failed.
....
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [159,0,0] Assertion `t >= 0 && t < n_classes` failed.
THCudaCheck FAIL file=/pytorch/aten/src/THC/THCCachingHostAllocator.cpp line=296 error=59 : device-side assert triggered
Traceback (most recent call last):
  File "train.py", line 61, in <module>
    main(config, args.resume)
  File "train.py", line 42, in main
    trainer.train()
  File "/home/muyun/muyun/pytorch_segmentation/base/base_trainer.py", line 101, in train
    results = self._train_epoch(epoch)
  File "/home/muyun/muyun/pytorch_segmentation/trainer.py", line 72, in _train_epoch
    self.total_loss.update(loss.item())
RuntimeError: CUDA error: device-side assert triggered
```

Could you help me, please?

dan1elR commented 4 years ago

Hi, I ran UNet on a custom dataset with its own Dataset and DataLoader classes (adapted from VOC) and got the above error too. Is there any solution yet? Thanks for helping!

sophiatmu commented 4 years ago

@Murmeltier105 seems not...

yassouali commented 4 years ago

For ADE, the ignored class ID is -1 instead of 255; this is why there is a `- 1`, so that the unlabeled regions of class 0 become -1. All you need to do is set the ignore index to -1 in the config file and it should work, but keep the `- 1` in the code.
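Concretely, that line in the ADE20K dataloader looks like this (the file name here is just an example):

```python
import numpy as np
from PIL import Image

# ADE20K masks store 0 = unlabeled and 1..150 = the classes.
# Subtracting 1 maps the classes to 0..149 and unlabeled pixels to -1,
# which is then skipped by setting "ignore_index": -1 in the config.
label = np.asarray(Image.open("ADE_train_00000001.png"), dtype=np.int32) - 1
```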

yassouali commented 4 years ago

> Hi, I ran UNet on a custom dataset with its own Dataset and DataLoader classes (adapted from VOC) and got the above error too. Is there any solution yet? Thanks for helping!

Hi, sorry for the late reply. Can you post the code with some info / examples about the form of your data?

dan1elR commented 4 years ago

The dataloader is at https://github.com/Murmeltier105/CRC_segmentation/blob/master/dataloaders/crc.py and is added to `dataloaders/__init__.py` as well. For the config I used https://github.com/Murmeltier105/CRC_segmentation/blob/master/config.json. I have file lists for train, val, and test with entries like `TO5849297(1,00,37500,15500,500,500)`, where everything before the parenthesis is the folder of a tiled image with tile size 500×500. Images (jpg) and labels (png) are in the same folder.

The error output looks like:

```
/home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/home/dr1/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Detected GPUs: 1 Requested: 1

TRAIN (1) | Loss: 1.747 | Acc 0.30 mIoU 0.10 | B 0.31 D 0.06 |: 1%|▏ | 22/3458 [00:06<08:10, 7.00it/s]
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [128,0,0] Assertion `t >= 0 && t < n_classes` failed.
[...]
/pytorch/aten/src/THCUNN/SpatialClassNLLCriterion.cu:103: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [0,0,0], thread: [895,0,0] Assertion `t >= 0 && t < n_classes` failed.
Traceback (most recent call last):
  File "/home/dr1/Dokumente/pytorch_segmentation/train.py", line 61, in <module>
    main(config, args.resume)
  File "/home/dr1/Dokumente/pytorch_segmentation/train.py", line 42, in main
    trainer.train()
  File "/home/dr1/Dokumente/pytorch_segmentation/base/base_trainer.py", line 101, in train
    results = self._train_epoch(epoch)
  File "/home/dr1/Dokumente/pytorch_segmentation/trainer.py", line 72, in _train_epoch
    self.total_loss.update(loss.item())
RuntimeError: CUDA error: device-side assert triggered
```

Do I have to use the `ignore_index` of the loss in the config file? Because up to now there is no class I want to leave out.

Thanks for your help!

(Attached example tiles: TO58492_97_(1,00,21000,15000,500,500)-labels and TO58492_97_(1,00,21000,15500,500,500).)

yassouali commented 4 years ago

Thanks for the code. The problem is that for one of your labels, a class ID is > 5 or < 0 (since you have 6 classes). What you can do is add an assert to find the root of the problem; it will show the offending class ID value, which you can then add as an ignore class.

Add this after `label = np.asarray(Image.open(label_path), dtype=np.int32)` (note it should check `label`, not `img`):

```python
assert label.max() < 6 and label.min() >= 0, \
    f"the ID causing the problem is one of {label.max()}, {label.min()} of image {image_id}"
```
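If you'd rather scan the whole dataset up front, a hypothetical helper along these lines (the function name, directory argument, and `num_classes=6` are assumptions for this example) prints every mask containing out-of-range IDs:

```python
import numpy as np
from PIL import Image
from glob import glob

def check_labels(label_dir, num_classes=6):
    # Report every mask containing a class ID outside [0, num_classes).
    for path in sorted(glob(label_dir + '/*.png')):
        label = np.asarray(Image.open(path), dtype=np.int32)
        bad = np.unique(label[(label < 0) | (label >= num_classes)])
        if bad.size:
            print(f"{path}: unexpected label values {bad.tolist()}")
```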

dan1elR commented 4 years ago

Thank you so much for your help! It works; `label.max()` was 6. Will there always be an excess label, or rather, where does it come from?

yassouali commented 4 years ago

This is specific to your dataset; maybe you have 7 classes and not 6.

Njuod commented 4 years ago

Hi,

Could you please let me know which dataset SegNet was trained on? The issue is that whatever dataset I use to train the model, the mean IoU on the validation dataset becomes 0.

I have also trained SegNet on the SUN RGB-D dataset, which is mentioned in the original SegNet paper, but I got the following result:

```
TRAIN (46) | Loss: 0.752 | Acc 0.76 mIoU 0.40 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.90it/s]
TRAIN (47) | Loss: 0.736 | Acc 0.76 mIoU 0.41 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (48) | Loss: 0.718 | Acc 0.77 mIoU 0.42 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (49) | Loss: 0.695 | Acc 0.78 mIoU 0.44 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (50) | Loss: 0.681 | Acc 0.78 mIoU 0.45 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]

EVALUATION

EVAL (50) | Loss: 9.292, PixelAcc: 0.18, Mean IoU: 0.00 |: 100%|██████████████████████████████| 1263/1263 [04:12<00:00, 5.00it/s]

     ## Info for epoch 50 ##
     val_loss       : 9.29229
     Pixel_Accuracy : 0.184
     Mean_IoU       : 0.004999999888241291
     Class_IoU      : {0: 0.002, 1: 0.187, 2: 0.001, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.005, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0}

Saving a checkpoint: saved/SegNet/05-12_04-36/checkpoint-epoch50.pth ...

TRAIN (51) | Loss: 0.650 | Acc 0.79 mIoU 0.47 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (52) | Loss: 0.633 | Acc 0.79 mIoU 0.48 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (53) | Loss: 0.623 | Acc 0.80 mIoU 0.49 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (54) | Loss: 0.607 | Acc 0.80 mIoU 0.50 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (55) | Loss: 0.591 | Acc 0.81 mIoU 0.51 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]

EVALUATION

EVAL (55) | Loss: 10.134, PixelAcc: 0.19, Mean IoU: 0.00 |: 100%|█████████████████████████████| 1263/1263 [04:12<00:00, 5.00it/s]

     ## Info for epoch 55 ##
     val_loss       : 10.13353
     Pixel_Accuracy : 0.187
     Mean_IoU       : 0.004999999888241291
     Class_IoU      : {0: 0.003, 1: 0.187, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0}

TRAIN (56) | Loss: 0.562 | Acc 0.82 mIoU 0.54 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (57) | Loss: 0.559 | Acc 0.82 mIoU 0.54 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (58) | Loss: 0.546 | Acc 0.82 mIoU 0.55 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (59) | Loss: 0.524 | Acc 0.83 mIoU 0.56 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:38<00:00, 3.91it/s]
TRAIN (60) | Loss: 0.510 | Acc 0.83 mIoU 0.57 | B 0.26 D 0.00 |: 100%|████████████████████████| 1322/1322 [05:42<00:00, 3.86it/s]

EVALUATION

EVAL (60) | Loss: 10.961, PixelAcc: 0.19, Mean IoU: 0.00 |: 100%|█████████████████████████████| 1263/1263 [04:19<00:00, 4.87it/s]

     ## Info for epoch 60 ##
     val_loss       : 10.9608
     Pixel_Accuracy : 0.187
     Mean_IoU       : 0.004999999888241291
     Class_IoU      : {0: 0.004, 1: 0.187, 2: 0.001, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0}

Performance didn't improve for 10 epochs
Training Stoped
```

The config file is the following:

`{ "name": "SegNet", "n_gpu": 1, "use_synch_bn": false,

"arch": {
    "type": "SegNet",
    "args": {
        "backbone": "resnet50",
        "freeze_bn": false,
        "freeze_backbone": false
    }
},

"train_loader": {
    "type": "SUN",
    "args":{
        "data_dir": "./SUN-RGB",
        "batch_size": 4,
        "base_size": 480,
        "crop_size": 300,
        "augment": true,
        "shuffle": false,
        "scale": false,
        "flip": false,
        "rotate": false,
        "blur": false,
        "split": "training",
        "num_workers": 8
    }
},

"val_loader": {
    "type": "SUN",
    "args":{
        "data_dir": "./SUN-RGB",
        "batch_size": 4,
        "crop_size": 480,
        "val": true,
        "split": "validation",
        "num_workers": 4
    }
},`

While the data loader is:

```python
def __init__(self, **kwargs):
    self.num_classes = 37
    self.palette = palette.get_voc_palette(self.num_classes)
    super(SUNDataset, self).__init__(**kwargs)

    x = self.palette
    #print (x)

def _set_files(self):
    if self.split in ["training", "validation"]:
        self.image_dir = os.path.join(self.root, 'images', self.split)
        self.label_dir = os.path.join(self.root, 'annotations', self.split)
        self.files = [os.path.basename(path).split('.')[0] for path in glob(self.image_dir + '/*.jpg')]
    else:
        raise ValueError(f"Invalid split name {self.split}")

def _load_data(self, index):
    image_id = self.files[index]
    image_path = os.path.join(self.image_dir, image_id + '.jpg')
    label_path = os.path.join(self.label_dir, image_id + '.png')
    image = np.asarray(Image.open(image_path).convert('RGB'), dtype=np.float32)
    label = np.asarray(Image.open(label_path), dtype=np.int32)

    return image, label, image_id
```

Your advice is highly appreciated. Thank you!

sarathsrk commented 4 years ago

Did you solve this issue?