bcmi / SLBR-Visible-Watermark-Removal

[ACM MM 2021] Visible Watermark Removal via Self-calibrated Localization and Background Refinement
222 stars 35 forks source link

Error when training with just 10 examples #32

Closed ChristianMeli closed 1 year ago

ChristianMeli commented 1 year ago

Hello! Thank you for your work. It's a great contribution.

I’m training the model based on the code in train.py. I’m doing it on a small subset of the CLWD dataset because I’m trying to overfit the model (it’s just a consistency check: I want to see that the model learns).

If I use a small sample of 10 examples (so 10 images in each of the dedicated folders: Watermarked_images, Watermark_free_images, etc.) and run the training loop, I get an error when batching occurs. It’s an OpenCV error that seems to be related to the train_loader generator.

If I use 200 examples (as I'm using in #31) , I don’t get the problem.

CODE

# Option values for the run, keyed by option name. The *_DIR and CROP_SIZE
# names are expected to be defined before this statement executes.
ARGS = {
    "checkpoint": CHECKPOINT_DIR,
    "crop_size": CROP_SIZE,
    "dataset": 'clwd',
    "dataset_dir": DATASET_DIR,
    "debug": False,
    "epochs": 21,
    "evaluate": False,
    "freq": -1,
    "lr": 0.01,
    "schedule": [100, 200, 300],
    "gamma": 0.1,
    "sigma_decay": 0,
    "resume": RESUME_DIR,
    "start_epoch": 0,
    "nets": 'slbr',
    "test_dir": TEST_DIR,
    "train_batch": 2,
    "dlr": 0.001,
    "data": '',
    "data_augumentation": False,
    "finetune": '',
    "flip": False,
    "alpha": 0.5,
    "beta1": 0.9,
    "beta2": 0.999,
    "bg_mode": 'res_mask',
    "gan_norm": False,
    "gpu": True,
    "gpu_id": '0',
    "hl": False,
    "input_size": 256,
    "k_center": 2,
    "k_refine": 3,
    "k_skip_stage": 3,
    "lambda_content": 0,
    "lambda_iou": 0,
    "lambda_l1": 4,
    "lambda_mask": 1,
    "lambda_primary": 0.01,
    "lambda_style": 0,
    "loss_type": 'l2',
    "mask_mode": 'res',
    "masked": False,
    "momentum": 0,
    "name": 'slbr_v1',
    "no_flip": True,
    "normalized_input": False,
    "preprocess": 'resize',
    "project_mode": 'simple',
    "requires_grad": False,
    "res": False,
    "sim_metric": 'cos',
    "sltype": 'vggx',
    "test_batch": 1,
    "use_refine": True,
    "weight_decay": 0,
    "workers": 1,
}

class DictArgs(object):
    """Expose the entries of a dict as attributes of an instance."""

    def __init__(self, d):
        """Copy every key/value pair of ``d`` onto the instance.

        Args:
            d: Mapping whose keys are used as attribute names and whose
                values become the attribute values.
        """
        # The constructor must be named ``__init__``; a plain ``init`` is
        # never called by ``DictArgs(d)``, which would then raise TypeError.
        for k, v in d.items():
            setattr(self, k, v)

ARGS = DictArgs(ARGS)

In `media/10CLWD-train-equal-test` I have a copy of the CLWD folder structure with just 10 examples, and with the test and train folders being identical. I'm not interested in testing performance on unseen data at this point; the test folder is there just so the code in the repository doesn't break.

- TRAINING LOOP

from __future__ import print_function, absolute_import

import argparse
import torch,time,os

torch.backends.cudnn.benchmark = True

from src.utils.misc import save_checkpoint, adjust_learning_rate
import src.models as models

import datasets as datasets
from options import Options
import numpy as np

def train(args):
    """Build the data loaders and the SLBR model, then run the training loop.

    Seeds NumPy and PyTorch with a fixed seed, creates the train and
    validation loaders for the selected dataset, instantiates the model
    registered under ``"SLBR"`` and trains it for the configured epochs.
    On selected epochs the model is validated, flushed and checkpointed.

    Args:
        args: Options object read by attribute (``dataset``, ``train_batch``,
            ``test_batch``, ``workers``, ``lr``, ...). ``args.seed`` and
            ``args.dataset`` are overwritten in place.

    Returns:
        The model object after the last epoch.

    Raises:
        ValueError: If ``args.dataset`` is neither ``'clwd'`` nor ``'lvw'``.
    """
    args.seed = 1
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    MODEL_NAME = "SLBR"

    args.dataset = args.dataset.lower()
    if args.dataset == 'clwd':
        dataset_func = datasets.CLWDDataset
    elif args.dataset == 'lvw':
        dataset_func = datasets.LVWDataset
    else:
        raise ValueError("Not known dataset:\t{}".format(args.dataset))

    train_loader = torch.utils.data.DataLoader(
        dataset_func('train', args), batch_size=args.train_batch, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    val_loader = torch.utils.data.DataLoader(
        dataset_func('val', args), batch_size=args.test_batch, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    lr = args.lr
    data_loaders = (train_loader, val_loader)

    # Disabled variant that picked the model class from the options:
    # model = models.__dict__[args.models](datasets=data_loaders, args=args)

    # Use the ``args`` parameter rather than a module-level global so the
    # function behaves the same for whatever options object it is given.
    model = models.__dict__[MODEL_NAME](datasets=data_loaders, args=args)
    print('============================ Initization Finish && Training Start =============================================')
    print(f"It will start in epoch: {model.args.start_epoch}\nIt will end in epoch: {model.args.epochs}")

    # Epoch indices (0-based) after which to validate and write a checkpoint.
    save_epochs = {1,10,20,50,100,200,300,400}

    for epoch in range(model.args.start_epoch, model.args.epochs):
        lr = adjust_learning_rate(data_loaders, model, epoch, lr, args)

        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        model.record('lr',lr, epoch)
        model.train(epoch)

        # save model
        if epoch in save_epochs:
            model.validate(epoch)
            model.flush()

            print(f"Saving checkpoint of epoch {epoch}")
            model.save_checkpoint(filename=f"checkpoint{epoch}.pth.tar")

        # model.validate(epoch)

    # Disabled: validation / flush / checkpoint gated on ``args.freq``.
    # if args.freq < 0:
    #     model.validate(epoch)
    #     model.flush()
    #     model.save_checkpoint()

    return model

final_model = train(ARGS)

I get the following output with the error as soon as training starts

==> creating model ==> creating model [Finish] ==> Total params: 21.39M ==> Total devices: 1 ==> Current Checkpoint: checkpoint/slbr_v1 ============================ Initization Finish && Training Start ============================================= It will start in epoch: 0 It will end in epoch: 21

Epoch: 1 | LR: 0.01000000

error Traceback (most recent call last) /tmp/ipykernel_32332/965729503.py in ----> 1 final_model = train(ARGS)

/tmp/ipykernel_32332/1760790972.py in train(args) 46 47 model.record('lr',lr, epoch) ---> 48 model.train(epoch) 49 50

/alloc/data/fury_watermark-remover-fda/discovery/SLBR-Visible-Watermark-Removal-master/src/models/SLBR.py in train(self, epoch) 104 end = time.time() 105 bar = Bar('Processing {} '.format(self.args.nets), max=len(self.train_loader)) --> 106 for i, batches in enumerate(self.train_loader): 107 current_index = len(self.train_loader) * epoch + i 108

/usr/local/lib/python3.7/site-packages/torch/utils/data/dataloader.py in next(self) 433 if self._sampler_iter is None: 434 self._reset() --> 435 data = self._next_data() 436 self._num_yielded += 1 437 if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self) 1083 else: 1084 del self._task_info[idx] -> 1085 return self._process_data(data) 1086 1087 def _try_put_index(self):

/usr/local/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _process_data(self, data) 1109 self._try_put_index() 1110 if isinstance(data, ExceptionWrapper): -> 1111 data.reraise() 1112 return data 1113

/usr/local/lib/python3.7/site-packages/torch/_utils.py in reraise(self) 426 # have message field 427 raise self.exc_type(message=msg) --> 428 raise self.exc_type(msg) 429 430

error: Caught error in DataLoader worker process 0. Original Traceback (most recent call last): File "/usr/local/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop data = fetcher.fetch(index) File "/usr/local/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch data = [self.dataset[idx] for idx in possibly_batched_index] File "/usr/local/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in data = [self.dataset[idx] for idx in possibly_batched_index] File "/alloc/data/fury_watermark-remover-fda/discovery/SLBR-Visible-Watermark-Removal-master/datasets/clwd_dataset.py", line 82, in getitem sample = self.get_sample(index) File "/alloc/data/fury_watermark-remover-fda/discovery/SLBR-Visible-Watermark-Removal-master/datasets/clwd_dataset.py", line 63, in get_sample img_J = cv2.cvtColor(img_J, cv2.COLOR_BGR2RGB) cv2.error: OpenCV(4.2.0) /io/opencv/modules/imgproc/src/color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


I've narrowed it down to a problem in the batching, as the stack trace suggests. I tried a minimal example that just initialises the `train_loader` and iterates over the batches. The same thing happens. I print the paths being processed.

args = ARGS
train_loader = torch.utils.data.DataLoader(
    datasets.CLWDDataset("train", args),
    batch_size=args.train_batch,
    shuffle=True,
    num_workers=args.workers,
    pin_memory=True,
)

# Print the image paths of every batch so the files read before the failure
# can be identified.
for i, batch in enumerate(train_loader):
    print(i,batch["img_path"],"\n")

And get this output with the error mentioned

0 ['media/10CLWD-train-equal-test/train/Watermarked_image/8.jpg', 'media/10CLWD-train-equal-test/train/Watermarked_image/10.jpg']

1 ['media/10CLWD-train-equal-test/train/Watermarked_image/5.jpg', 'media/10CLWD-train-equal-test/train/Watermarked_image/3.jpg']

2 ['media/10CLWD-train-equal-test/train/Watermarked_image/1.jpg', 'media/10CLWD-train-equal-test/train/Watermarked_image/6.jpg']

3 ['media/10CLWD-train-equal-test/train/Watermarked_image/4.jpg', 'media/10CLWD-train-equal-test/train/Watermarked_image/2.jpg']


error Traceback (most recent call last) /tmp/ipykernel_32332/1400065821.py in 3 num_workers=args.workers, pin_memory=True) 4 ----> 5 for i, batch in enumerate(train_loader): 6 print(i,batch["img_path"],"\n")

/usr/local/lib/python3.7/site-packages/torch/utils/data/dataloader.py in next(self) 433 if self._sampler_iter is None: 434 self._reset() --> 435 data = self._next_data() 436 self._num_yielded += 1 437 if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self) 1083 else: 1084 del self._task_info[idx] -> 1085 return self._process_data(data) 1086 1087 def _try_put_index(self):

/usr/local/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _process_data(self, data) 1109 self._try_put_index() 1110 if isinstance(data, ExceptionWrapper): -> 1111 data.reraise() 1112 return data 1113

/usr/local/lib/python3.7/site-packages/torch/_utils.py in reraise(self) 426 # have message field 427 raise self.exc_type(message=msg) --> 428 raise self.exc_type(msg) 429 430

error: Caught error in DataLoader worker process 0. Original Traceback (most recent call last): File "/usr/local/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop data = fetcher.fetch(index) File "/usr/local/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch data = [self.dataset[idx] for idx in possibly_batched_index] File "/usr/local/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in data = [self.dataset[idx] for idx in possibly_batched_index] File "/alloc/data/fury_watermark-remover-fda/discovery/SLBR-Visible-Watermark-Removal-master/datasets/clwd_dataset.py", line 82, in getitem sample = self.get_sample(index) File "/alloc/data/fury_watermark-remover-fda/discovery/SLBR-Visible-Watermark-Removal-master/datasets/clwd_dataset.py", line 63, in get_sample img_J = cv2.cvtColor(img_J, cv2.COLOR_BGR2RGB) cv2.error: OpenCV(4.2.0) /io/opencv/modules/imgproc/src/color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


Some images do get processed, but it fails on the last batch. The images missing from this example (`7.jpg` and `9.jpg`) are in the folder; I double-checked. Have you run into anything similar?

Thanks for the support
jimleungjing commented 1 year ago

It seems that the input img_J is empty. Just print the logs around the error and you will see it.