nitsaick / kits19-challenge

Kidney Tumor Segmentation Challenge 2019
MIT License

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <U14 #14

Closed · danielbellhv closed this issue 3 years ago

danielbellhv commented 3 years ago

I am new to PyTorch, which comes with a lot of errors I haven't seen before.

The error occurs on trainer.fit(model, dm).

The lines commented # HERE mark where my function is called and where the PyTorch error is raised.

Note: the for loop records the training time over 10 runs so it can be averaged. I want to document training and prediction times across different datasets.

Invocation:

# re-train cell
import time

args = """
      --max_epochs 20
      --progress_bar_refresh_rate 2
      --gradient_clip_val 0.5
      --log_gpu_memory True
      --gpus 1
    """.split()

for i in range(10):
  start_time = time.time()
  run_training(args)  # HERE: the call that triggers the error
  print("--- %s seconds ---" % (time.time() - start_time))

run_training():

import importlib
import os
import shutil
import tarfile

import pytorch_lightning as pl

# parse_args, OntologyTaggerDataModule and OntologyTaggerModel are defined elsewhere in the project
def run_training(input=None):
    args = parse_args(input)
    pl.seed_everything(args.seed)
    module = importlib.import_module('pytorch_lightning.loggers')
    logger = getattr(module, args.logging)(save_dir='logs')
    csv_logger = pl.loggers.CSVLogger(save_dir=f'{args.modeldir}/csv_logs')
    loggers = [logger, csv_logger]
    dm = OntologyTaggerDataModule.from_argparse_args(args)
    if args.model_uri and len(args.checkpointfile) > 1:
        local_model_uri = os.environ.get('SM_CHANNEL_MODEL', '.')
        tar_path = os.path.join(local_model_uri,  'model.tar.gz')
        tar = tarfile.open(tar_path, "r:gz")
        tar.extractall(local_model_uri)
        tar.close()
        model_path = os.path.join(local_model_uri, args.checkpointfile)
        model = OntologyTaggerModel.load_from_checkpoint(model_path)        
    elif os.path.isfile(os.path.join(args.traindir, args.checkpointfile)):
        file_path = os.path.join(args.traindir, args.checkpointfile)
        model = OntologyTaggerModel.load_from_checkpoint(file_path)
    else:    
        model = OntologyTaggerModel(**vars(args), num_classes=dm.num_classes, class_map=dm.class_map)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        args.checkpointdir, save_last=True, save_weights_only=True)

    checkpoint_dir = os.environ.get('SM_HP_CHECKPOINTDIR', './')
    if checkpoint_dir != './':
        labels_file_orig = os.path.join(checkpoint_dir, args.labels)
        labels_file_cp = os.path.join(args.modeldir, os.path.basename(args.labels))
        shutil.copyfile(labels_file_orig, labels_file_cp)

    trainer = pl.Trainer.from_argparse_args(args, callbacks=[checkpoint_callback], logger=loggers)
    trainer.fit(model, dm) # HERE
    model_file = os.path.join(args.modeldir, 'last.ckpt')
    trainer.save_checkpoint(model_file, weights_only=True)

Error Traceback:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-14-847851467cf0> in <module>()
     10 for i in range(10):
     11   start_time = time.time()
---> 12   run_training(args)
     13   print("--- %s seconds ---" % (time.time() - start_time))

<ipython-input-6-7f8e9eed480d> in run_training(input)
     68 
     69     trainer = pl.Trainer.from_argparse_args(args, callbacks=[checkpoint_callback], logger=loggers)
---> 70     trainer.fit(model, dm)
     71     model_file = os.path.join(args.modeldir, 'last.ckpt')
     72     trainer.save_checkpoint(model_file, weights_only=True)

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
    497 
    498         # dispath `start_training` or `start_testing` or `start_predicting`
--> 499         self.dispatch()
    500 
    501         # plugin will finalized fitting (e.g. ddp_spawn will load trained model)

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in dispatch(self)
    544 
    545         else:
--> 546             self.accelerator.start_training(self)
    547 
    548     def train_or_test_or_predict(self):

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
     71 
     72     def start_training(self, trainer):
---> 73         self.training_type_plugin.start_training(trainer)
     74 
     75     def start_testing(self, trainer):

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
    112     def start_training(self, trainer: 'Trainer') -> None:
    113         # double dispatch to initiate the training loop
--> 114         self._results = trainer.run_train()
    115 
    116     def start_testing(self, trainer: 'Trainer') -> None:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in run_train(self)
    605             self.progress_bar_callback.disable()
    606 
--> 607         self.run_sanity_check(self.lightning_module)
    608 
    609         # set stage for logging

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in run_sanity_check(self, ref_model)
    858 
    859             # run eval step
--> 860             _, eval_results = self.run_evaluation(max_batches=self.num_sanity_val_batches)
    861 
    862             self.on_sanity_check_end()

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in run_evaluation(self, max_batches, on_epoch)
    710             dl_max_batches = self.evaluation_loop.max_batches[dataloader_idx]
    711 
--> 712             for batch_idx, batch in enumerate(dataloader):
    713                 if batch is None:
    714                     continue

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    519             if self._sampler_iter is None:
    520                 self._reset()
--> 521             data = self._next_data()
    522             self._num_yielded += 1
    523             if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    559     def _next_data(self):
    560         index = self._next_index()  # may raise StopIteration
--> 561         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    562         if self._pin_memory:
    563             data = _utils.pin_memory.pin_memory(data)

/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     45         else:
     46             data = self.dataset[possibly_batched_index]
---> 47         return self.collate_fn(data)

/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
     82             raise RuntimeError('each element in list of batch should be of equal size')
     83         transposed = zip(*batch)
---> 84         return [default_collate(samples) for samples in transposed]
     85 
     86     raise TypeError(default_collate_err_msg_format.format(elem_type))

/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py in <listcomp>(.0)
     82             raise RuntimeError('each element in list of batch should be of equal size')
     83         transposed = zip(*batch)
---> 84         return [default_collate(samples) for samples in transposed]
     85 
     86     raise TypeError(default_collate_err_msg_format.format(elem_type))

/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
     60             # array of string classes and object
     61             if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
---> 62                 raise TypeError(default_collate_err_msg_format.format(elem.dtype))
     63 
     64             return default_collate([torch.as_tensor(b) for b in batch])

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <U14
ara7 commented 2 years ago

How did you solve this? I am getting the same error

B-Gendron commented 1 year ago

@danielbellhv I get the same error too, and none of the answers I could find on the web helped. Can you explain how you solved it? At the very least, I'd be interested to know what this <U14 means.

danielbellhv commented 1 year ago

@B-Gendron Hey, sorry, I closed this but never solved the error. I've since moved on to other projects, so I wouldn't be able to work out a solution now.

danielbellhv commented 1 year ago

Maybe the error occurs because all elements in a batch need to be the same type: not just a mix of arrays and tensors, but all arrays or all tensors.

B-Gendron commented 1 year ago

The problem is that I get the same error even when all my data has the same type. Here is the code of my dataset class:

import numpy as np
from torch.utils.data import Dataset

class SentenceEmotionDatasetBERT(Dataset):
    def __init__(self, data, args):
        self.args = args
        self.data = data

    def __len__(self):
        # cap small datasets at 1000 samples, otherwise use the full length
        if len(self.data) < 2000:
            return 1000
        else:
            return len(self.data)

    def __getitem__(self, idx):
        # every field is wrapped in a numpy array, including the raw dialog text
        item = {
            "label": np.array(self.data[idx]["label"]),
            "dialog": np.array(self.data[idx]["dialog"]),
            "embedding": np.array(self.data[idx]["embedding"]),
            "encoding": np.array(self.data[idx]["encoding"]),
        }

        return item

Therefore, everything is a numpy array, but I still get the error:

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <U64

Note that the digits after <U change each time I execute the code. It is very strange; I don't know what this means.

danielbellhv commented 1 year ago

I think <U64, <U14, etc. refer to NumPy's Unicode string dtype with a maximum length of 64 or 14 characters, so our data would be of type numpy.str_ (numpy.unicode_), which PyTorch's default_collate() can't handle.
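For anyone landing here, a minimal sketch that reproduces the message (the label strings are made up):

import numpy as np
from torch.utils.data.dataloader import default_collate

labels = np.array(["anger", "sadness"])  # dtype is '<U7': Unicode, max length 7
print(labels.dtype)                      # -> <U7

# raises TypeError: default_collate: batch must contain tensors, numpy arrays,
# numbers, dicts or lists; found <U7
default_collate([labels, labels])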

danielbellhv commented 1 year ago

There's a concept called word embeddings, which converts text into numeric values for a DNN. I don't know your level of proficiency, but I would definitely look into that further, as it relates to this.
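A toy sketch of the idea (the sizes and ids are arbitrary, not from the code above): a tokenizer maps text to integer ids, and an embedding layer maps those ids to dense vectors, so batches only ever contain numbers:

import torch
import torch.nn as nn

embedding = nn.Embedding(num_embeddings=1000, embedding_dim=16)  # vocab of 1000 tokens
token_ids = torch.tensor([[4, 7, 42]])  # ids a tokenizer might produce for one sentence
vectors = embedding(token_ids)          # shape (1, 3, 16): numeric and collate-friendly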

danielbellhv commented 1 year ago

How did you solve this? I am getting the same error

Sorry @ara7 for not seeing your comment. Did you ever resolve this for yourself? I don't normally leave posts on forums unanswered.

B-Gendron commented 1 year ago

Actually, I work in NLP, so I'm well aware of embeddings. The thing is that I wanted to keep my original dialog strings for later use in a qualitative analysis. But you are entirely right: the problem is the string variables. If I drop my dialog key, it works perfectly! Thanks a lot!
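For reference, a sketch of an alternative that keeps the raw strings available without dropping them from the dataset (assuming dialog is the only string-valued key): pull the strings out before collation and reattach them as a plain Python list.

from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate

def collate_keep_strings(batch):
    # remove the string field before default_collate sees it
    dialogs = [item.pop("dialog") for item in batch]
    collated = default_collate(batch)  # numeric fields only
    collated["dialog"] = dialogs       # reattach as a plain list
    return collated

# loader = DataLoader(dataset, batch_size=8, collate_fn=collate_keep_strings)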

danielbellhv commented 1 year ago

Amazing, so quickly dealt with! I was deep in the trenches with NLP back then, but have since moved on to Computer Vision.

The real issue with these PyTorch posts is that the exact same error message can come from almost anything, which makes it so difficult to reproduce.

Glad I could help. Let me know if you have any other issues.