Closed skr3178 closed 1 year ago
Maybe check this one and see if that helps: https://www.kaggle.com/code/ratthachat/proteinbert-lightning-multitasks
Thank you for sharing this. It seems to have some dependency issues when running on Kaggle, and I will spend some time resolving them. If there is a quick workaround, such as a settings change, that would be great. I am using the original environment.
Notebook: "ProtBert-BFD-FineTuning-PyTorchLightning-Localization"
First, I got the error "TypeError: __init__() got an unexpected keyword argument 'gradient_checkpointing'".
After setting it to False and then switching it to True:
parser.add_argument(
    "--gradient_checkpointing",
    # default=False,
    default=True,  # skr edited
    type=bool,
    help="Enable or disable gradient checkpointing which use the cpu memory \
        with the gpu memory to store the model.",
)
return parser
On running the code:
trainer.fit(model)
I get a new error: Exception: Label encoder found an unknown label.
Name | Type | Params
0 | metric_acc | Accuracy | 0
1 | ProtBertBFD | BertModel | 419 M
2 | classification_head | Sequential | 40 K
3 | _loss | CrossEntropyLoss | 0
Epoch 0: 1% 108/11179 [00:18<32:24, 5.69it/s, loss=2.271, v_num=4-32, train_loss=2.09] ['Cytoplasm-Nucleus']
Exception Traceback (most recent call last) /tmp/ipykernel_16029/2717931281.py in <module>
2 # 6 START TRAINING
3 # ------------------------
----> 4 trainer.fit(model)
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/pytorch_lightning/trainer/states.py in wrapped_fn(self, *args, **kwargs) 46 if entering is not None: 47 self.state = entering ---> 48 result = fn(self, *args, **kwargs) 49 50 # The INTERRUPTED state can be set inside the run function. To indicate that run was interrupted
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule) 1062 self.accelerator_backend = DataParallelBackend(self) 1063 self.accelerator_backend.setup(model) -> 1064 results = self.accelerator_backend.train() 1065 self.accelerator_backend.teardown() 1066
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/pytorch_lightning/accelerators/dp_backend.py in train(self) 95 def train(self): 96 model = self.trainer.model ---> 97 results = self.trainer.run_pretrain_routine(model) 98 return results 99
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in run_pretrain_routine(self, model) 1237 1238 # CORE TRAINING LOOP -> 1239 self.train() 1240 1241 def _run_sanity_check(self, ref_model, model):
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py in train(self) 392 # RUN TNG EPOCH 393 # ----------------- --> 394 self.run_training_epoch() 395 396 if self.max_steps and self.max_steps <= self.global_step:
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_epoch(self) 477 # run epoch 478 for batch_idx, (batch, is_last_batch) in self.profiler.profile_iterable( --> 479 enumerate(_with_is_last(train_dataloader)), "get_train_batch" 480 ): 481 # stop epoch if we limited the number of training batches
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/pytorch_lightning/profiler/profilers.py in profile_iterable(self, iterable, action_name) 76 try: 77 self.start(action_name) ---> 78 value = next(iterator) 79 self.stop(action_name) 80 yield value
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py in _with_is_last(iterable) 1321 it = iter(iterable) 1322 last = next(it) -> 1323 for val in it: 1324 # yield last and has next 1325 yield last, False
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self) 626 # TODO(https://github.com/pytorch/pytorch/issues/76750) 627 self._reset() # type: ignore[call-arg] --> 628 data = self._next_data() 629 self._num_yielded += 1 630 if self._dataset_kind == _DatasetKind.Iterable and \
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self) 1331 else: 1332 del self._task_info[idx] -> 1333 return self._process_data(data) 1334 1335 def _try_put_index(self):
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _process_data(self, data) 1357 self._try_put_index() 1358 if isinstance(data, ExceptionWrapper): -> 1359 data.reraise() 1360 return data 1361
~/anaconda3/envs/env3.7/lib/python3.7/site-packages/torch/_utils.py in reraise(self) 541 # instantiate since we don't know how to 542 raise RuntimeError(msg) from None --> 543 raise exception 544 545
Exception: Caught Exception in DataLoader worker process 5. Original Traceback (most recent call last): File "/tmp/ipykernel_16029/1817995213.py", line 182, in prepare_sample targets = {"labels": self.label_encoder.batch_encode(sample["label"])} File "/home/skr/anaconda3/envs/env3.7/lib/python3.7/site-packages/torchnlp/encoders/label_encoder.py", line 100, in batch_encode return torch.stack(super().batch_encode(iterator, *args, **kwargs), dim=dim) File "/home/skr/anaconda3/envs/env3.7/lib/python3.7/site-packages/torchnlp/encoders/encoder.py", line 42, in batch_encode return [self.encode(object_, *args, **kwargs) for object_ in iterator] File "/home/skr/anaconda3/envs/env3.7/lib/python3.7/site-packages/torchnlp/encoders/encoder.py", line 42, in <listcomp>
return [self.encode(object_, *args, **kwargs) for object_ in iterator]
File "/home/skr/anaconda3/envs/env3.7/lib/python3.7/site-packages/torchnlp/encoders/label_encoder.py", line 87, in encode
return torch.tensor(self.token_to_index.get(label, self.unknown_index))
RuntimeError: Could not infer dtype of NoneType
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/home/skr/anaconda3/envs/env3.7/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop data = fetcher.fetch(index) File "/home/skr/anaconda3/envs/env3.7/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch return self.collate_fn(data) File "/tmp/ipykernel_16029/1817995213.py", line 186, in prepare_sample raise Exception("Label encoder found an unknown label.") Exception: Label encoder found an unknown label.
Here is my `conda list` output: