materialsvirtuallab / matgl

Graph deep learning library for materials
BSD 3-Clause "New" or "Revised" License
232 stars 57 forks source link

[Bug]: Getting ValueError when running "Training a M3GNet Formation Energy Model with PyTorch Lightning.ipynb" without any changes to the code. #245

Closed Sunests closed 3 months ago

Sunests commented 3 months ago

Email (Optional)

kirill.sokolski3007@gmail.com

Version

1.0.0

Which OS(es) are you using?

What happened?

Run file

matgl/examples/Training a M3GNet Formation Energy Model with PyTorch Lightning.ipynb

in the block containing trainer.fit(...). The only change to the code is adding !pip install matgl at the top (using Google Colab).

Code snippet

No response

Log output

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              | Params
--------------------------------------------
0 | model | M3GNet            | 388 K 
1 | mae   | MeanAbsoluteError | 0     
2 | rmse  | MeanSquaredError  | 0     
--------------------------------------------
388 K     Trainable params
0         Non-trainable params
388 K     Total params
1.553     Total estimated model params size (MB)
Sanity Checking: 
 0/? [00:00<?, ?it/s]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-813a67575024> in <cell line: 3>()
      1 logger = CSVLogger("logs", name="M3GNet_training")
      2 trainer = pl.Trainer(max_epochs=20, accelerator="cpu", logger=logger)
----> 3 trainer.fit(model=lit_module, train_dataloaders=train_loader, val_dataloaders=val_loader)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    542         self.state.status = TrainerStatus.RUNNING
    543         self.training = True
--> 544         call._call_and_handle_interrupt(
    545             self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    546         )

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     42         if trainer.strategy.launcher is not None:
     43             return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
---> 44         return trainer_fn(*args, **kwargs)
     45 
     46     except _TunerExitException:

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    578             model_connected=self.lightning_module is not None,
    579         )
--> 580         self._run(model, ckpt_path=ckpt_path)
    581 
    582         assert self.state.stopped

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
    985         # RUN THE TRAINER
    986         # ----------------------------
--> 987         results = self._run_stage()
    988 
    989         # ----------------------------

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _run_stage(self)
   1029         if self.training:
   1030             with isolate_rng():
-> 1031                 self._run_sanity_check()
   1032             with torch.autograd.set_detect_anomaly(self._detect_anomaly):
   1033                 self.fit_loop.run()

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _run_sanity_check(self)
   1058 
   1059             # run eval step
-> 1060             val_loop.run()
   1061 
   1062             call._call_callback_hooks(self, "on_sanity_check_end")

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py in _decorator(self, *args, **kwargs)
    180             context_manager = torch.no_grad
    181         with context_manager():
--> 182             return loop_run(self, *args, **kwargs)
    183 
    184     return _decorator

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/evaluation_loop.py in run(self)
    126                 else:
    127                     dataloader_iter = None
--> 128                     batch, batch_idx, dataloader_idx = next(data_fetcher)
    129                 if previous_dataloader_idx != dataloader_idx:
    130                     # the dataloader has changed, notify the logger connector

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fetchers.py in __next__(self)
    131         elif not self.done:
    132             # this will run only when no pre-fetching was done.
--> 133             batch = super().__next__()
    134         else:
    135             # the iterator is empty

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fetchers.py in __next__(self)
     58         self._start_profiler()
     59         try:
---> 60             batch = next(self.iterator)
     61         except StopIteration:
     62             self.done = True

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/utilities/combined_loader.py in __next__(self)
    339     def __next__(self) -> _ITERATOR_RETURN:
    340         assert self._iterator is not None
--> 341         out = next(self._iterator)
    342         if isinstance(self._iterator, _Sequential):
    343             return out

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/utilities/combined_loader.py in __next__(self)
    140 
    141         try:
--> 142             out = next(self.iterators[0])
    143         except StopIteration:
    144             # try the next iterator

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    629                 # TODO(https://github.com/pytorch/pytorch/issues/76750)
    630                 self._reset()  # type: ignore[call-arg]
--> 631             data = self._next_data()
    632             self._num_yielded += 1
    633             if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
   1344             else:
   1345                 del self._task_info[idx]
-> 1346                 return self._process_data(data)
   1347 
   1348     def _try_put_index(self):

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _process_data(self, data)
   1370         self._try_put_index()
   1371         if isinstance(data, ExceptionWrapper):
-> 1372             data.reraise()
   1373         return data
   1374 

/usr/local/lib/python3.10/dist-packages/torch/_utils.py in reraise(self)
    720             # instantiate since we don't know how to
    721             raise RuntimeError(msg) from None
--> 722         raise exception
    723 
    724 

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/usr/local/lib/python3.10/dist-packages/matgl/graph/data.py", line 28, in collate_fn
    graphs, lattices, line_graphs, state_attr, labels = map(list, zip(*batch))
ValueError: not enough values to unpack (expected 5, got 4)

Code of Conduct

kenko911 commented 3 months ago

Hi, @Sunests, thank you for reporting this issue. We recently modified the MGLDataset and Lightning modules for M3GNet training by setting include_line_graph=True. Please have a look at the Jupyter notebook for M3GNet property model training in the examples directory of the latest version of MatGL.

Sunests commented 3 months ago

Hi, @Sunests, thank you for reporting this issue. We recently modified the MGLDataset and Lightning modules for M3GNet training by setting include_line_graph=True. Please have a look at the Jupyter notebook for M3GNet property model training in the examples directory of the latest version of MatGL.

Thank you!