When training is interrupted, the model sometimes cannot restore its weights with the `BackupAndRestore` callback.
Traceback (most recent call last):
File "/home/alex/jupyter/lab/model_fba.py", line 150, in <module>
model.fit(train_dataset, callbacks=callbacks, epochs=NUM_EPOCHS, steps_per_epoch=STEPS_PER_EPOCH, verbose=2)
File "/home/alex/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 113, in error_handler
return fn(*args, **kwargs)
File "/home/alex/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 311, in fit
callbacks.on_train_begin()
File "/home/alex/.local/lib/python3.10/site-packages/keras/src/callbacks/callback_list.py", line 218, in on_train_begin
callback.on_train_begin(logs)
File "/home/alex/.local/lib/python3.10/site-packages/keras/src/callbacks/backup_and_restore.py", line 116, in on_train_begin
self.model.load_weights(self._weights_path)
File "/home/alex/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 113, in error_handler
return fn(*args, **kwargs)
File "/home/alex/.local/lib/python3.10/site-packages/keras/src/models/model.py", line 353, in load_weights
saving_api.load_weights(
File "/home/alex/.local/lib/python3.10/site-packages/keras/src/saving/saving_api.py", line 251, in load_weights
saving_lib.load_weights_only(
File "/home/alex/.local/lib/python3.10/site-packages/keras/src/saving/saving_lib.py", line 550, in load_weights_only
weights_store = H5IOStore(filepath, mode="r")
File "/home/alex/.local/lib/python3.10/site-packages/keras/src/saving/saving_lib.py", line 931, in __init__
self.h5_file = h5py.File(root_path, mode=self.mode)
File "/home/alex/.local/lib/python3.10/site-packages/h5py/_hl/files.py", line 561, in __init__
fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
File "/home/alex/.local/lib/python3.10/site-packages/h5py/_hl/files.py", line 235, in make_fid
fid = h5f.open(name, flags, fapl=fapl)
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "h5py/h5f.pyx", line 102, in h5py.h5f.open
OSError: Unable to synchronously open file (bad object header version number)
In summary: when training is interrupted, the model sometimes cannot restore its weights with the `BackupAndRestore` callback, and the next `model.fit` call fails with the `OSError` shown above.