pyiron stores data in a combination of HDF5 files and an SQL database. The HDF5 files are used to store the data, and the SQL database serves as an index of all HDF5 files distributed over the filesystem. This index consists of a single SQL table, similar to the table returned by the pr.job_table() function.
When you move or remove HDF5 files created by pyiron on the command line, the pyiron SQL database is not aware of these changes, resulting in an inconsistent database.
To identify files which are listed in the database but missing on the file system, and files which are stored on the file system but are not included in the database, the following code snippet can be used. Here the project_path='../..' specifies the directory to compare:
from h5io import read_hdf5
import os
import pandas
import posixpath
from pyfileindex import PyFileIndex
from pyiron_base import Project, state
def filter_function(file_name):
    """Return True when ``file_name`` ends with the ``.h5`` suffix.

    Passed to ``PyFileIndex`` as its ``filter_function`` so that only HDF5
    files are indexed.

    A suffix test is used rather than a substring test: names such as
    ``job.h5.bak`` or ``data.h5py`` contain ``.h5`` but are not HDF5 job
    files, and ``get_file_lists`` derives the job name by stripping the last
    three characters, which is only meaningful for names ending in ``.h5``.

    Args:
        file_name (str): Name of the file to test.

    Returns:
        bool: True if the name ends with ``.h5``, False otherwise.
    """
    return file_name.endswith('.h5')
def split_path(full_path):
    """Split ``full_path`` into a database top-level path and a relative path.

    The top-level path is looked up with
    ``state.database.top_path(full_path=full_path)``.

    Args:
        full_path (str): Path to split.

    Returns:
        tuple: ``(root, rel_path)``. When the lookup returns ``None``,
        ``root`` is ``None`` and ``rel_path`` is ``full_path`` unchanged;
        otherwise ``rel_path`` is ``full_path`` expressed relative to
        ``root`` (computed with ``posixpath.relpath``).
    """
    top_level = state.database.top_path(full_path=full_path)
    if top_level is None:
        # No top-level path known: hand the path back untouched.
        return top_level, full_path
    return top_level, posixpath.relpath(full_path, top_level)
def get_file_name(project_path, project, job_name):
    """Build the path of the HDF5 file belonging to a job.

    Args:
        project_path (str or None): Leading path component; left out of the
            result when it is ``None``.
        project (str): Project directory component.
        job_name (str): Job name; ``".h5"`` is appended to form the file name.

    Returns:
        str: ``project_path`` (if given), ``project`` and ``job_name + ".h5"``
        joined with ``os.path.join``.
    """
    components = [project, job_name + ".h5"]
    if project_path is not None:
        components.insert(0, project_path)
    return os.path.join(*components)
def get_file_lists(project_path):
    """Collect the job files listed in the database and those found on disk.

    Args:
        project_path (str): Directory to index with ``PyFileIndex`` and to
            open as a ``Project`` for the job table.

    Returns:
        tuple: ``(file_db_lst, file_filesystem_lst)`` where

        * ``file_db_lst`` holds one file name per job-table row, built by
          ``get_file_name`` from the ``projectpath``, ``project`` and ``job``
          columns;
        * ``file_filesystem_lst`` holds the indexed non-directory paths for
          which ``"<job_name>/status"`` could be read with ``read_hdf5``.
    """
    file_index = PyFileIndex(path=project_path, filter_function=filter_function)
    files_only = file_index.dataframe[~file_index.dataframe.is_directory]

    file_filesystem_lst = []
    for h5_path in files_only["path"].values:
        # Job name = file name with its last three characters (".h5") removed.
        job_name = os.path.basename(h5_path)[:-3]
        try:
            read_hdf5(fname=h5_path, title= job_name + "/status", slash='ignore')
        except ValueError:
            # read_hdf5 raised ValueError for the status entry: skip the file
            # (presumably it is not a pyiron job file -- TODO confirm).
            continue
        file_filesystem_lst.append(h5_path)

    job_table = Project(project_path).job_table()
    file_db_lst = []
    for db_project_path, db_project, db_job in zip(
        job_table["projectpath"].values,
        job_table["project"].values,
        job_table["job"].values,
    ):
        file_db_lst.append(
            get_file_name(
                project_path=db_project_path,
                project=db_project,
                job_name=db_job,
            )
        )
    return file_db_lst, file_filesystem_lst
def get_differences(file_db_lst, file_filesystem_lst):
    """Compare the database file list with the file-system file list.

    Args:
        file_db_lst (list): File names derived from the database.
        file_filesystem_lst (list): File names found on the file system.

    Returns:
        tuple: Two sets, ``(files_in_db_not_on_filesystem,
        files_on_filesystem_not_in_db)``.
    """
    db_files = set(file_db_lst)
    fs_files = set(file_filesystem_lst)
    return db_files.difference(fs_files), fs_files.difference(db_files)
# Gather the database-side and file-system-side file lists for '../..'.
file_db_lst, file_filesystem_lst = get_file_lists(project_path='../..')
# Evaluates to (files in the database but not on the file system,
# files on the file system but not in the database). As a bare expression its
# value is only shown when run interactively (e.g. as the last line of a
# notebook cell); in a plain script the result is discarded.
get_differences(file_db_lst=file_db_lst, file_filesystem_lst=file_filesystem_lst)
pyiron stores data in a combination of HDF5 files and an SQL database. The HDF5 files are used to store the data, and the SQL database serves as an index of all HDF5 files distributed over the filesystem. This index consists of a single SQL table, similar to the table returned by the
pr.job_table()
function. When you move or remove HDF5 files created by pyiron on the command line, the pyiron SQL database is not aware of these changes, resulting in an inconsistent database.
To identify files which are listed in the database but missing on the file system, and files which are stored on the file system but are not included in the database, the following code snippet can be used. Here the
project_path='../..'
specifies the directory to compare: