Open calvinp0 opened 1 year ago
When the error occurs, the following information is applicable:
spc.label = 'r_94_[CH2]CCC'
job_name = conformer3
self.running_jobs[spc.label] = ['conformer3', 'conformer4', 'conformer5', 'conformer6', 'conformer7', 'conformer8', 'conformer0', 'conformer1', 'conformer2', 'conformer0']
self.job_dict[spc.label]['conformers'] = {'conformers':{0: <arc.job.adapters...>}}
As you can see, there is only one conformer in the job dict whilst there are multiple in the running jobs.
This is occuring during the updating of the restart_dict and saving of the restart.yml file
To add to this, all the conformer folders do exist, and conformer0 is the only one incomplete, as it is still running. So it appears there is a logic issue. Possibly when the script errors, all conformers were running, but since the restart, the conformer jobs have completed (except for conformer0) and thus, the error?
My current understanding:
In the scheduler.py
, there is a code block (line - 1743)
if self.species_dict[label].initial_xyz is None and self.species_dict[label].final_xyz is None \
and not self.testing:
if len(self.species_dict[label].conformers) > 1:
self.job_dict[label]['conformers'] = dict()
for i, xyz in enumerate(self.species_dict[label].conformers):
self.run_job(label=label,
xyz=xyz,
level_of_theory=self.conformer_level,
job_type='conformers',
conformer=i,
)
As we can see,if the 'conformers' value in the species_dict[label] is larger than one, the self.job_dict[label]['conformers'] is reset. Then, it will enumerate through the species_dict[label].conformers, getting each conformer at one of time.
However, during the enumeration loop - it will run the function self.run_job
- which leads to this function:
def run_job(self,
job_type: str,
conformer: Optional[int] = None,
cpu_cores: Optional[int] = None,
dihedral_increment: Optional[float] = None,
dihedrals: Optional[list] = None,
directed_scan_type: Optional[str] = None,
ess_trsh_methods: Optional[list] = None,
fine: Optional[bool] = False,
irc_direction: Optional[str] = None,
job_adapter: Optional[str] = None,
label: Optional[str] = None,
level_of_theory: Optional[Union[Level, dict, str]] = None,
memory: Optional[int] = None,
max_job_time: Optional[int] = None,
rotor_index: Optional[int] = None,
reactions: Optional[List['ARCReaction']] = None,
scan_trsh: Optional[str] = '',
shift: Optional[str] = '',
trsh: Optional[str] = '',
torsions: Optional[List[List[int]]] = None,
times_rerun: int = 0,
tsg: Optional[int] = None,
xyz: Optional[dict] = None,
):
"""
A helper function for running (all) jobs.
Args:
job_type (str): The type of job to run.
conformer (int, optional): Conformer number if optimizing conformers.
cpu_cores (int, optional): The total number of cpu cores requested for a job.
dihedral_increment (float, optional): The degrees increment to use when scanning dihedrals of TS guesses.
dihedrals (list, optional): The dihedral angles of a directed scan job corresponding to ``torsions``.
directed_scan_type (str, optional): The type of the directed scan.
ess_trsh_methods (list, optional): A list of troubleshooting methods already tried out for ESS convergence.
fine (bool, optional): Whether to run an optimization job with a fine grid. `True` to use fine.
irc_direction (str, optional): The direction to run the IRC computation.
job_adapter (str, optional): An ESS software to use.
label (str, optional): The species label.
level_of_theory (Level, optional): The level of theory to use.
memory (int, optional): The total job allocated memory in GB.
max_job_time (int, optional): The maximal allowed job time on the server in hours.
rotor_index (int, optional): The 0-indexed rotor number (key) in the species.rotors_dict dictionary.
reactions (List[ARCReaction], optional): Entries are ARCReaction instances, used for TS search methods.
scan_trsh (str, optional): A troubleshooting method for rotor scans.
shift (str, optional): A string representation alpha- and beta-spin orbitals shifts (molpro only).
times_rerun (int, optional): Number of times this job was re-run with the same arguments (no trsh methods).
torsions (List[List[int]], optional): The 0-indexed atom indices of the torsion(s).
trsh (str, optional): A troubleshooting keyword to be used in input files.
tsg (int, optional): TSGuess number if optimizing TS guesses.
xyz (dict, optional): The 3D coordinates for the species.
"""
max_job_time = max_job_time or self.max_job_time # if it's None, set to default
ess_trsh_methods = ess_trsh_methods if ess_trsh_methods is not None else list()
species = self.species_dict[label] if label is not None else None
memory = memory if memory is not None else self.memory
checkfile = self.species_dict[label].checkfile if label is not None else None
if torsions is None and rotor_index is not None:
torsions = species.rotors_dict[rotor_index]['torsion']
torsions = [torsions] if not isinstance(torsions[0], list) else torsions
if self.adaptive_levels is not None and label is not None:
level_of_theory = self.determine_adaptive_level(original_level_of_theory=level_of_theory, job_type=job_type,
heavy_atoms=self.species_dict[label].number_of_heavy_atoms)
job_adapter = job_adapter.lower() if job_adapter is not None else \
self.deduce_job_adapter(level=Level(repr=level_of_theory), job_type=job_type)
args = {'keyword': {}, 'block': {}}
if trsh:
args['trsh'] = {'trsh': trsh}
if shift:
args['shift'] = shift
if scan_trsh:
args['keyword']['scan_trsh'] = scan_trsh
if isinstance(level_of_theory, Level) and level_of_theory.args is not None:
args.update(level_of_theory.args)
job = job_factory(job_adapter=job_adapter,
project=self.project,
project_directory=self.project_directory,
job_type=job_type,
level=Level(repr=level_of_theory) if level_of_theory is not None else None,
args=args,
bath_gas=self.bath_gas,
checkfile=checkfile,
conformer=conformer,
constraints=None,
cpu_cores=cpu_cores,
dihedral_increment=dihedral_increment,
dihedrals=dihedrals,
directed_scan_type=directed_scan_type,
ess_settings=self.ess_settings,
ess_trsh_methods=ess_trsh_methods,
execution_type='incore' if job_adapter in default_incore_adapters else 'queue',
fine=fine,
irc_direction=irc_direction,
job_memory_gb=memory,
max_job_time=max_job_time,
reactions=[reactions] if reactions is not None and not isinstance(reactions, list) else reactions,
rotor_index=rotor_index,
server_nodes=None,
species=[species] if species is not None and not isinstance(species, list) else species,
times_rerun=times_rerun,
torsions=torsions,
tsg=tsg,
xyz=xyz,
)
label = label or reactions[0].ts_species.label
if label not in self.job_dict.keys():
self.job_dict[label] = dict()
if conformer is None and tsg is None:
# this is NOT a conformer DFT job nor a TS guess job
self.running_jobs[label] = list() if label not in self.running_jobs else self.running_jobs[label]
self.running_jobs[label].append(job.job_name) # mark as a running job
if job_type not in self.job_dict[label].keys():
# Jobs of this type haven't been spawned for label
self.job_dict[label][job_type] = dict()
self.job_dict[label][job_type][job.job_name] = job
elif conformer is not None:
# Running a conformer DFT job. Append differently to job_dict.
self.running_jobs[label] = list() if label not in self.running_jobs else self.running_jobs[label]
self.running_jobs[label].append(f'conformer{conformer}') # mark as a running job
if 'conformers' not in self.job_dict[label]:
self.job_dict[label]['conformers'] = dict()
self.job_dict[label]['conformers'][conformer] = job # save job object
elif tsg is not None:
# Running a TS guess job. Append differently to job_dict.
self.running_jobs[label] = list() if label not in self.running_jobs else self.running_jobs[label]
self.running_jobs[label].append(f'tsg{tsg}') # mark as a running job
if 'tsg' not in self.job_dict[label]:
self.job_dict[label]['tsg'] = dict()
self.job_dict[label]['tsg'][tsg] = job # save job object
if job.server is not None and job.server not in self.servers:
self.servers.append(job.server)
job.execute()
self.save_restart_dict()
Now, in this function, it will append the first iteration of the conformer
to self.job_dict[label]['conformers'] as a job
. Then it will do self.save_restart_dict
def save_restart_dict(self):
"""
Update the restart_dict and save the restart.yml file.
"""
if self.save_restart and self.restart_dict is not None:
logger.debug('Creating a restart file...')
self.restart_dict['output'] = self.output
self.restart_dict['species'] = [spc.as_dict() for spc in self.species_dict.values()]
self.restart_dict['running_jobs'] = dict()
for spc in self.species_dict.values():
if spc.label in self.running_jobs:
self.restart_dict['running_jobs'][spc.label] = \
[self.job_dict[spc.label][job_name.rsplit('_', 1)[0]][job_name].as_dict()
for job_name in self.running_jobs[spc.label]
if 'conformer' not in job_name and 'tsg' not in job_name] \
+ [self.job_dict[spc.label]['conformers'][get_i_from_job_name(job_name)].as_dict()
for job_name in self.running_jobs[spc.label] if 'conformer' in job_name] \
+ [self.job_dict[spc.label]['tsg'][get_i_from_job_name(job_name)].as_dict()
for job_name in self.running_jobs[spc.label] if 'tsg' in job_name]
logger.debug(f'Dumping restart dictionary:\n{self.restart_dict}')
save_yaml_file(path=self.restart_path, content=self.restart_dict)
And the issue occurs here. It will attempt to run through the all the conformers from the self.running_jobs
in the self.job_dict
. And since self.job_dict
has been reset and only has the first conformer iterations appended, there will be a KeyError
when a conformer
from self.running_jobs
is used as a key
Describe the bug After the error received in #631, I attempted an
arcrestart
and get the following errorAttached is the restart file.
restart.zip