The termination_strategy for Scheduler.with_processes() uses psutil to send a SIGTERM and then a SIGKILL to the process. It's possible for the process to exit on its own before it is killed; this race condition causes psutil.Process(pid) to raise NoSuchProcess, which crashes the scheduler unexpectedly. This is highly unlikely, but the psutil.Process construction shown below should be wrapped in a try/except so that we only construct the Process if it still exists.
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/scheduling/termination_strategies.py", line 38, in _terminate_with_psutil
worker_processes = [psutil.Process(p.pid) for p in executor._processes.values()]
Traceback (most recent call last):
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_pslinux.py", line 1653, in wrapper
return fun(self, *args, **kwargs)
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_common.py", line 480, in wrapper
raise raise_from(err, None)
File "<string>", line 3, in raise_from
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_common.py", line 478, in wrapper
return fun(self)
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_pslinux.py", line 1695, in _parse_stat_file
data = bcat("%s/%s/stat" % (self._procfs_path, self.pid))
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_common.py", line 813, in bcat
return cat(fname, fallback=fallback, _open=open_binary)
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_common.py", line 801, in cat
with _open(fname) as f:
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_common.py", line 765, in open_binary
return open(fname, "rb", buffering=FILE_READ_BUFFER_SIZE)
FileNotFoundError: [Errno 2] No such file or directory: '/proc/2888832/stat'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/__init__.py", line 361, in _init
self.create_time()
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/__init__.py", line 719, in create_time
self._create_time = self._proc.create_time()
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_pslinux.py", line 1653, in wrapper
return fun(self, *args, **kwargs)
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_pslinux.py", line 1863, in create_time
ctime = float(self._parse_stat_file()['create_time'])
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/_pslinux.py", line 1660, in wrapper
raise NoSuchProcess(self.pid, self._name)
psutil.NoSuchProcess: process no longer exists (pid=2888832)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/work/dlclarge2/bergmane-pipeline-exps/exps/src/exps/experiments/exp1.py", line 177, in run_it
history = pipeline.optimize(
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/pipeline/node.py", line 1503, in optimize
scheduler.run(
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/scheduling/scheduler.py", line 1411, in run
return asyncio.run(
File "/work/dlclarge2/bergmane-pipeline-exps/exps/.eddie-venv/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/work/dlclarge2/bergmane-pipeline-exps/exps/.eddie-venv/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
return future.result()
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/scheduling/scheduler.py", line 1580, in async_run
stop_reason = await self._run_scheduler(
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/scheduling/scheduler.py", line 1275, in _run_scheduler
self._terminate(self.executor)
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/scheduling/termination_strategies.py", line 38, in _terminate_with_psutil
worker_processes = [psutil.Process(p.pid) for p in executor._processes.values()]
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/scheduling/termination_strategies.py", line 38, in <listcomp>
worker_processes = [psutil.Process(p.pid) for p in executor._processes.values()]
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/__init__.py", line 332, in __init__
self._init(pid)
File "/home/bergmane/.local/lib/python3.10/site-packages/psutil/__init__.py", line 373, in _init
raise NoSuchProcess(pid, msg='process PID not found')
psutil.NoSuchProcess: process PID not found (pid=2888832)
On a related note, this also seems to occur:
Traceback (most recent call last):
File "/work/dlclarge2/bergmane-pipeline-exps/exps/src/exps/experiments/exp1.py", line 177, in run_it
history = pipeline.optimize(
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/pipeline/node.py", line 1503, in optimize
scheduler.run(
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/scheduling/scheduler.py", line 1411, in run
return asyncio.run(
File "/work/dlclarge2/bergmane-pipeline-exps/exps/.eddie-venv/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/work/dlclarge2/bergmane-pipeline-exps/exps/.eddie-venv/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
return future.result()
File "/work/dlclarge2/bergmane-pipeline-exps/exps/vendored/amltk/src/amltk/scheduling/scheduler.py", line 1611, in async_run
raise stop_reason
concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
A process in the process pool was terminated abruptly while the future was running or pending.%
As a follow-up: the broken pool was caused by Slurm killing the process because it exceeded the memory limit. I'm not sure how to improve the error messaging here, but at least it's not specific to the termination strategy.
termination_strategy for Scheduler.with_processes() uses psutil to send a SIGTERM and then a SIGKILL to the process. It's possible the process ends before it's killed, causing a race condition, leading to an unexpected scheduler crash. This is highly unlikely but some care needs to be taken when doing the below to ensure we wrap this in a try: except so that we only construct the Process if it still exists.
On a related note, this also seems to occur: