On non-Windows platforms, there appears to be a race condition around asyncio's PID lifetime management that results in asyncio.subprocess.Process.[send_signal | terminate | kill] signaling the wrong process, by calling kill on an already-freed PID. I observed this behavior occurring in the wild and I suspect this race condition was the underlying cause.
A reproducer that consistently hits the race condition on 3.13.0, 3.14.0a1, main:
# N.B.: We apply the monkeypatch before subprocess is imported because subprocess will
# hold strong references to os.waitpid.
from __future__ import annotations
import os
import sys
import textwrap
import traceback
from functools import wraps
orig_waitpid = os.waitpid
orig_kill = os.kill
freed_pids = set[int]()
@wraps(orig_waitpid)
def waitpid(pid: int, options: int, /) -> tuple[int, int]:
print(f"--DBG: start waitpid({pid!r}, {options!r}) @")
print(
textwrap.indent(
"".join(traceback.extract_stack(sys._getframe(1), limit=2).format()),
prefix=" " * (-2 + len("--DBG: ")),
),
end="",
)
try:
res = orig_waitpid(pid, options)
except BaseException as exc:
print(f"--DBG: finish waitpid({pid!r}, {options!r}) -> {exc!r}")
raise
else:
res_pid, status = res
if res_pid != 0:
freed_pids.add(res_pid)
print(f"--DBG: finish waitpid({pid!r}, {options!r}) = {res!r}")
return res
@wraps(orig_kill)
def kill(pid: int, sig: int, /) -> None:
print(f"--DBG: kill({pid}, {sig})")
if pid in freed_pids:
raise ValueError(
"caller is trying to signal an already-freed PID! did a site call waitpid without telling the sites with references to that PID about it?"
)
return orig_kill(pid, sig)
os.waitpid = waitpid
os.kill = kill
assert "subprocess" not in sys.modules
import asyncio
import subprocess
from signal import Signals as Signal
from typing import Literal
from typing import assert_never
async def main() -> None:
_watcher_case: Literal["_PidfdChildWatcher", "_ThreadedChildWatcher"]
if sys.version_info >= (3, 14):
_watcher = asyncio.get_running_loop()._watcher # type: ignore[attr-defined]
if isinstance(_watcher, asyncio.unix_events._PidfdChildWatcher): # type: ignore[attr-defined]
_watcher_case = "_PidfdChildWatcher"
elif isinstance(_watcher, asyncio.unix_events._ThreadedChildWatcher): # type: ignore[attr-defined]
_watcher_case = "_ThreadedChildWatcher"
else:
raise NotImplementedError()
else:
_watcher = asyncio.get_child_watcher()
if isinstance(_watcher, asyncio.PidfdChildWatcher):
_watcher_case = "_PidfdChildWatcher"
elif isinstance(_watcher, asyncio.ThreadedChildWatcher):
_watcher_case = "_ThreadedChildWatcher"
else:
raise NotImplementedError()
print(f"{_watcher_case = !r}")
process = await asyncio.create_subprocess_exec(
"python",
"-c",
"import time; time.sleep(1)",
stdin=subprocess.DEVNULL,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
print(f"{process.pid = !r}")
process.send_signal(Signal.SIGKILL)
# This snippet is contrived, in order to make this snippet hit the race condition
# consistently for reproduction & testing purposes.
if _watcher_case == "_PidfdChildWatcher":
os.waitid(os.P_PID, process.pid, os.WEXITED | os.WNOWAIT)
# Or alternatively, time.sleep(0.1).
# On the next loop cycle asyncio will select on the pidfd and append the reader
# callback:
await asyncio.sleep(0)
# On the next loop cycle the reader callback will run, calling (a) waitpid
# (freeing the PID) and (b) call_soon_threadsafe(transport._process_exited):
await asyncio.sleep(0)
# The _PidfdChildWatcher has now freed the PID but hasn't yet told the
# asyncio.subprocess.Process or the subprocess.Popen about this
# (call_soon_threadsafe).
elif _watcher_case == "_ThreadedChildWatcher":
if (thread := _watcher._threads.get(process.pid)) is not None: # type: ignore[attr-defined]
thread.join()
# Or alternatively, time.sleep(0.1).
# The _ThreadedChildWatcher has now freed the PID but hasn't yet told the
# asyncio.subprocess.Process or the subprocess.Popen about this
# (call_soon_threadsafe).
else:
assert_never(_watcher_case)
# The watcher has now freed the PID but hasn't yet told the
# asyncio.subprocess.Process or the subprocess.Popen that the PID they hold a
# reference to has been freed externally!
#
# I think these two things need to happen atomically.
try:
process.send_signal(Signal.SIGKILL)
except ProcessLookupError:
pass
# Pretend we don't have pidfd support
# if sys.version_info >= (3, 14):
# asyncio.unix_events.can_use_pidfd = lambda: False # type: ignore[attr-defined]
# else:
# asyncio.set_child_watcher(asyncio.ThreadedChildWatcher())
asyncio.run(main())
This bisects to GH-121126. GH-121126 fixed GH-87744 (where asyncio (on _ThreadedChildWatcher only) and Popen raced waitpid calls) by removing the waitpid from Process.send_signal. The problem with this is that now Process.send_signal calls kill on a PID that may have already been freed by asyncio (on either _PidfdChildWatcher or _ThreadedChildWatcher). GH-82811 is analogous; IIUC GH-121126 inadvertently regressed GH-82811 except for processes spawned via asyncio rather than those spawned via subprocess.
Bug report
Bug description:
On non-Windows platforms, there appears to be a race condition around asyncio's PID lifetime management that results in
asyncio.subprocess.Process.[send_signal | terminate | kill]
signaling the wrong process, by callingkill
on an already-freed PID. I observed this behavior occurring in the wild and I suspect this race condition was the underlying cause.A reproducer that consistently hits the race condition on 3.13.0, 3.14.0a1, main:
This bisects to GH-121126. GH-121126 fixed GH-87744 (where asyncio (on
_ThreadedChildWatcher
only) andPopen
racedwaitpid
calls) by removing thewaitpid
fromProcess.send_signal
. The problem with this is that nowProcess.send_signal
callskill
on a PID that may have already been freed by asyncio (on either_PidfdChildWatcher
or_ThreadedChildWatcher
). GH-82811 is analogous; IIUC GH-121126 inadvertently regressed GH-82811 except for processes spawned viaasyncio
rather than those spawned viasubprocess
.CPython versions tested on:
3.13, 3.14, CPython main branch
Operating systems tested on:
Linux
Linked PRs