python / cpython

The Python programming language
https://www.python.org
Other
63.71k stars 30.53k forks source link

`asyncio.subprocess.Process` race condition kills an unrelated process on Unix #127049

Open gschaffner opened 1 week ago

gschaffner commented 1 week ago

Bug report

Bug description:

On non-Windows platforms, there appears to be a race condition around asyncio's PID lifetime management that results in asyncio.subprocess.Process.[send_signal | terminate | kill] signaling the wrong process, by calling kill on an already-freed PID. I observed this behavior occurring in the wild and I suspect this race condition was the underlying cause.

A reproducer that consistently hits the race condition on 3.13.0, 3.14.0a1, main:

# N.B.: We apply the monkeypatch before subprocess is imported because subprocess will
# hold strong references to os.waitpid.
from __future__ import annotations

import os
import sys
import textwrap
import traceback
from functools import wraps

orig_waitpid = os.waitpid
orig_kill = os.kill
freed_pids = set[int]()

@wraps(orig_waitpid)
def waitpid(pid: int, options: int, /) -> tuple[int, int]:
    print(f"--DBG: start  waitpid({pid!r}, {options!r}) @")
    print(
        textwrap.indent(
            "".join(traceback.extract_stack(sys._getframe(1), limit=2).format()),
            prefix=" " * (-2 + len("--DBG: ")),
        ),
        end="",
    )
    try:
        res = orig_waitpid(pid, options)
    except BaseException as exc:
        print(f"--DBG: finish waitpid({pid!r}, {options!r}) -> {exc!r}")
        raise
    else:
        res_pid, status = res
        if res_pid != 0:
            freed_pids.add(res_pid)
        print(f"--DBG: finish waitpid({pid!r}, {options!r}) = {res!r}")
        return res

@wraps(orig_kill)
def kill(pid: int, sig: int, /) -> None:
    print(f"--DBG: kill({pid}, {sig})")
    if pid in freed_pids:
        raise ValueError(
            "caller is trying to signal an already-freed PID! did a site call waitpid without telling the sites with references to that PID about it?"
        )
    return orig_kill(pid, sig)

os.waitpid = waitpid
os.kill = kill

assert "subprocess" not in sys.modules

import asyncio
import subprocess
from signal import Signals as Signal
from typing import Literal
from typing import assert_never

async def main() -> None:
    _watcher_case: Literal["_PidfdChildWatcher", "_ThreadedChildWatcher"]
    if sys.version_info >= (3, 14):
        _watcher = asyncio.get_running_loop()._watcher  # type: ignore[attr-defined]
        if isinstance(_watcher, asyncio.unix_events._PidfdChildWatcher):  # type: ignore[attr-defined]
            _watcher_case = "_PidfdChildWatcher"
        elif isinstance(_watcher, asyncio.unix_events._ThreadedChildWatcher):  # type: ignore[attr-defined]
            _watcher_case = "_ThreadedChildWatcher"
        else:
            raise NotImplementedError()
    else:
        _watcher = asyncio.get_child_watcher()
        if isinstance(_watcher, asyncio.PidfdChildWatcher):
            _watcher_case = "_PidfdChildWatcher"
        elif isinstance(_watcher, asyncio.ThreadedChildWatcher):
            _watcher_case = "_ThreadedChildWatcher"
        else:
            raise NotImplementedError()
    print(f"{_watcher_case = !r}")

    process = await asyncio.create_subprocess_exec(
        "python",
        "-c",
        "import time; time.sleep(1)",
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    print(f"{process.pid = !r}")

    process.send_signal(Signal.SIGKILL)

    # This snippet is contrived, in order to make this snippet hit the race condition
    # consistently for reproduction & testing purposes.
    if _watcher_case == "_PidfdChildWatcher":
        os.waitid(os.P_PID, process.pid, os.WEXITED | os.WNOWAIT)
        # Or alternatively, time.sleep(0.1).

        # On the next loop cycle asyncio will select on the pidfd and append the reader
        # callback:
        await asyncio.sleep(0)
        # On the next loop cycle the reader callback will run, calling (a) waitpid
        # (freeing the PID) and (b) call_soon_threadsafe(transport._process_exited):
        await asyncio.sleep(0)

        # The _PidfdChildWatcher has now freed the PID but hasn't yet told the
        # asyncio.subprocess.Process or the subprocess.Popen about this
        # (call_soon_threadsafe).
    elif _watcher_case == "_ThreadedChildWatcher":
        if (thread := _watcher._threads.get(process.pid)) is not None:  # type: ignore[attr-defined]
            thread.join()
        # Or alternatively, time.sleep(0.1).

        # The _ThreadedChildWatcher has now freed the PID but hasn't yet told the
        # asyncio.subprocess.Process or the subprocess.Popen about this
        # (call_soon_threadsafe).
    else:
        assert_never(_watcher_case)

    # The watcher has now freed the PID but hasn't yet told the
    # asyncio.subprocess.Process or the subprocess.Popen that the PID they hold a
    # reference to has been freed externally!
    #
    # I think these two things need to happen atomically.

    try:
        process.send_signal(Signal.SIGKILL)
    except ProcessLookupError:
        pass

# Pretend we don't have pidfd support
# if sys.version_info >= (3, 14):
#     asyncio.unix_events.can_use_pidfd = lambda: False  # type: ignore[attr-defined]
# else:
#     asyncio.set_child_watcher(asyncio.ThreadedChildWatcher())

asyncio.run(main())

This bisects to GH-121126. GH-121126 fixed GH-87744 (where asyncio (on _ThreadedChildWatcher only) and Popen raced waitpid calls) by removing the waitpid from Process.send_signal. The problem with this is that now Process.send_signal calls kill on a PID that may have already been freed by asyncio (on either _PidfdChildWatcher or _ThreadedChildWatcher). GH-82811 is analogous; IIUC GH-121126 inadvertently regressed GH-82811 except for processes spawned via asyncio rather than those spawned via subprocess.

CPython versions tested on:

3.13, 3.14, CPython main branch

Operating systems tested on:

Linux

Linked PRs

asvetlov commented 1 week ago

Thanks for the report! I can reproduce the error locally on my linux box