feat(companion): Add parent-death cleanup for manager and companions

Stop orphaned processes from lingering when their parent dies.

set_parent_death_signal arms Linux prctl(PR_SET_PDEATHSIG) so a process is
signalled the moment its parent exits, returning False off Linux so callers
fall back to polling getppid.

The manager records its parent pid, arms a SIGTERM parent-death signal, and
checks getppid each tick: if the arbiter dies, the manager stops its companions
and exits instead of running on under a dead arbiter. Each companion arms the
same parent-death signal and rechecks getppid right after the fork, exiting if
the manager already died before the signal was armed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Tanmoy Sarkar 2026-06-09 22:56:21 +05:30
parent f21d0310be
commit 9c4d81726d
3 changed files with 65 additions and 2 deletions

View File

@ -693,7 +693,7 @@ No per-companion logic in Arbiter.
- [x] Wire Gunicorn reload to manager `reread` or restart.
- [x] Close Gunicorn-only fds in manager child.
- [x] Close manager-only fds in companion child.
- [ ] Add parent-death cleanup.
- [x] Add parent-death cleanup.
- [ ] Add lifecycle logs.
- [ ] Add tests for config validation.
- [ ] Add tests for state transitions.

View File

@ -5,10 +5,12 @@
from __future__ import annotations
import ctypes
import importlib
import os
import select
import signal
import sys
import time
from typing import TYPE_CHECKING, Callable, Iterable, Union
@ -19,6 +21,26 @@ from gunicorn.companion.process import CompanionProcess, State
if TYPE_CHECKING:
from gunicorn.companion.config import CompanionConfig
# prctl option number for "send me this signal when my parent dies".
PR_SET_PDEATHSIG = 1


def set_parent_death_signal(stop_signal) -> bool:
    """Ask the kernel to send ``stop_signal`` when this process's parent dies.

    Uses Linux ``prctl(PR_SET_PDEATHSIG)`` so an orphaned manager or companion
    is signalled the moment its parent goes away, rather than lingering.

    ``prctl`` is first looked up among the symbols already loaded into this
    process (``CDLL(None)``), which does not depend on the C library's file
    name; the glibc soname ``libc.so.6`` is kept as a second candidate. A
    hard-coded ``libc.so.6`` alone fails to load on Linux systems whose C
    library ships under a different name (e.g. musl).

    Args:
        stop_signal: Signal number (or ``signal.Signals`` member) to deliver.

    Returns:
        True when the kernel accepted the request; False on any non-Linux
        platform, when ``prctl`` cannot be located, or when the call itself
        fails, so callers can fall back to polling ``os.getppid()``.
    """
    if not sys.platform.startswith("linux"):
        return False
    signum = int(stop_signal)
    for library in (None, "libc.so.6"):
        try:
            prctl = ctypes.CDLL(library, use_errno=True).prctl
        except (OSError, AttributeError):
            # Not loadable, or no prctl symbol there: try the next candidate.
            continue
        # prctl returns 0 on success and -1 on failure.
        return prctl(PR_SET_PDEATHSIG, signum, 0, 0, 0) == 0
    return False
class CompanionManager:
"""Forks and supervises companion processes.
@ -41,6 +63,7 @@ class CompanionManager:
self.control = None
self.stopping = False
self._wakeup_pipe = None
self.parent_pid = None
def run(self) -> None:
"""Run the manager's supervision loop. This is the forked child body.
@ -51,8 +74,14 @@ class CompanionManager:
companions down and returns. Each tick reaps exited companions,
retries any that are backing off, promotes those past ``startsecs``,
and kills any that overran their stop deadline.
If the arbiter dies, the manager stops too: it arms a parent-death
signal on Linux and, as a portable fallback, watches ``getppid`` each
tick so it never keeps companions running under a dead arbiter.
"""
self.parent_pid = os.getppid()
self._install_signals()
set_parent_death_signal(signal.SIGTERM)
if self.control is not None:
self.control.create()
for process in self.processes.values():
@ -60,6 +89,9 @@ class CompanionManager:
self.log.info("companion manager running (pid %s)", self.pid)
try:
while not self.stopping:
if self._parent_gone():
self.log.info("companion manager parent gone, stopping")
break
self._tick()
self._wait()
self.stop_all()
@ -67,6 +99,10 @@ class CompanionManager:
if self.control is not None:
self.control.close()
def _parent_gone(self) -> bool:
    """Report whether this process's parent has changed.

    Returns True when the live ``os.getppid()`` no longer matches the pid
    stored in ``self.parent_pid``, i.e. the process that forked the manager
    has exited and the manager was reparented.
    """
    current_parent = os.getppid()
    return current_parent != self.parent_pid
def _tick(self, now: float = None) -> None:
"""One supervision pass over every companion."""
now = now or time.time()
@ -283,6 +319,10 @@ class CompanionManager:
try:
self._close_manager_fds()
set_parent_death_signal(signal.SIGTERM)
if os.getppid() != self.pid:
# Manager already died between fork and arming: do not run.
os._exit(0)
self._apply_environment(process.config)
self._redirect_output(process.config)
target = self._resolve_target(process.config.target)

View File

@ -9,7 +9,7 @@ from unittest import mock
import pytest
from gunicorn.companion.control import CommandError
from gunicorn.companion.manager import CompanionManager
from gunicorn.companion.manager import CompanionManager, set_parent_death_signal
from gunicorn.companion.config import CompanionConfig
from gunicorn.companion.process import State
@ -40,6 +40,29 @@ def test_resolve_target_rejects_bad_string():
CompanionManager._resolve_target("no_colon")
def test_set_parent_death_signal_noop_off_linux():
    """Off Linux the helper reports False rather than arming anything."""
    with mock.patch("sys.platform", "darwin"):
        armed = set_parent_death_signal(signal.SIGTERM)
    assert armed is False
def test_set_parent_death_signal_arms_on_linux():
    """On Linux a zero return from ``prctl`` is reported as armed."""
    fake_libc = mock.Mock()
    fake_libc.prctl.return_value = 0
    with mock.patch("sys.platform", "linux"):
        with mock.patch("ctypes.CDLL", return_value=fake_libc):
            armed = set_parent_death_signal(signal.SIGTERM)
    assert armed is True
    fake_libc.prctl.assert_called_once()
def test_parent_gone_detects_reparenting():
    """``_parent_gone`` flips to True once ``getppid`` stops matching."""
    manager = make_manager("rq")
    manager.parent_pid = 4242
    # (pid reported by getppid, expected verdict): same parent, then reparented.
    for reported_ppid, expected in ((4242, False), (1, True)):
        with mock.patch("os.getppid", return_value=reported_ppid):
            assert manager._parent_gone() is expected
def test_close_manager_fds_closes_control_and_pipe():
manager = make_manager("rq")
manager.control = mock.Mock()