From 7f39839a8c32190da4881bf6ad02de48446ea6fa Mon Sep 17 00:00:00 2001 From: Tanmoy Sarkar Date: Sat, 13 Jun 2026 11:42:02 +0530 Subject: [PATCH] fix(companion): Back off manager respawn and quiet expected exits The arbiter respawned the companion manager on every main-loop tick once its pid cleared, so a manager that could not boot would busy-spin. It also logged every manager exit as an error, including the deliberate exits from shutdown and reload. Track whether a manager exit was on purpose: stop_companion_manager marks it expected and clears any backoff, so the reaper logs it as info and a reload respawns without delay. An unexpected exit now arms an exponential crash backoff (2^(n-1)s, capped at 30s) that the main loop waits out before respawning, and is logged as an error. Co-Authored-By: Claude Opus 4.8 (1M context) --- gunicorn/arbiter.py | 44 ++++++++++++++++++++++++++++++---- tests/test_arbiter.py | 56 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py index a1aa075d..3f04381f 100644 --- a/gunicorn/arbiter.py +++ b/gunicorn/arbiter.py @@ -36,6 +36,10 @@ class Arbiter: # A flag indicating if an application failed to be loaded APP_LOAD_ERROR = 4 + # Cap on the crash-backoff delay before respawning a companion manager + # that keeps exiting unexpectedly, so a crash loop cannot busy-spin. + COMPANION_MANAGER_MAX_RESPAWN_DELAY = 30 + START_CTX = {} LISTENERS = [] @@ -67,6 +71,14 @@ class Arbiter: self.master_pid = 0 self.master_name = "Master" self.companion_manager_pid = 0 + # True while a manager exit is expected (deliberate stop or reload), so + # the reaper logs it as info instead of an unexpected-crash error. + self._companion_manager_stopping = False + # Crash backoff: earliest monotonic time the main loop may respawn a + # manager that exited unexpectedly, plus the consecutive-crash count + # that sizes the delay. + self._companion_manager_respawn_at = 0 + self._companion_manager_failures = 0 # Configs of the currently running companion manager, cached at spawn so # shutdown can size its wait without re-reading the config file. self._companion_configs = [] @@ -539,8 +551,23 @@ class Arbiter: # The manager itself exited; clear its pid so the main # loop respawns it. It owns its companions' lifecycles. self.companion_manager_pid = 0 - self.log.error( - "Companion manager (pid:%s) exited", wpid) + if self._companion_manager_stopping: + # Expected exit from a deliberate stop or reload. + self._companion_manager_stopping = False + self.log.info( + "Companion manager (pid:%s) exited", wpid) + else: + # Unexpected crash: back off before respawning so a + # manager that cannot boot does not busy-spin. + self._companion_manager_failures += 1 + delay = min( + 2 ** (self._companion_manager_failures - 1), + self.COMPANION_MANAGER_MAX_RESPAWN_DELAY) + self._companion_manager_respawn_at = ( + time.monotonic() + delay) + self.log.error( + "Companion manager (pid:%s) exited unexpectedly; " + "respawning in %ss", wpid, delay) else: # A worker was terminated. If the termination reason was # that it could not boot, we'll shut it down to avoid @@ -674,8 +701,12 @@ class Arbiter: child of the arbiter; per-companion supervision lives entirely inside it, so the arbiter only ensures the one manager process exists. """ - if self.companion_manager_pid == 0 and self.cfg.companion_workers: - self.spawn_companion_manager() + if not (self.companion_manager_pid == 0 and self.cfg.companion_workers): + return + if time.monotonic() < self._companion_manager_respawn_at: + # Still inside the crash-backoff window; wait before respawning. + return + self.spawn_companion_manager() def spawn_companion_manager(self): """Fork the companion manager process. @@ -785,6 +816,11 @@ class Arbiter: """ if self.companion_manager_pid == 0: return + # This exit is on purpose: mark it expected and clear any crash backoff + # so the reaper logs info and a reload can respawn without delay. + self._companion_manager_stopping = True + self._companion_manager_failures = 0 + self._companion_manager_respawn_at = 0 try: os.kill(self.companion_manager_pid, sig) except OSError as e: diff --git a/tests/test_arbiter.py b/tests/test_arbiter.py index e647381f..c7747194 100644 --- a/tests/test_arbiter.py +++ b/tests/test_arbiter.py @@ -5,6 +5,7 @@ import errno import os import signal +import time from unittest import mock import gunicorn.app.base @@ -183,6 +184,61 @@ def test_arbiter_reap_clears_companion_manager_pid(mock_os_waitpid): assert arbiter.companion_manager_pid == 0 +@mock.patch('os.waitpid') +def test_arbiter_reap_unexpected_manager_exit_backs_off(mock_os_waitpid): + # An unexpected manager exit (no deliberate stop) is an error and arms the + # crash backoff so the main loop does not respawn it immediately. + mock_os_waitpid.side_effect = [(4242, 0), (0, 0)] + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.companion_manager_pid = 4242 + arbiter.log = mock.Mock() + arbiter.reap_workers() + assert arbiter.companion_manager_pid == 0 + assert arbiter._companion_manager_failures == 1 + assert arbiter._companion_manager_respawn_at > 0 + arbiter.log.error.assert_called_once() + arbiter.log.info.assert_not_called() + + +@mock.patch('os.waitpid') +def test_arbiter_reap_deliberate_manager_exit_is_info(mock_os_waitpid): + # A deliberate stop (stopping flag set) is an expected exit: logged as info + # with no backoff, so a reload respawns without delay. + mock_os_waitpid.side_effect = [(4242, 0), (0, 0)] + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.companion_manager_pid = 4242 + arbiter._companion_manager_stopping = True + arbiter.log = mock.Mock() + arbiter.reap_workers() + assert arbiter.companion_manager_pid == 0 + assert arbiter._companion_manager_stopping is False + assert arbiter._companion_manager_respawn_at == 0 + arbiter.log.info.assert_called_once() + arbiter.log.error.assert_not_called() + + +def test_arbiter_manage_companion_manager_waits_during_backoff(): + # While inside the crash-backoff window the manager is not respawned. + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.cfg.set("companion_workers", [{"name": "rq", "target": "pkg:run"}]) + arbiter._companion_manager_respawn_at = time.monotonic() + 60 + arbiter.spawn_companion_manager = mock.Mock() + arbiter.manage_companion_manager() + arbiter.spawn_companion_manager.assert_not_called() + + +def test_stop_companion_manager_marks_expected_and_clears_backoff(): + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.companion_manager_pid = 4242 + arbiter._companion_manager_failures = 3 + arbiter._companion_manager_respawn_at = time.monotonic() + 60 + with mock.patch("os.kill"): + arbiter.stop_companion_manager(signal.SIGTERM) + assert arbiter._companion_manager_stopping is True + assert arbiter._companion_manager_failures == 0 + assert arbiter._companion_manager_respawn_at == 0 + + def test_stop_companion_manager_signals_running(): arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) arbiter.companion_manager_pid = 4242