fix(companion): Back off manager respawn and quiet expected exits

The arbiter respawned the companion manager on every main-loop tick once
its pid cleared, so a manager that could not boot would busy-spin. It also
logged every manager exit as an error, including the deliberate exits from
shutdown and reload.

Track whether a manager exit was on purpose: stop_companion_manager marks
it expected and clears any backoff, so the reaper logs it as info and a
reload respawns without delay. An unexpected exit now arms an exponential
crash backoff (2^(n-1)s, capped at 30s) that the main loop waits out before
respawning, and is logged as an error.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Tanmoy Sarkar 2026-06-13 11:42:02 +05:30
parent 10a41a1cc5
commit 7f39839a8c
2 changed files with 96 additions and 4 deletions

View File

@ -36,6 +36,10 @@ class Arbiter:
# A flag indicating if an application failed to be loaded
APP_LOAD_ERROR = 4
# Cap on the crash-backoff delay before respawning a companion manager
# that keeps exiting unexpectedly, so a crash loop cannot busy-spin.
COMPANION_MANAGER_MAX_RESPAWN_DELAY = 30
START_CTX = {}
LISTENERS = []
@ -67,6 +71,14 @@ class Arbiter:
self.master_pid = 0
self.master_name = "Master"
self.companion_manager_pid = 0
# True while a manager exit is expected (deliberate stop or reload), so
# the reaper logs it as info instead of an unexpected-crash error.
self._companion_manager_stopping = False
# Crash backoff: earliest monotonic time the main loop may respawn a
# manager that exited unexpectedly, plus the consecutive-crash count
# that sizes the delay.
self._companion_manager_respawn_at = 0
self._companion_manager_failures = 0
# Configs of the currently running companion manager, cached at spawn so
# shutdown can size its wait without re-reading the config file.
self._companion_configs = []
@ -539,8 +551,23 @@ class Arbiter:
# The manager itself exited; clear its pid so the main
# loop respawns it. It owns its companions' lifecycles.
self.companion_manager_pid = 0
self.log.error(
if self._companion_manager_stopping:
# Expected exit from a deliberate stop or reload.
self._companion_manager_stopping = False
self.log.info(
"Companion manager (pid:%s) exited", wpid)
else:
# Unexpected crash: back off before respawning so a
# manager that cannot boot does not busy-spin.
self._companion_manager_failures += 1
delay = min(
2 ** (self._companion_manager_failures - 1),
self.COMPANION_MANAGER_MAX_RESPAWN_DELAY)
self._companion_manager_respawn_at = (
time.monotonic() + delay)
self.log.error(
"Companion manager (pid:%s) exited unexpectedly; "
"respawning in %ss", wpid, delay)
else:
# A worker was terminated. If the termination reason was
# that it could not boot, we'll shut it down to avoid
@ -674,7 +701,11 @@ class Arbiter:
child of the arbiter; per-companion supervision lives entirely inside
it, so the arbiter only ensures the one manager process exists.
"""
if self.companion_manager_pid == 0 and self.cfg.companion_workers:
if not (self.companion_manager_pid == 0 and self.cfg.companion_workers):
return
if time.monotonic() < self._companion_manager_respawn_at:
# Still inside the crash-backoff window; wait before respawning.
return
self.spawn_companion_manager()
def spawn_companion_manager(self):
@ -785,6 +816,11 @@ class Arbiter:
"""
if self.companion_manager_pid == 0:
return
# This exit is on purpose: mark it expected and clear any crash backoff
# so the reaper logs info and a reload can respawn without delay.
self._companion_manager_stopping = True
self._companion_manager_failures = 0
self._companion_manager_respawn_at = 0
try:
os.kill(self.companion_manager_pid, sig)
except OSError as e:

View File

@ -5,6 +5,7 @@
import errno
import os
import signal
import time
from unittest import mock
import gunicorn.app.base
@ -183,6 +184,61 @@ def test_arbiter_reap_clears_companion_manager_pid(mock_os_waitpid):
assert arbiter.companion_manager_pid == 0
@mock.patch('os.waitpid')
def test_arbiter_reap_unexpected_manager_exit_backs_off(mock_os_waitpid):
    """An unexpected manager exit is an error and arms the crash backoff.

    No deliberate stop was requested (the stopping flag keeps its default),
    so reaping the manager must count one failure, log at error level, and
    set a respawn deadline one second out (2 ** (1 - 1)) so the main loop
    does not respawn it immediately.
    """
    # First waitpid call reaps the manager pid; the second reports that no
    # further children are waiting.
    mock_os_waitpid.side_effect = [(4242, 0), (0, 0)]
    arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
    arbiter.companion_manager_pid = 4242
    arbiter.log = mock.Mock()

    # Bracket the call with monotonic readings so the deadline can be bounded
    # exactly instead of merely compared against zero.
    before = time.monotonic()
    arbiter.reap_workers()
    after = time.monotonic()

    assert arbiter.companion_manager_pid == 0
    assert arbiter._companion_manager_failures == 1
    # A plain "> 0" check would still pass with a zero delay, because
    # time.monotonic() alone is positive; require the full 1s first-crash
    # delay on top of the time at which the exit was reaped.
    assert before + 1 <= arbiter._companion_manager_respawn_at <= after + 1
    arbiter.log.error.assert_called_once()
    arbiter.log.info.assert_not_called()
@mock.patch('os.waitpid')
def test_arbiter_reap_deliberate_manager_exit_is_info(mock_os_waitpid):
    """A manager exit flagged as deliberate is logged at info, with no backoff.

    With the stopping flag set before the reap, the exit counts as expected:
    the flag is consumed, no respawn deadline is armed (so a reload respawns
    without delay), and nothing is logged at error level.
    """
    manager_pid = 4242
    # The manager is reaped first, then waitpid reports no more children.
    mock_os_waitpid.side_effect = [(manager_pid, 0), (0, 0)]

    arb = gunicorn.arbiter.Arbiter(DummyApplication())
    arb.log = mock.Mock()
    arb.companion_manager_pid = manager_pid
    arb._companion_manager_stopping = True

    arb.reap_workers()

    assert arb.companion_manager_pid == 0
    assert arb._companion_manager_stopping is False
    assert arb._companion_manager_respawn_at == 0
    arb.log.info.assert_called_once()
    arb.log.error.assert_not_called()
def test_arbiter_manage_companion_manager_waits_during_backoff():
    """No respawn happens while the crash-backoff deadline is in the future."""
    arb = gunicorn.arbiter.Arbiter(DummyApplication())
    companion_specs = [{"name": "rq", "target": "pkg:run"}]
    arb.cfg.set("companion_workers", companion_specs)
    # Stub the spawn so the test only observes whether it would be invoked.
    arb.spawn_companion_manager = mock.Mock()
    # Deadline a full minute ahead: the manager must stay down for now.
    arb._companion_manager_respawn_at = time.monotonic() + 60

    arb.manage_companion_manager()

    arb.spawn_companion_manager.assert_not_called()
def test_stop_companion_manager_marks_expected_and_clears_backoff():
    """Stopping the manager flags the exit as expected and resets the backoff."""
    arb = gunicorn.arbiter.Arbiter(DummyApplication())
    arb.companion_manager_pid = 4242
    # Seed leftover crash state that a deliberate stop must wipe.
    arb._companion_manager_respawn_at = time.monotonic() + 60
    arb._companion_manager_failures = 3

    # os.kill is patched out so no real signal is delivered to pid 4242.
    with mock.patch("os.kill"):
        arb.stop_companion_manager(signal.SIGTERM)

    assert arb._companion_manager_stopping is True
    assert arb._companion_manager_failures == 0
    assert arb._companion_manager_respawn_at == 0
def test_stop_companion_manager_signals_running():
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.companion_manager_pid = 4242