fix(companion): Back off manager respawn and quiet expected exits

The arbiter respawned the companion manager on every main-loop tick once
its pid cleared, so a manager that could not boot would busy-spin. It also
logged every manager exit as an error, including the deliberate exits from
shutdown and reload.

Track whether a manager exit was on purpose: stop_companion_manager marks
it expected and clears any backoff, so the reaper logs it as info and a
reload respawns without delay. An unexpected exit is now logged as an error
and arms an exponential crash backoff (2^(n-1) seconds for the n-th
consecutive unexpected exit, capped at 30s) that the main loop waits out
before respawning.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Tanmoy Sarkar 2026-06-13 11:42:02 +05:30
parent 10a41a1cc5
commit 7f39839a8c
2 changed files with 96 additions and 4 deletions

View File

@ -36,6 +36,10 @@ class Arbiter:
# A flag indicating if an application failed to be loaded # A flag indicating if an application failed to be loaded
APP_LOAD_ERROR = 4 APP_LOAD_ERROR = 4
# Cap on the crash-backoff delay before respawning a companion manager
# that keeps exiting unexpectedly, so a crash loop cannot busy-spin.
COMPANION_MANAGER_MAX_RESPAWN_DELAY = 30
START_CTX = {} START_CTX = {}
LISTENERS = [] LISTENERS = []
@ -67,6 +71,14 @@ class Arbiter:
self.master_pid = 0 self.master_pid = 0
self.master_name = "Master" self.master_name = "Master"
self.companion_manager_pid = 0 self.companion_manager_pid = 0
# True while a manager exit is expected (deliberate stop or reload), so
# the reaper logs it as info instead of an unexpected-crash error.
self._companion_manager_stopping = False
# Crash backoff: earliest monotonic time the main loop may respawn a
# manager that exited unexpectedly, plus the consecutive-crash count
# that sizes the delay.
self._companion_manager_respawn_at = 0
self._companion_manager_failures = 0
# Configs of the currently running companion manager, cached at spawn so # Configs of the currently running companion manager, cached at spawn so
# shutdown can size its wait without re-reading the config file. # shutdown can size its wait without re-reading the config file.
self._companion_configs = [] self._companion_configs = []
@ -539,8 +551,23 @@ class Arbiter:
# The manager itself exited; clear its pid so the main # The manager itself exited; clear its pid so the main
# loop respawns it. It owns its companions' lifecycles. # loop respawns it. It owns its companions' lifecycles.
self.companion_manager_pid = 0 self.companion_manager_pid = 0
self.log.error( if self._companion_manager_stopping:
"Companion manager (pid:%s) exited", wpid) # Expected exit from a deliberate stop or reload.
self._companion_manager_stopping = False
self.log.info(
"Companion manager (pid:%s) exited", wpid)
else:
# Unexpected crash: back off before respawning so a
# manager that cannot boot does not busy-spin.
self._companion_manager_failures += 1
delay = min(
2 ** (self._companion_manager_failures - 1),
self.COMPANION_MANAGER_MAX_RESPAWN_DELAY)
self._companion_manager_respawn_at = (
time.monotonic() + delay)
self.log.error(
"Companion manager (pid:%s) exited unexpectedly; "
"respawning in %ss", wpid, delay)
else: else:
# A worker was terminated. If the termination reason was # A worker was terminated. If the termination reason was
# that it could not boot, we'll shut it down to avoid # that it could not boot, we'll shut it down to avoid
@ -674,8 +701,12 @@ class Arbiter:
child of the arbiter; per-companion supervision lives entirely inside child of the arbiter; per-companion supervision lives entirely inside
it, so the arbiter only ensures the one manager process exists. it, so the arbiter only ensures the one manager process exists.
""" """
if self.companion_manager_pid == 0 and self.cfg.companion_workers: if not (self.companion_manager_pid == 0 and self.cfg.companion_workers):
self.spawn_companion_manager() return
if time.monotonic() < self._companion_manager_respawn_at:
# Still inside the crash-backoff window; wait before respawning.
return
self.spawn_companion_manager()
def spawn_companion_manager(self): def spawn_companion_manager(self):
"""Fork the companion manager process. """Fork the companion manager process.
@ -785,6 +816,11 @@ class Arbiter:
""" """
if self.companion_manager_pid == 0: if self.companion_manager_pid == 0:
return return
# This exit is on purpose: mark it expected and clear any crash backoff
# so the reaper logs info and a reload can respawn without delay.
self._companion_manager_stopping = True
self._companion_manager_failures = 0
self._companion_manager_respawn_at = 0
try: try:
os.kill(self.companion_manager_pid, sig) os.kill(self.companion_manager_pid, sig)
except OSError as e: except OSError as e:

View File

@ -5,6 +5,7 @@
import errno import errno
import os import os
import signal import signal
import time
from unittest import mock from unittest import mock
import gunicorn.app.base import gunicorn.app.base
@ -183,6 +184,61 @@ def test_arbiter_reap_clears_companion_manager_pid(mock_os_waitpid):
assert arbiter.companion_manager_pid == 0 assert arbiter.companion_manager_pid == 0
@mock.patch('os.waitpid')
def test_arbiter_reap_unexpected_manager_exit_backs_off(mock_os_waitpid):
    """Reaping a manager nobody asked to stop logs an error and arms backoff."""
    manager_pid = 4242
    # waitpid yields the manager's pid once, then a (0, 0) result.
    mock_os_waitpid.side_effect = [(manager_pid, 0), (0, 0)]

    arb = gunicorn.arbiter.Arbiter(DummyApplication())
    arb.log = mock.Mock()
    arb.companion_manager_pid = manager_pid

    arb.reap_workers()

    # The pid is cleared, one failure is counted and a respawn time is set.
    assert arb.companion_manager_pid == 0
    assert arb._companion_manager_failures == 1
    assert arb._companion_manager_respawn_at > 0
    # Exactly one error-level call and nothing at info level.
    arb.log.error.assert_called_once()
    arb.log.info.assert_not_called()
@mock.patch('os.waitpid')
def test_arbiter_reap_deliberate_manager_exit_is_info(mock_os_waitpid):
    """Reaping a manager flagged as stopping logs info and arms no backoff."""
    manager_pid = 4242
    # waitpid yields the manager's pid once, then a (0, 0) result.
    mock_os_waitpid.side_effect = [(manager_pid, 0), (0, 0)]

    arb = gunicorn.arbiter.Arbiter(DummyApplication())
    arb.log = mock.Mock()
    # Simulate a deliberate stop already in flight for this manager.
    arb._companion_manager_stopping = True
    arb.companion_manager_pid = manager_pid

    arb.reap_workers()

    assert arb.companion_manager_pid == 0
    # The expected-exit flag is consumed and no respawn delay is armed.
    assert arb._companion_manager_stopping is False
    assert arb._companion_manager_respawn_at == 0
    # Logged once at info level and never as an error.
    arb.log.info.assert_called_once()
    arb.log.error.assert_not_called()
def test_arbiter_manage_companion_manager_waits_during_backoff():
    """manage_companion_manager must not spawn before the backoff deadline."""
    arb = gunicorn.arbiter.Arbiter(DummyApplication())
    arb.spawn_companion_manager = mock.Mock()
    companion = {"name": "rq", "target": "pkg:run"}
    arb.cfg.set("companion_workers", [companion])
    # Put the earliest allowed respawn time a minute into the future.
    arb._companion_manager_respawn_at = time.monotonic() + 60

    arb.manage_companion_manager()

    arb.spawn_companion_manager.assert_not_called()
def test_stop_companion_manager_marks_expected_and_clears_backoff():
    """A deliberate stop flags the exit as expected and resets crash backoff."""
    arb = gunicorn.arbiter.Arbiter(DummyApplication())
    arb.companion_manager_pid = 4242
    # Start from a state with a non-zero failure count and a pending deadline.
    arb._companion_manager_failures = 3
    arb._companion_manager_respawn_at = time.monotonic() + 60

    # os.kill is patched so no real signal is sent to pid 4242.
    with mock.patch("os.kill"):
        arb.stop_companion_manager(signal.SIGTERM)

    assert arb._companion_manager_stopping is True
    assert arb._companion_manager_failures == 0
    assert arb._companion_manager_respawn_at == 0
def test_stop_companion_manager_signals_running(): def test_stop_companion_manager_signals_running():
arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.companion_manager_pid = 4242 arbiter.companion_manager_pid = 4242