mirror of
https://github.com/frappe/gunicorn.git
synced 2026-07-01 10:11:30 +08:00
fix(companion): Back off manager respawn and quiet expected exits
The arbiter respawned the companion manager on every main-loop tick once its pid cleared, so a manager that could not boot would busy-spin. It also logged every manager exit as an error, including the deliberate exits from shutdown and reload. Track whether a manager exit was on purpose: stop_companion_manager marks it expected and clears any backoff, so the reaper logs it as info and a reload respawns without delay. An unexpected exit is now logged as an error and arms an exponential crash backoff (2^(n-1) seconds, where n is the consecutive-crash count, capped at 30 seconds) that the main loop waits out before respawning. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
10a41a1cc5
commit
7f39839a8c
@ -36,6 +36,10 @@ class Arbiter:
|
||||
# A flag indicating if an application failed to be loaded
|
||||
APP_LOAD_ERROR = 4
|
||||
|
||||
# Cap on the crash-backoff delay before respawning a companion manager
|
||||
# that keeps exiting unexpectedly, so a crash loop cannot busy-spin.
|
||||
COMPANION_MANAGER_MAX_RESPAWN_DELAY = 30
|
||||
|
||||
START_CTX = {}
|
||||
|
||||
LISTENERS = []
|
||||
@ -67,6 +71,14 @@ class Arbiter:
|
||||
self.master_pid = 0
|
||||
self.master_name = "Master"
|
||||
self.companion_manager_pid = 0
|
||||
# True while a manager exit is expected (deliberate stop or reload), so
|
||||
# the reaper logs it as info instead of an unexpected-crash error.
|
||||
self._companion_manager_stopping = False
|
||||
# Crash backoff: earliest monotonic time the main loop may respawn a
|
||||
# manager that exited unexpectedly, plus the consecutive-crash count
|
||||
# that sizes the delay.
|
||||
self._companion_manager_respawn_at = 0
|
||||
self._companion_manager_failures = 0
|
||||
# Configs of the currently running companion manager, cached at spawn so
|
||||
# shutdown can size its wait without re-reading the config file.
|
||||
self._companion_configs = []
|
||||
@ -539,8 +551,23 @@ class Arbiter:
|
||||
# The manager itself exited; clear its pid so the main
|
||||
# loop respawns it. It owns its companions' lifecycles.
|
||||
self.companion_manager_pid = 0
|
||||
self.log.error(
|
||||
if self._companion_manager_stopping:
|
||||
# Expected exit from a deliberate stop or reload.
|
||||
self._companion_manager_stopping = False
|
||||
self.log.info(
|
||||
"Companion manager (pid:%s) exited", wpid)
|
||||
else:
|
||||
# Unexpected crash: back off before respawning so a
|
||||
# manager that cannot boot does not busy-spin.
|
||||
self._companion_manager_failures += 1
|
||||
delay = min(
|
||||
2 ** (self._companion_manager_failures - 1),
|
||||
self.COMPANION_MANAGER_MAX_RESPAWN_DELAY)
|
||||
self._companion_manager_respawn_at = (
|
||||
time.monotonic() + delay)
|
||||
self.log.error(
|
||||
"Companion manager (pid:%s) exited unexpectedly; "
|
||||
"respawning in %ss", wpid, delay)
|
||||
else:
|
||||
# A worker was terminated. If the termination reason was
|
||||
# that it could not boot, we'll shut it down to avoid
|
||||
@ -674,7 +701,11 @@ class Arbiter:
|
||||
child of the arbiter; per-companion supervision lives entirely inside
|
||||
it, so the arbiter only ensures the one manager process exists.
|
||||
"""
|
||||
if self.companion_manager_pid == 0 and self.cfg.companion_workers:
|
||||
if not (self.companion_manager_pid == 0 and self.cfg.companion_workers):
|
||||
return
|
||||
if time.monotonic() < self._companion_manager_respawn_at:
|
||||
# Still inside the crash-backoff window; wait before respawning.
|
||||
return
|
||||
self.spawn_companion_manager()
|
||||
|
||||
def spawn_companion_manager(self):
|
||||
@ -785,6 +816,11 @@ class Arbiter:
|
||||
"""
|
||||
if self.companion_manager_pid == 0:
|
||||
return
|
||||
# This exit is on purpose: mark it expected and clear any crash backoff
|
||||
# so the reaper logs info and a reload can respawn without delay.
|
||||
self._companion_manager_stopping = True
|
||||
self._companion_manager_failures = 0
|
||||
self._companion_manager_respawn_at = 0
|
||||
try:
|
||||
os.kill(self.companion_manager_pid, sig)
|
||||
except OSError as e:
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
import errno
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
import gunicorn.app.base
|
||||
@ -183,6 +184,61 @@ def test_arbiter_reap_clears_companion_manager_pid(mock_os_waitpid):
|
||||
assert arbiter.companion_manager_pid == 0
|
||||
|
||||
|
||||
@mock.patch('os.waitpid')
|
||||
def test_arbiter_reap_unexpected_manager_exit_backs_off(mock_os_waitpid):
|
||||
# An unexpected manager exit (no deliberate stop) is an error and arms the
|
||||
# crash backoff so the main loop does not respawn it immediately.
|
||||
mock_os_waitpid.side_effect = [(4242, 0), (0, 0)]
|
||||
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
|
||||
arbiter.companion_manager_pid = 4242
|
||||
arbiter.log = mock.Mock()
|
||||
arbiter.reap_workers()
|
||||
assert arbiter.companion_manager_pid == 0
|
||||
assert arbiter._companion_manager_failures == 1
|
||||
assert arbiter._companion_manager_respawn_at > 0
|
||||
arbiter.log.error.assert_called_once()
|
||||
arbiter.log.info.assert_not_called()
|
||||
|
||||
|
||||
@mock.patch('os.waitpid')
|
||||
def test_arbiter_reap_deliberate_manager_exit_is_info(mock_os_waitpid):
|
||||
# A deliberate stop (stopping flag set) is an expected exit: logged as info
|
||||
# with no backoff, so a reload respawns without delay.
|
||||
mock_os_waitpid.side_effect = [(4242, 0), (0, 0)]
|
||||
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
|
||||
arbiter.companion_manager_pid = 4242
|
||||
arbiter._companion_manager_stopping = True
|
||||
arbiter.log = mock.Mock()
|
||||
arbiter.reap_workers()
|
||||
assert arbiter.companion_manager_pid == 0
|
||||
assert arbiter._companion_manager_stopping is False
|
||||
assert arbiter._companion_manager_respawn_at == 0
|
||||
arbiter.log.info.assert_called_once()
|
||||
arbiter.log.error.assert_not_called()
|
||||
|
||||
|
||||
def test_arbiter_manage_companion_manager_waits_during_backoff():
|
||||
# While inside the crash-backoff window the manager is not respawned.
|
||||
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
|
||||
arbiter.cfg.set("companion_workers", [{"name": "rq", "target": "pkg:run"}])
|
||||
arbiter._companion_manager_respawn_at = time.monotonic() + 60
|
||||
arbiter.spawn_companion_manager = mock.Mock()
|
||||
arbiter.manage_companion_manager()
|
||||
arbiter.spawn_companion_manager.assert_not_called()
|
||||
|
||||
|
||||
def test_stop_companion_manager_marks_expected_and_clears_backoff():
|
||||
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
|
||||
arbiter.companion_manager_pid = 4242
|
||||
arbiter._companion_manager_failures = 3
|
||||
arbiter._companion_manager_respawn_at = time.monotonic() + 60
|
||||
with mock.patch("os.kill"):
|
||||
arbiter.stop_companion_manager(signal.SIGTERM)
|
||||
assert arbiter._companion_manager_stopping is True
|
||||
assert arbiter._companion_manager_failures == 0
|
||||
assert arbiter._companion_manager_respawn_at == 0
|
||||
|
||||
|
||||
def test_stop_companion_manager_signals_running():
|
||||
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
|
||||
arbiter.companion_manager_pid = 4242
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user