fix(dirty): add orphan cleanup via well-known PID file

When the main arbiter crashes and restarts, orphaned dirty arbiters
may continue running. This adds detection and cleanup:

- Add well-known PID file location based on proc_name
- Dirty arbiter writes PID on startup, removes on exit
- Main arbiter checks for orphans on fresh start (not USR2)
- Uses self.proc_name for USR2 compatibility (myapp vs myapp.2)

During USR2 upgrade, old and new dirty arbiters coexist with
separate PID files, preventing the old from removing the new's file.
This commit is contained in:
Benoit Chesneau 2026-01-24 20:18:45 +01:00
parent f6418d4eb0
commit e21d23bfa6
4 changed files with 384 additions and 2 deletions

View File

@ -71,6 +71,7 @@ class Arbiter:
# Dirty arbiter process
self.dirty_arbiter_pid = 0
self.dirty_arbiter = None
self.dirty_pidfile = None # Well-known location for orphan detection
cwd = util.getcwd()
@ -735,6 +736,57 @@ class Arbiter:
# Dirty Arbiter Management
# =========================================================================
def _get_dirty_pidfile_path(self):
"""Get the well-known PID file path for orphan detection.
Uses self.proc_name (not self.cfg.proc_name) so that during USR2
the new master gets a different PID file path ("myapp.2" vs "myapp").
This prevents the old dirty arbiter from removing the new one's PID file.
"""
import tempfile
safe_name = self.proc_name.replace('/', '_').replace(' ', '_')
return os.path.join(tempfile.gettempdir(), f"gunicorn-dirty-{safe_name}.pid")
def _cleanup_orphaned_dirty_arbiter(self):
"""Kill any orphaned dirty arbiter from a previous crash.
Only runs on fresh start (master_pid == 0), not during USR2.
"""
# During USR2, master_pid is set - don't cleanup old dirty arbiter
if self.master_pid != 0:
return
pidfile = self._get_dirty_pidfile_path()
if not os.path.exists(pidfile):
return
try:
with open(pidfile) as f:
old_pid = int(f.read().strip())
# Check if process exists
os.kill(old_pid, 0)
# Process exists - kill orphan
self.log.warning("Killing orphaned dirty arbiter (pid: %s)", old_pid)
os.kill(old_pid, signal.SIGTERM)
# Wait briefly for graceful exit
for _ in range(10):
time.sleep(0.1)
try:
os.kill(old_pid, 0)
except OSError:
break
else:
os.kill(old_pid, signal.SIGKILL)
except (ValueError, IOError, OSError):
pass
# Remove stale PID file
try:
os.unlink(pidfile)
except OSError:
pass
def spawn_dirty_arbiter(self):
"""\
Spawn the dirty arbiter process.
@ -745,7 +797,16 @@ class Arbiter:
if self.dirty_arbiter_pid:
return # Already running
self.dirty_arbiter = DirtyArbiter(self.cfg, self.log)
# Cleanup any orphaned dirty arbiter from previous crash
self._cleanup_orphaned_dirty_arbiter()
# Get well-known PID file path
self.dirty_pidfile = self._get_dirty_pidfile_path()
self.dirty_arbiter = DirtyArbiter(
self.cfg, self.log,
pidfile=self.dirty_pidfile
)
socket_path = self.dirty_arbiter.socket_path
pid = os.fork()

View File

@ -44,7 +44,7 @@ class DirtyArbiter:
# Worker boot error code
WORKER_BOOT_ERROR = 3
def __init__(self, cfg, log, socket_path=None):
def __init__(self, cfg, log, socket_path=None, pidfile=None):
"""
Initialize the dirty arbiter.
@ -52,11 +52,13 @@ class DirtyArbiter:
cfg: Gunicorn config
log: Logger
socket_path: Path to the arbiter's Unix socket
pidfile: Well-known PID file location for orphan detection
"""
self.cfg = cfg
self.log = log
self.pid = None
self.ppid = os.getpid()
self.pidfile = pidfile # Well-known location for orphan detection
# Use a temp directory for sockets
self.tmpdir = tempfile.mkdtemp(prefix="gunicorn-dirty-")
@ -82,6 +84,14 @@ class DirtyArbiter:
self.pid = os.getpid()
self.log.info("Dirty arbiter starting (pid: %s)", self.pid)
# Write PID to well-known location for orphan detection
if self.pidfile:
try:
with open(self.pidfile, 'w') as f:
f.write(str(self.pid))
except IOError as e:
self.log.warning("Failed to write PID file: %s", e)
# Call hook
self.cfg.on_dirty_starting(self)
@ -565,6 +575,13 @@ class DirtyArbiter:
def _cleanup_sync(self):
"""Synchronous cleanup on exit."""
# Remove PID file
if self.pidfile and os.path.exists(self.pidfile):
try:
os.unlink(self.pidfile)
except OSError:
pass
# Clean up socket
if os.path.exists(self.socket_path):
try:

View File

@ -552,3 +552,187 @@ class TestWorkerLifecycle:
arbiter.murder_workers()
mock_kill.assert_called_once_with(42, signal.SIGKILL)
# ============================================================================
# Dirty Arbiter Orphan Cleanup Tests
# ============================================================================
class TestDirtyArbiterOrphanCleanup:
"""Tests for dirty arbiter orphan detection and cleanup."""
def test_get_dirty_pidfile_path(self):
"""Verify pidfile path is generated correctly."""
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.proc_name = 'myapp'
path = arbiter._get_dirty_pidfile_path()
import tempfile
expected = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-myapp.pid')
assert path == expected
def test_get_dirty_pidfile_path_sanitizes_name(self):
"""Verify special characters in proc_name are sanitized."""
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.proc_name = 'my/app name'
path = arbiter._get_dirty_pidfile_path()
import tempfile
expected = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-my_app_name.pid')
assert path == expected
def test_get_dirty_pidfile_path_uses_proc_name_not_cfg(self):
"""Verify pidfile path uses self.proc_name for USR2 compatibility.
During USR2, self.proc_name becomes 'myapp.2' while self.cfg.proc_name
stays 'myapp'. Using self.proc_name ensures new and old dirty arbiters
have different PID file paths.
"""
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.cfg.set('proc_name', 'myapp')
arbiter.proc_name = 'myapp.2' # Simulates USR2 child
path = arbiter._get_dirty_pidfile_path()
import tempfile
# Should use self.proc_name, not self.cfg.proc_name
expected = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-myapp.2.pid')
assert path == expected
def test_cleanup_orphaned_skipped_during_usr2(self):
"""Verify cleanup is skipped during USR2 upgrade (master_pid != 0)."""
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.master_pid = 12345 # Indicates USR2 upgrade in progress
with mock.patch.object(arbiter, '_get_dirty_pidfile_path') as mock_path:
arbiter._cleanup_orphaned_dirty_arbiter()
# Should not even check the pidfile path
mock_path.assert_not_called()
def test_cleanup_orphaned_no_pidfile(self):
"""Verify cleanup handles missing pidfile gracefully."""
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.master_pid = 0
with mock.patch('os.path.exists', return_value=False):
# Should not raise any exception
arbiter._cleanup_orphaned_dirty_arbiter()
@mock.patch('os.unlink')
@mock.patch('os.kill')
def test_cleanup_orphaned_kills_existing_process(self, mock_kill, mock_unlink):
"""Verify cleanup kills orphaned dirty arbiter process."""
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.master_pid = 0
# First kill(pid, 0) succeeds (process exists), then SIGTERM causes exit
mock_kill.side_effect = [None, None, OSError(3, "No such process")]
import tempfile
pidfile = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-test.pid')
with mock.patch('os.path.exists', return_value=True), \
mock.patch('builtins.open', mock.mock_open(read_data='12345')), \
mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value=pidfile), \
mock.patch('time.sleep'):
arbiter._cleanup_orphaned_dirty_arbiter()
# Should have sent signal 0 (check), then SIGTERM
assert mock_kill.call_args_list[0] == mock.call(12345, 0)
assert mock_kill.call_args_list[1] == mock.call(12345, signal.SIGTERM)
# Should unlink the stale pidfile
mock_unlink.assert_called_with(pidfile)
@mock.patch('os.unlink')
@mock.patch('os.kill')
def test_cleanup_orphaned_sigkill_if_sigterm_fails(self, mock_kill, mock_unlink):
"""Verify cleanup sends SIGKILL if SIGTERM doesn't work."""
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.master_pid = 0
# Process exists on all checks until SIGKILL
def kill_side_effect(pid, sig):
if sig == signal.SIGKILL:
return None
return None # Process still running
mock_kill.side_effect = kill_side_effect
import tempfile
pidfile = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-test.pid')
with mock.patch('os.path.exists', return_value=True), \
mock.patch('builtins.open', mock.mock_open(read_data='12345')), \
mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value=pidfile), \
mock.patch('time.sleep'):
arbiter._cleanup_orphaned_dirty_arbiter()
# Should end with SIGKILL
kill_calls = [c for c in mock_kill.call_args_list if c[0][1] == signal.SIGKILL]
assert len(kill_calls) == 1
@mock.patch('os.unlink')
def test_cleanup_orphaned_stale_pidfile_no_process(self, mock_unlink):
"""Verify cleanup removes stale pidfile when process doesn't exist."""
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.master_pid = 0
import tempfile
pidfile = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-test.pid')
with mock.patch('os.path.exists', return_value=True), \
mock.patch('builtins.open', mock.mock_open(read_data='12345')), \
mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value=pidfile), \
mock.patch('os.kill', side_effect=OSError(3, "No such process")):
arbiter._cleanup_orphaned_dirty_arbiter()
# Should still unlink the stale pidfile
mock_unlink.assert_called_with(pidfile)
@mock.patch('gunicorn.dirty.DirtyArbiter')
@mock.patch('os.fork')
def test_spawn_dirty_arbiter_calls_cleanup(self, mock_fork, mock_dirty_arbiter):
"""Verify spawn_dirty_arbiter calls orphan cleanup before spawning."""
mock_fork.return_value = 12345 # Parent process
mock_arbiter_instance = mock.Mock()
mock_arbiter_instance.socket_path = '/tmp/test.sock'
mock_dirty_arbiter.return_value = mock_arbiter_instance
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.cfg.set('dirty_workers', 1)
arbiter.cfg.set('dirty_apps', ['test:app'])
with mock.patch.object(arbiter, '_cleanup_orphaned_dirty_arbiter') as mock_cleanup, \
mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value='/tmp/test.pid'), \
mock.patch('gunicorn.dirty.set_dirty_socket_path'):
arbiter.spawn_dirty_arbiter()
mock_cleanup.assert_called_once()
@mock.patch('os.fork')
def test_spawn_dirty_arbiter_passes_pidfile(self, mock_fork):
"""Verify spawn_dirty_arbiter passes pidfile to DirtyArbiter."""
mock_fork.return_value = 12345 # Parent process
arbiter = gunicorn.arbiter.Arbiter(DummyApplication())
arbiter.cfg.set('dirty_workers', 1)
arbiter.cfg.set('dirty_apps', ['test:app'])
pidfile_path = '/tmp/gunicorn-dirty-test.pid'
with mock.patch.object(arbiter, '_cleanup_orphaned_dirty_arbiter'), \
mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value=pidfile_path), \
mock.patch('gunicorn.arbiter.DirtyArbiter') as mock_dirty_arbiter, \
mock.patch('gunicorn.arbiter.set_dirty_socket_path'):
mock_arbiter_instance = mock.Mock()
mock_arbiter_instance.socket_path = '/tmp/test.sock'
mock_dirty_arbiter.return_value = mock_arbiter_instance
arbiter.spawn_dirty_arbiter()
# Verify DirtyArbiter was called with pidfile parameter
mock_dirty_arbiter.assert_called_once()
call_kwargs = mock_dirty_arbiter.call_args[1]
assert call_kwargs.get('pidfile') == pidfile_path

View File

@ -120,6 +120,32 @@ class TestDirtyArbiterInit:
# Cleanup
arbiter._cleanup_sync()
def test_init_with_pidfile(self):
"""Test initialization with pidfile parameter."""
cfg = Config()
log = MockLog()
with tempfile.TemporaryDirectory() as tmpdir:
pidfile = os.path.join(tmpdir, "dirty.pid")
arbiter = DirtyArbiter(cfg=cfg, log=log, pidfile=pidfile)
assert arbiter.pidfile == pidfile
# Cleanup
arbiter._cleanup_sync()
def test_init_without_pidfile(self):
"""Test initialization without pidfile parameter defaults to None."""
cfg = Config()
log = MockLog()
arbiter = DirtyArbiter(cfg=cfg, log=log)
assert arbiter.pidfile is None
# Cleanup
arbiter._cleanup_sync()
class TestDirtyArbiterCleanup:
"""Tests for arbiter cleanup."""
@ -157,6 +183,100 @@ class TestDirtyArbiterCleanup:
assert not os.path.exists(tmpdir)
def test_cleanup_removes_pidfile(self):
"""Test that cleanup removes the PID file."""
cfg = Config()
log = MockLog()
with tempfile.TemporaryDirectory() as tmpdir:
pidfile = os.path.join(tmpdir, "dirty.pid")
arbiter = DirtyArbiter(cfg=cfg, log=log, pidfile=pidfile)
# Create pidfile
with open(pidfile, 'w') as f:
f.write('12345')
assert os.path.exists(pidfile)
arbiter._cleanup_sync()
assert not os.path.exists(pidfile)
def test_cleanup_handles_missing_pidfile(self):
"""Test that cleanup handles non-existent pidfile gracefully."""
cfg = Config()
log = MockLog()
with tempfile.TemporaryDirectory() as tmpdir:
pidfile = os.path.join(tmpdir, "nonexistent.pid")
arbiter = DirtyArbiter(cfg=cfg, log=log, pidfile=pidfile)
# Don't create the file
assert not os.path.exists(pidfile)
# Should not raise
arbiter._cleanup_sync()
def test_cleanup_without_pidfile(self):
"""Test that cleanup works when no pidfile configured."""
cfg = Config()
log = MockLog()
arbiter = DirtyArbiter(cfg=cfg, log=log)
assert arbiter.pidfile is None
# Should not raise
arbiter._cleanup_sync()
class TestDirtyArbiterPidfileWrite:
"""Tests for PID file writing during run()."""
def test_run_writes_pidfile(self):
"""Test that run() writes the PID to the pidfile."""
from unittest import mock
cfg = Config()
cfg.set("dirty_workers", 0)
cfg.set("dirty_apps", ["tests.support_dirty_app:TestDirtyApp"])
log = MockLog()
with tempfile.TemporaryDirectory() as tmpdir:
pidfile = os.path.join(tmpdir, "dirty.pid")
arbiter = DirtyArbiter(cfg=cfg, log=log, pidfile=pidfile)
# Track if PID file was written correctly
pid_written = None
def mock_asyncio_run(coro):
nonlocal pid_written
# At this point, PID file should have been written
if os.path.exists(pidfile):
with open(pidfile) as f:
pid_written = int(f.read().strip())
# Mock asyncio.run to check PID file before cleanup runs
with mock.patch.object(asyncio, 'run', side_effect=mock_asyncio_run):
arbiter.run()
# Check PID was written correctly
assert pid_written == os.getpid()
def test_run_without_pidfile_does_not_fail(self):
"""Test that run() works when no pidfile configured."""
from unittest import mock
cfg = Config()
cfg.set("dirty_workers", 0)
cfg.set("dirty_apps", ["tests.support_dirty_app:TestDirtyApp"])
log = MockLog()
arbiter = DirtyArbiter(cfg=cfg, log=log)
with mock.patch.object(asyncio, 'run'):
# Should not raise
arbiter.run()
class TestDirtyArbiterRouteRequest:
"""Tests for request routing."""