From e21d23bfa69336009a70a4a4f5e908171475e779 Mon Sep 17 00:00:00 2001 From: Benoit Chesneau Date: Sat, 24 Jan 2026 20:18:45 +0100 Subject: [PATCH] fix(dirty): add orphan cleanup via well-known PID file When the main arbiter crashes and restarts, orphaned dirty arbiters may continue running. This adds detection and cleanup: - Add well-known PID file location based on proc_name - Dirty arbiter writes PID on startup, removes on exit - Main arbiter checks for orphans on fresh start (not USR2) - Uses self.proc_name for USR2 compatibility (myapp vs myapp.2) During USR2 upgrade, old and new dirty arbiters coexist with separate PID files, preventing the old from removing the new's file. --- gunicorn/arbiter.py | 63 +++++++++++- gunicorn/dirty/arbiter.py | 19 +++- tests/test_arbiter.py | 184 ++++++++++++++++++++++++++++++++++++ tests/test_dirty_arbiter.py | 120 +++++++++++++++++++++++ 4 files changed, 384 insertions(+), 2 deletions(-) diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py index 8944b706..9ec068c5 100644 --- a/gunicorn/arbiter.py +++ b/gunicorn/arbiter.py @@ -71,6 +71,7 @@ class Arbiter: # Dirty arbiter process self.dirty_arbiter_pid = 0 self.dirty_arbiter = None + self.dirty_pidfile = None # Well-known location for orphan detection cwd = util.getcwd() @@ -735,6 +736,57 @@ class Arbiter: # Dirty Arbiter Management # ========================================================================= + def _get_dirty_pidfile_path(self): + """Get the well-known PID file path for orphan detection. + + Uses self.proc_name (not self.cfg.proc_name) so that during USR2 + the new master gets a different PID file path ("myapp.2" vs "myapp"). + This prevents the old dirty arbiter from removing the new one's PID file. + """ + import tempfile + safe_name = self.proc_name.replace('/', '_').replace(' ', '_') + return os.path.join(tempfile.gettempdir(), f"gunicorn-dirty-{safe_name}.pid") + + def _cleanup_orphaned_dirty_arbiter(self): + """Kill any orphaned dirty arbiter from a previous crash. + + Only runs on fresh start (master_pid == 0), not during USR2. + """ + # During USR2, master_pid is set - don't cleanup old dirty arbiter + if self.master_pid != 0: + return + + pidfile = self._get_dirty_pidfile_path() + if not os.path.exists(pidfile): + return + + try: + with open(pidfile) as f: + old_pid = int(f.read().strip()) + + # Check if process exists + os.kill(old_pid, 0) + # Process exists - kill orphan + self.log.warning("Killing orphaned dirty arbiter (pid: %s)", old_pid) + os.kill(old_pid, signal.SIGTERM) + # Wait briefly for graceful exit + for _ in range(10): + time.sleep(0.1) + try: + os.kill(old_pid, 0) + except OSError: + break + else: + os.kill(old_pid, signal.SIGKILL) + except (ValueError, IOError, OSError): + pass + + # Remove stale PID file + try: + os.unlink(pidfile) + except OSError: + pass + def spawn_dirty_arbiter(self): """\ Spawn the dirty arbiter process. @@ -745,7 +797,16 @@ class Arbiter: if self.dirty_arbiter_pid: return # Already running - self.dirty_arbiter = DirtyArbiter(self.cfg, self.log) + # Cleanup any orphaned dirty arbiter from previous crash + self._cleanup_orphaned_dirty_arbiter() + + # Get well-known PID file path + self.dirty_pidfile = self._get_dirty_pidfile_path() + + self.dirty_arbiter = DirtyArbiter( + self.cfg, self.log, + pidfile=self.dirty_pidfile + ) socket_path = self.dirty_arbiter.socket_path pid = os.fork() diff --git a/gunicorn/dirty/arbiter.py b/gunicorn/dirty/arbiter.py index 4c2243fd..ae84fff3 100644 --- a/gunicorn/dirty/arbiter.py +++ b/gunicorn/dirty/arbiter.py @@ -44,7 +44,7 @@ class DirtyArbiter: # Worker boot error code WORKER_BOOT_ERROR = 3 - def __init__(self, cfg, log, socket_path=None): + def __init__(self, cfg, log, socket_path=None, pidfile=None): """ Initialize the dirty arbiter. @@ -52,11 +52,13 @@ class DirtyArbiter: cfg: Gunicorn config log: Logger socket_path: Path to the arbiter's Unix socket + pidfile: Well-known PID file location for orphan detection """ self.cfg = cfg self.log = log self.pid = None self.ppid = os.getpid() + self.pidfile = pidfile # Well-known location for orphan detection # Use a temp directory for sockets self.tmpdir = tempfile.mkdtemp(prefix="gunicorn-dirty-") @@ -82,6 +84,14 @@ class DirtyArbiter: self.pid = os.getpid() self.log.info("Dirty arbiter starting (pid: %s)", self.pid) + # Write PID to well-known location for orphan detection + if self.pidfile: + try: + with open(self.pidfile, 'w') as f: + f.write(str(self.pid)) + except IOError as e: + self.log.warning("Failed to write PID file: %s", e) + # Call hook self.cfg.on_dirty_starting(self) @@ -565,6 +575,13 @@ class DirtyArbiter: def _cleanup_sync(self): """Synchronous cleanup on exit.""" + # Remove PID file + if self.pidfile and os.path.exists(self.pidfile): + try: + os.unlink(self.pidfile) + except OSError: + pass + # Clean up socket if os.path.exists(self.socket_path): try: diff --git a/tests/test_arbiter.py b/tests/test_arbiter.py index ff855b46..d7e166ea 100644 --- a/tests/test_arbiter.py +++ b/tests/test_arbiter.py @@ -552,3 +552,187 @@ class TestWorkerLifecycle: arbiter.murder_workers() mock_kill.assert_called_once_with(42, signal.SIGKILL) + + +# ============================================================================ +# Dirty Arbiter Orphan Cleanup Tests +# ============================================================================ + +class TestDirtyArbiterOrphanCleanup: + """Tests for dirty arbiter orphan detection and cleanup.""" + + def test_get_dirty_pidfile_path(self): + """Verify pidfile path is generated correctly.""" + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.proc_name = 'myapp' + + path = arbiter._get_dirty_pidfile_path() + + import tempfile + expected = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-myapp.pid') + assert path == expected + + def test_get_dirty_pidfile_path_sanitizes_name(self): + """Verify special characters in proc_name are sanitized.""" + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.proc_name = 'my/app name' + + path = arbiter._get_dirty_pidfile_path() + + import tempfile + expected = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-my_app_name.pid') + assert path == expected + + def test_get_dirty_pidfile_path_uses_proc_name_not_cfg(self): + """Verify pidfile path uses self.proc_name for USR2 compatibility. + + During USR2, self.proc_name becomes 'myapp.2' while self.cfg.proc_name + stays 'myapp'. Using self.proc_name ensures new and old dirty arbiters + have different PID file paths. + """ + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.cfg.set('proc_name', 'myapp') + arbiter.proc_name = 'myapp.2' # Simulates USR2 child + + path = arbiter._get_dirty_pidfile_path() + + import tempfile + # Should use self.proc_name, not self.cfg.proc_name + expected = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-myapp.2.pid') + assert path == expected + + def test_cleanup_orphaned_skipped_during_usr2(self): + """Verify cleanup is skipped during USR2 upgrade (master_pid != 0).""" + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.master_pid = 12345 # Indicates USR2 upgrade in progress + + with mock.patch.object(arbiter, '_get_dirty_pidfile_path') as mock_path: + arbiter._cleanup_orphaned_dirty_arbiter() + + # Should not even check the pidfile path + mock_path.assert_not_called() + + def test_cleanup_orphaned_no_pidfile(self): + """Verify cleanup handles missing pidfile gracefully.""" + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.master_pid = 0 + + with mock.patch('os.path.exists', return_value=False): + # Should not raise any exception + arbiter._cleanup_orphaned_dirty_arbiter() + + @mock.patch('os.unlink') + @mock.patch('os.kill') + def test_cleanup_orphaned_kills_existing_process(self, mock_kill, mock_unlink): + """Verify cleanup kills orphaned dirty arbiter process.""" + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.master_pid = 0 + + # First kill(pid, 0) succeeds (process exists), then SIGTERM causes exit + mock_kill.side_effect = [None, None, OSError(3, "No such process")] + + import tempfile + pidfile = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-test.pid') + + with mock.patch('os.path.exists', return_value=True), \ + mock.patch('builtins.open', mock.mock_open(read_data='12345')), \ + mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value=pidfile), \ + mock.patch('time.sleep'): + arbiter._cleanup_orphaned_dirty_arbiter() + + # Should have sent signal 0 (check), then SIGTERM + assert mock_kill.call_args_list[0] == mock.call(12345, 0) + assert mock_kill.call_args_list[1] == mock.call(12345, signal.SIGTERM) + # Should unlink the stale pidfile + mock_unlink.assert_called_with(pidfile) + + @mock.patch('os.unlink') + @mock.patch('os.kill') + def test_cleanup_orphaned_sigkill_if_sigterm_fails(self, mock_kill, mock_unlink): + """Verify cleanup sends SIGKILL if SIGTERM doesn't work.""" + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.master_pid = 0 + + # Process exists on all checks until SIGKILL + def kill_side_effect(pid, sig): + if sig == signal.SIGKILL: + return None + return None # Process still running + + mock_kill.side_effect = kill_side_effect + + import tempfile + pidfile = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-test.pid') + + with mock.patch('os.path.exists', return_value=True), \ + mock.patch('builtins.open', mock.mock_open(read_data='12345')), \ + mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value=pidfile), \ + mock.patch('time.sleep'): + arbiter._cleanup_orphaned_dirty_arbiter() + + # Should end with SIGKILL + kill_calls = [c for c in mock_kill.call_args_list if c[0][1] == signal.SIGKILL] + assert len(kill_calls) == 1 + + @mock.patch('os.unlink') + def test_cleanup_orphaned_stale_pidfile_no_process(self, mock_unlink): + """Verify cleanup removes stale pidfile when process doesn't exist.""" + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.master_pid = 0 + + import tempfile + pidfile = os.path.join(tempfile.gettempdir(), 'gunicorn-dirty-test.pid') + + with mock.patch('os.path.exists', return_value=True), \ + mock.patch('builtins.open', mock.mock_open(read_data='12345')), \ + mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value=pidfile), \ + mock.patch('os.kill', side_effect=OSError(3, "No such process")): + arbiter._cleanup_orphaned_dirty_arbiter() + + # Should still unlink the stale pidfile + mock_unlink.assert_called_with(pidfile) + + @mock.patch('gunicorn.dirty.DirtyArbiter') + @mock.patch('os.fork') + def test_spawn_dirty_arbiter_calls_cleanup(self, mock_fork, mock_dirty_arbiter): + """Verify spawn_dirty_arbiter calls orphan cleanup before spawning.""" + mock_fork.return_value = 12345 # Parent process + mock_arbiter_instance = mock.Mock() + mock_arbiter_instance.socket_path = '/tmp/test.sock' + mock_dirty_arbiter.return_value = mock_arbiter_instance + + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.cfg.set('dirty_workers', 1) + arbiter.cfg.set('dirty_apps', ['test:app']) + + with mock.patch.object(arbiter, '_cleanup_orphaned_dirty_arbiter') as mock_cleanup, \ + mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value='/tmp/test.pid'), \ + mock.patch('gunicorn.dirty.set_dirty_socket_path'): + arbiter.spawn_dirty_arbiter() + + mock_cleanup.assert_called_once() + + @mock.patch('os.fork') + def test_spawn_dirty_arbiter_passes_pidfile(self, mock_fork): + """Verify spawn_dirty_arbiter passes pidfile to DirtyArbiter.""" + mock_fork.return_value = 12345 # Parent process + + arbiter = gunicorn.arbiter.Arbiter(DummyApplication()) + arbiter.cfg.set('dirty_workers', 1) + arbiter.cfg.set('dirty_apps', ['test:app']) + + pidfile_path = '/tmp/gunicorn-dirty-test.pid' + with mock.patch.object(arbiter, '_cleanup_orphaned_dirty_arbiter'), \ + mock.patch.object(arbiter, '_get_dirty_pidfile_path', return_value=pidfile_path), \ + mock.patch('gunicorn.arbiter.DirtyArbiter') as mock_dirty_arbiter, \ + mock.patch('gunicorn.arbiter.set_dirty_socket_path'): + mock_arbiter_instance = mock.Mock() + mock_arbiter_instance.socket_path = '/tmp/test.sock' + mock_dirty_arbiter.return_value = mock_arbiter_instance + + arbiter.spawn_dirty_arbiter() + + # Verify DirtyArbiter was called with pidfile parameter + mock_dirty_arbiter.assert_called_once() + call_kwargs = mock_dirty_arbiter.call_args[1] + assert call_kwargs.get('pidfile') == pidfile_path diff --git a/tests/test_dirty_arbiter.py b/tests/test_dirty_arbiter.py index 11bcd796..f6c855c7 100644 --- a/tests/test_dirty_arbiter.py +++ b/tests/test_dirty_arbiter.py @@ -120,6 +120,32 @@ class TestDirtyArbiterInit: # Cleanup arbiter._cleanup_sync() + def test_init_with_pidfile(self): + """Test initialization with pidfile parameter.""" + cfg = Config() + log = MockLog() + + with tempfile.TemporaryDirectory() as tmpdir: + pidfile = os.path.join(tmpdir, "dirty.pid") + arbiter = DirtyArbiter(cfg=cfg, log=log, pidfile=pidfile) + + assert arbiter.pidfile == pidfile + + # Cleanup + arbiter._cleanup_sync() + + def test_init_without_pidfile(self): + """Test initialization without pidfile parameter defaults to None.""" + cfg = Config() + log = MockLog() + + arbiter = DirtyArbiter(cfg=cfg, log=log) + + assert arbiter.pidfile is None + + # Cleanup + arbiter._cleanup_sync() + class TestDirtyArbiterCleanup: """Tests for arbiter cleanup.""" @@ -157,6 +183,100 @@ class TestDirtyArbiterCleanup: assert not os.path.exists(tmpdir) + def test_cleanup_removes_pidfile(self): + """Test that cleanup removes the PID file.""" + cfg = Config() + log = MockLog() + + with tempfile.TemporaryDirectory() as tmpdir: + pidfile = os.path.join(tmpdir, "dirty.pid") + arbiter = DirtyArbiter(cfg=cfg, log=log, pidfile=pidfile) + + # Create pidfile + with open(pidfile, 'w') as f: + f.write('12345') + + assert os.path.exists(pidfile) + + arbiter._cleanup_sync() + + assert not os.path.exists(pidfile) + + def test_cleanup_handles_missing_pidfile(self): + """Test that cleanup handles non-existent pidfile gracefully.""" + cfg = Config() + log = MockLog() + + with tempfile.TemporaryDirectory() as tmpdir: + pidfile = os.path.join(tmpdir, "nonexistent.pid") + arbiter = DirtyArbiter(cfg=cfg, log=log, pidfile=pidfile) + + # Don't create the file + assert not os.path.exists(pidfile) + + # Should not raise + arbiter._cleanup_sync() + + def test_cleanup_without_pidfile(self): + """Test that cleanup works when no pidfile configured.""" + cfg = Config() + log = MockLog() + + arbiter = DirtyArbiter(cfg=cfg, log=log) + assert arbiter.pidfile is None + + # Should not raise + arbiter._cleanup_sync() + + +class TestDirtyArbiterPidfileWrite: + """Tests for PID file writing during run().""" + + def test_run_writes_pidfile(self): + """Test that run() writes the PID to the pidfile.""" + from unittest import mock + + cfg = Config() + cfg.set("dirty_workers", 0) + cfg.set("dirty_apps", ["tests.support_dirty_app:TestDirtyApp"]) + log = MockLog() + + with tempfile.TemporaryDirectory() as tmpdir: + pidfile = os.path.join(tmpdir, "dirty.pid") + arbiter = DirtyArbiter(cfg=cfg, log=log, pidfile=pidfile) + + # Track if PID file was written correctly + pid_written = None + + def mock_asyncio_run(coro): + nonlocal pid_written + # At this point, PID file should have been written + if os.path.exists(pidfile): + with open(pidfile) as f: + pid_written = int(f.read().strip()) + + # Mock asyncio.run to check PID file before cleanup runs + with mock.patch.object(asyncio, 'run', side_effect=mock_asyncio_run): + arbiter.run() + + # Check PID was written correctly + assert pid_written == os.getpid() + + def test_run_without_pidfile_does_not_fail(self): + """Test that run() works when no pidfile configured.""" + from unittest import mock + + cfg = Config() + cfg.set("dirty_workers", 0) + cfg.set("dirty_apps", ["tests.support_dirty_app:TestDirtyApp"]) + log = MockLog() + + arbiter = DirtyArbiter(cfg=cfg, log=log) + + with mock.patch.object(asyncio, 'run'): + # Should not raise + arbiter.run() + class TestDirtyArbiterRouteRequest: """Tests for request routing."""