feat(companion): Add lifecycle logs for companion transitions

Fill the gaps in the manager's lifecycle logging. Every reaped companion now
logs how it exited (by signal or with an exit status) before its fate is
decided, and handle_exit logs the decision: restarting, stopped (when the stop
was requested on purpose), or backing off with the retry delay. stop_all
brackets shutdown with 'stopping all companions' and 'all companions stopped',
run() logs when the manager stops, and reread_config logs a summary of the
added/removed/restarted/unchanged companions.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Tanmoy Sarkar 2026-06-09 23:01:20 +05:30
parent 9c4d81726d
commit 465aff870d
3 changed files with 32 additions and 2 deletions

View File

@ -694,7 +694,7 @@ No per-companion logic in Arbiter.
- [x] Close Gunicorn-only fds in manager child.
- [x] Close manager-only fds in companion child.
- [x] Add parent-death cleanup.
- [ ] Add lifecycle logs.
- [x] Add lifecycle logs.
- [ ] Add tests for config validation.
- [ ] Add tests for state transitions.
- [ ] Add tests for control commands.

View File

@ -98,6 +98,7 @@ class CompanionManager:
finally:
if self.control is not None:
self.control.close()
self.log.info("companion manager stopped (pid %s)", self.pid)
def _parent_gone(self) -> bool:
"""True once the arbiter that forked the manager has exited."""
@ -164,6 +165,7 @@ class CompanionManager:
deadlines until they are all gone, so the manager exits without leaving
orphaned companions behind.
"""
self.log.info("stopping all companions")
for name in list(self.processes):
self.stop_process(name)
while any(process.pid is not None for process in self.processes.values()):
@ -171,6 +173,7 @@ class CompanionManager:
self.enforce_deadlines(now)
self.reap_processes()
self._wait(timeout=0.2)
self.log.info("all companions stopped")
def _install_signals(self) -> None:
"""Set up the self-pipe and signal handlers for the supervision loop."""
@ -283,6 +286,9 @@ class CompanionManager:
self.restart_process(name)
restarted.append(name)
self.log.info(
"companion reread applied: added %s, removed %s, restarted %s, unchanged %s",
added, removed, restarted, unchanged)
return {"ok": True, "added": added, "removed": removed,
"restarted": restarted, "unchanged": unchanged}
@ -474,10 +480,20 @@ class CompanionManager:
process = self._process_by_pid(pid)
if process is not None:
self._record_exit(process, status)
self._log_exit(process)
self.handle_exit(process)
reaped.append(process)
return reaped
def _log_exit(self, process: CompanionProcess) -> None:
    """Report, at info level, how a reaped companion terminated.

    Emits exactly one log line. When ``process.last_exit_signal`` is set the
    line names that signal; otherwise it reports ``process.last_exit_code``.
    This only logs -- it does not change the companion's state, which is
    decided separately after the exit has been reported.
    """
    exit_signal = process.last_exit_signal
    if exit_signal is None:
        # No signal recorded: report the exit status instead.
        self.log.info("companion %s exited with status %s",
                      process.name, process.last_exit_code)
        return
    self.log.info("companion %s exited on signal %s",
                  process.name, exit_signal)
def handle_exit(self, process: CompanionProcess, now: float = None) -> None:
"""Decide a companion's fate after it exits: restart, stop, or back off.
@ -492,15 +508,17 @@ class CompanionManager:
if process.restart_pending:
process.restart_pending = False
process.restart_count += 1
self.log.info("companion %s restarting", process.name)
self.spawn_process(process)
return
if process.manual_stop:
process.state = State.STOPPED
process.next_retry_at = None
self.log.info("companion %s stopped", process.name)
return
process.state = State.BACKOFF
process.next_retry_at = now + process.restart_delay
self.log.info("companion %s exited, retrying in %ss",
self.log.info("companion %s backing off, retrying in %ss",
process.name, process.restart_delay)
def retry_backoff(self, now: float = None) -> list:

View File

@ -40,6 +40,18 @@ def test_resolve_target_rejects_bad_string():
CompanionManager._resolve_target("no_colon")
def test_log_exit_reports_signal_or_status():
    """_log_exit words its message by 'signal' or 'status' depending on the exit."""
    manager = make_manager("rq")
    companion = manager.processes["rq"]

    # First exit: a signal is recorded and there is no exit code.
    companion.last_exit_signal = 9
    companion.last_exit_code = None
    manager._log_exit(companion)

    # Second exit: an exit code is recorded and there is no signal.
    companion.last_exit_signal = None
    companion.last_exit_code = 1
    manager._log_exit(companion)

    # Inspect only the format strings (first positional argument) of every
    # info() call; presumably manager.log is a mock supplied by make_manager.
    formats = [logged.args[0] for logged in manager.log.info.call_args_list]
    for keyword in ("signal", "status"):
        assert any(keyword in fmt for fmt in formats)
def test_set_parent_death_signal_noop_off_linux():
with mock.patch("sys.platform", "darwin"):
assert set_parent_death_signal(signal.SIGTERM) is False