feat(companion): Implement restart_process control command

Add restart_process(name) following supervisor's restart rules: it always
clears manual_stop. RUNNING/STARTING are sent their stop_signal and enter
STOPPING with restart_pending set and a deadline from reload_timeout; the
reaper respawns them immediately once the old child exits. BACKOFF and STOPPED
start again right away. STOPPING is rejected. It never rereads config.

handle_exit now honors restart_pending first, respawning immediately (bumping
restart_count) instead of going to STOPPED or BACKOFF. Add a restart_pending
field on CompanionProcess.

Add tests for the running, pending-reap, stopped, backoff, and stopping cases.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Tanmoy Sarkar 2026-06-09 18:10:40 +05:30
parent 8d9eb76e3d
commit 8e0ca34277
4 changed files with 96 additions and 4 deletions

View File

@ -679,7 +679,7 @@ No per-companion logic in Arbiter.
- [x] Implement `BACKOFF` with fixed `companion_restart_delay`.
- [x] Implement `start_process`.
- [x] Implement `stop_process`.
- [ ] Implement `restart_process`.
- [x] Implement `restart_process`.
- [ ] Preserve and clear `manual_stop` correctly.
- [ ] Add Unix control socket.
- [ ] Implement JSON command protocol.

View File

@ -107,6 +107,34 @@ class CompanionManager:
self.log.info("companion %s stopping (pid %s)", name, proc.pid)
return True, "%s stopping" % name
def restart_process(self, name: str, now: float = None):
    """Restart a companion by name (the control ``restart`` command).

    Always clears ``manual_stop`` (except when the request is rejected) so
    the companion comes back. A live companion (RUNNING or STARTING) is
    sent its configured ``stop_signal`` and moves to STOPPING with
    ``restart_pending`` set and ``stop_deadline`` set to
    ``now + reload_timeout``; the exit handler respawns it once the old
    child has been reaped. A companion in any other state except STOPPING
    (e.g. BACKOFF or STOPPED) is spawned immediately and its pending retry
    time is cleared. This never rereads config.

    Args:
        name: Key of the companion in ``self.processes``.
        now: Timestamp used as the base of the stop deadline. Defaults to
            ``time.time()`` when omitted; an explicit ``0.0`` is honored.

    Returns:
        ``(ok, message)``. ``ok`` is False for an unknown name or for a
        companion that is already STOPPING (the caller should retry).

    Raises:
        Whatever ``os.kill`` or ``self._signal_number`` raise; such errors
        are not caught here.
    """
    proc = self.processes.get(name)
    if proc is None:
        return False, "unknown companion %s" % name
    if proc.state == State.STOPPING:
        # A stop is already in flight; do not stack a second signal on it.
        return False, "%s is stopping; retry" % name

    # An explicit restart overrides any earlier manual stop.
    proc.manual_stop = False

    if proc.state in (State.RUNNING, State.STARTING):
        # Compare against None so a caller-supplied falsy timestamp
        # (e.g. 0.0 from a fake clock) is not replaced by wall-clock time.
        if now is None:
            now = time.time()
        # Flag the restart before signalling so the exit handler respawns
        # the companion instead of parking it in STOPPED or BACKOFF.
        proc.restart_pending = True
        os.kill(proc.pid, self._signal_number(proc.config.stop_signal))
        proc.state = State.STOPPING
        proc.stop_deadline = now + proc.config.reload_timeout
        self.log.info("companion %s restarting (pid %s)", name, proc.pid)
        return True, "%s restarting" % name

    # No live child to stop: drop any scheduled retry and start right away.
    proc.next_retry_at = None
    self.spawn_process(proc)
    return True, "%s started" % name
@staticmethod
def _signal_number(sig) -> int:
"""Resolve a stop signal to its number, e.g. ``"SIGTERM"`` -> 15.
@ -157,14 +185,21 @@ class CompanionManager:
return reaped
def handle_exit(self, proc: CompanionProcess, now: float = None) -> None:
"""Decide a companion's fate after it exits: stay stopped or back off.
"""Decide a companion's fate after it exits: restart, stop, or back off.
A companion that was stopped on purpose settles in STOPPED and stays
there. Any other exit is unexpected, so it enters BACKOFF and is
A pending restart wins: the old child was asked to stop only so a fresh
one could take its place, so it is respawned immediately. Otherwise a
companion that was stopped on purpose settles in STOPPED and stays
there, and any other exit is unexpected, so it enters BACKOFF and is
scheduled to restart after a fixed ``restart_delay`` (no exponential
backoff, no retry cap).
"""
now = now or time.time()
if proc.restart_pending:
proc.restart_pending = False
proc.restart_count += 1
self.spawn_process(proc)
return
if proc.manual_stop:
proc.state = State.STOPPED
proc.next_retry_at = None

View File

@ -120,6 +120,7 @@ class CompanionProcess:
self.last_exit_signal = None
self.manual_stop = False
self.restart_pending = False
@property
def name(self):

View File

@ -218,6 +218,62 @@ def test_signal_number_rejects_bad():
CompanionManager._signal_number("SIGTRM")
def test_restart_process_running_stops_with_reload_timeout():
    """Restarting a RUNNING companion signals it and parks it in STOPPING.

    The stop deadline must be ``now + reload_timeout`` and a previously set
    ``manual_stop`` flag must be cleared.
    """
    manager = make_manager("rq")
    companion = manager.processes["rq"]
    companion.pid = 90
    companion.state = State.RUNNING
    companion.config.reload_timeout = 30
    # Begin from a manual stop so the test proves restart clears the flag.
    companion.manual_stop = True

    with mock.patch("os.kill") as fake_kill:
        ok, _message = manager.restart_process("rq", now=300.0)

    fake_kill.assert_called_once_with(90, signal.SIGTERM)
    assert ok
    assert companion.state == State.STOPPING
    assert companion.restart_pending is True
    # 300.0 (now) + 30 (reload_timeout)
    assert companion.stop_deadline == 330.0
    assert companion.manual_stop is False
def test_restart_pending_reap_respawns_immediately():
    """Reaping a STOPPING companion with a pending restart respawns it."""
    manager = make_manager("rq")
    companion = manager.processes["rq"]
    companion.pid = 91
    companion.state = State.STOPPING
    companion.restart_pending = True

    # waitpid first reports pid 91 as exited, then (0, 0); fork returns the
    # pid the respawned companion is expected to carry.
    waitpid_patch = mock.patch("os.waitpid", side_effect=[(91, 0), (0, 0)])
    fork_patch = mock.patch("os.fork", return_value=92)
    with waitpid_patch, fork_patch:
        manager.reap_processes()

    assert companion.state == State.STARTING
    assert companion.pid == 92
    assert companion.restart_pending is False
    assert companion.restart_count == 1
def test_restart_process_stopped_starts_now():
    """Restarting a freshly made companion spawns it without signalling.

    The companion is left in whatever state ``make_manager`` gives it
    (presumably STOPPED, per the test name -- confirm against the fixture).
    """
    manager = make_manager("rq")
    companion = manager.processes["rq"]

    fork_patch = mock.patch("os.fork", return_value=93)
    kill_patch = mock.patch("os.kill")
    with fork_patch, kill_patch as fake_kill:
        ok, _message = manager.restart_process("rq")

    # Nothing was running, so no stop signal may be sent.
    fake_kill.assert_not_called()
    assert ok
    assert companion.state == State.STARTING
def test_restart_process_backoff_starts_now():
    """Restarting a BACKOFF companion spawns it and drops the retry time."""
    manager = make_manager("rq")
    companion = manager.processes["rq"]
    companion.next_retry_at = 999.0
    companion.state = State.BACKOFF

    with mock.patch("os.fork", return_value=94):
        ok, _message = manager.restart_process("rq")

    assert ok
    assert companion.state == State.STARTING
    # The scheduled retry is superseded by the immediate start.
    assert companion.next_retry_at is None
def test_restart_process_stopping_rejected():
    """A companion that is already STOPPING refuses the restart request."""
    manager = make_manager("rq")
    companion = manager.processes["rq"]
    companion.state = State.STOPPING

    ok, message = manager.restart_process("rq")

    assert not ok
    assert "stopping" in message
def test_handle_exit_unexpected_backoff():
mgr = make_manager("rq")
proc = mgr.processes["rq"]