修复 Restart Agent processes 导致 CPU 100% 卡住不动的问题，拆分 Restart Agent processes 并优化 Clone Agent Repository 步骤

This commit is contained in:
jingrow 2025-05-14 19:47:24 +08:00
parent 2c20d4eec6
commit 5235bbbcc3
2 changed files with 97 additions and 2 deletions

View File

@ -6,6 +6,40 @@
repo: '{{ agent_repository_url }}'
dest: /home/jingrow/agent/repo
remote: upstream
depth: 1
force: yes
clone: yes
update: no
register: clone_result
retries: 3
delay: 10
until: clone_result is success
ignore_errors: yes
# Delete the checkout directory when the clone task (registered as
# `clone_result`) is marked failed, so no partial checkout is left behind.
# Errors from the removal itself are ignored.
- name: Clean up failed clone
  when: clone_result is failed
  become: true
  become_user: jingrow
  file:
    state: absent
    path: /home/jingrow/agent/repo
  ignore_errors: true
# After a clone that reported success, stat the checkout's .git entry and
# keep the result in `git_check`. The stat is re-tried up to 3 times, 5 s
# apart, until the entry exists; the task is skipped when the clone failed.
- name: Verify clone success
  when: clone_result is success
  become: true
  become_user: jingrow
  stat:
    path: /home/jingrow/agent/repo/.git
  register: git_check
  until: git_check.stat.exists
  retries: 3
  delay: 5
# Abort with an explicit message when the clone failed, or when it reported
# success but the .git entry recorded in `git_check` does not exist.
- name: Fail if clone not successful
  when: >-
    clone_result is failed
    or (clone_result is success and not git_check.stat.exists)
  fail:
    msg: "Failed to clone agent repository after multiple attempts"
- name: Install Agent
become: yes

View File

@ -81,7 +81,68 @@
# Run `docker info`; the task fails if the command exits non-zero.
# The command only queries information, so it must never be reported as a
# change (otherwise every run of the play shows this task as "changed").
- name: Get Docker Info
  command: docker info
  changed_when: false
# Stop every process in the `agent:` supervisor group (the trailing colon is
# supervisor's notation for a whole group). This replaces the former
# single-step "restarted" action: the stale `- name: Restart Agent processes`
# header and the duplicate `state: restarted` key are removed so the task has
# exactly one name and one target state.
# The stop is re-tried up to 3 times, 5 s apart; a final failure is ignored
# here and later tasks decide what to do via `stop_result`.
- name: Stop Agent processes
  supervisorctl:
    name: "agent:"
    state: stopped
  register: stop_result
  retries: 3
  delay: 5
  until: stop_result is success
  ignore_errors: true
# Poll `supervisorctl status agent:` once per second for up to 30 s and
# succeed as soon as no process in the group is still alive.
# A process counts as alive in any of the states RUNNING, STARTING, STOPPING
# or BACKOFF; checking RUNNING alone would let a process that is still
# shutting down (STOPPING) pass as "stopped".
# Exits 1 (task failure) if the group has not settled after 30 polls.
# Skipped when the preceding stop did not succeed.
- name: Wait for Agent processes to stop
  shell: |
    for i in $(seq 1 30); do
      if ! supervisorctl status agent: | grep -Eq "RUNNING|STARTING|STOPPING|BACKOFF"; then
        exit 0
      fi
      sleep 1
    done
    exit 1
  register: wait_result
  # Pure polling: never report this task as a change.
  changed_when: false
  when: stop_result is success
# Bring the agent processes up one at a time, in the order listed under
# `loop`. Each item is re-tried up to 3 times, 5 s apart, until its start
# succeeds. Failures are ignored here; the aggregated outcome is kept in
# `start_result` for the tasks that follow.
- name: Start Agent processes in order
  supervisorctl:
    state: started
    name: "{{ item }}"
  loop:
    - agent:redis
    - agent:web
    - agent:worker-0
    - agent:worker-1
  register: start_result
  until: start_result is success
  retries: 3
  delay: 5
  ignore_errors: true
# Poll the `agent:` group once per second for up to 30 s and succeed once
# the group is fully up: at least one process is RUNNING and none is in any
# other supervisor state.
# The status output is captured once per iteration so both checks look at the
# same snapshot (two separate `supervisorctl status` calls could observe
# different states). STARTING, BACKOFF, STOPPING and UNKNOWN are rejected in
# addition to STOPPED/FATAL/EXITED, so a group that is only partly started is
# not reported as ready.
# Exits 1 (task failure) if the group is not ready after 30 polls.
# Skipped when the preceding start did not succeed.
- name: Wait for Agent processes to be ready
  shell: |
    for i in $(seq 1 30); do
      status="$(supervisorctl status agent:)"
      if printf '%s\n' "$status" | grep -q "RUNNING" \
        && ! printf '%s\n' "$status" | grep -Eq "STOPPED|STARTING|BACKOFF|STOPPING|EXITED|FATAL|UNKNOWN"; then
        exit 0
      fi
      sleep 1
    done
    exit 1
  register: wait_ready_result
  # Pure polling: never report this task as a change.
  changed_when: false
  when: start_result is success
# Final check: ask supervisor to ensure the whole `agent:` group is in the
# started state, re-trying up to 3 times, 5 s apart. The error is ignored
# here and the outcome is kept in `verify_result`.
- name: Verify Agent processes status
  supervisorctl:
    state: started
    name: "agent:"
  register: verify_result
  until: verify_result is success
  retries: 3
  delay: 5
  ignore_errors: true
# Abort with an explicit message when the final verification of the agent
# group (recorded in `verify_result`) did not succeed.
- name: Fail if Agent processes not running
  when: verify_result is failed
  fail:
    msg: "Agent processes failed to start properly"