317 lines
12 KiB
YAML
317 lines
12 KiB
YAML
---
# Single play that dispatches on the `action` variable; tasks are gated on its value.
- name: "STRESS-SETUP: Unified control for stress test processes"
  # Actions listed in master_only_actions target 'master'; every other action targets 'all'.
  hosts: "{{ 'master' if action in master_only_actions else 'all' }}"
  gather_facts: false
  vars:
    # Default action is a status check.
    action: status
    # Seconds to wait between the plain kill and the kill -9 in the stop actions.
    graceful_shutdown_timeout_seconds: 30
    setup_policy: policies/6_profile_setup_policy.yaml
    enforcer_policy: policies/8_unified_simulation_enforcer.yaml
    # Actions that select only the 'master' host pattern.
    master_only_actions:
      - check-enforcer
      - profile-status
      - run-command
      - cleanup-profiles
      - restart-monitoring
  vars_files:
    - group_vars/all/vault.yml
|
|
pre_tasks:
|
|
- name: Set inventory_env fact
  # Skipped only when a master-only action meets a host outside the 'master' group.
  when: not (action in master_only_actions and inventory_hostname not in groups['master'])
  ansible.builtin.set_fact:
    # Inventory file name without its last extension and with 'inventory.' removed,
    # e.g. 'inventory.prod.ini' becomes 'prod'.
    # NOTE(review): a file named 'inventory.ini' yields 'inventory', not an empty
    # string - confirm that is the intended result.
    inventory_env: >-
      {{ inventory_file | basename | splitext | first | replace('inventory.', '') }}
|
|
- name: Load environment-specific variables
  # Skipped only when a master-only action meets a host outside the 'master' group.
  when: not (action in master_only_actions and inventory_hostname not in groups['master'])
  # Includes each file matched by the glob; the '.<inventory_env>' suffix is
  # left out of the file name when inventory_env is empty.
  ansible.builtin.include_vars: "{{ item }}"
  with_fileglob:
    - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
|
|
tasks:
|
|
- name: Define base directory for node
  # Skipped only when a master-only action meets a host outside the 'master' group.
  when: not (action in master_only_actions and inventory_hostname not in groups['master'])
  ansible.builtin.set_fact:
    # Hosts in the 'master' group get airflow_master_dir; all others get airflow_worker_dir.
    base_dir: >-
      {{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}
|
|
|
|
- name: Check running processes (status action)
  when: action == "status"
  # Report only: lists tmux sessions and the ytops-client / python processes
  # matching the grep patterns below. Each section prints a fallback line when
  # nothing matches.
  ansible.builtin.shell:
    cmd: |
      echo "=== Process status on {{ inventory_hostname }} ==="
      echo "1. Tmux sessions:"
      tmux list-sessions 2>/dev/null || echo "No tmux sessions"
      echo ""
      echo "2. ytops-client processes:"
      ps aux | grep -E "ytops-client.*(profile|policy-enforcer|stress-policy)" | grep -v grep || echo "No ytops-client processes"
      echo ""
      echo "3. Python processes related to stress test:"
      ps aux | grep -E "python.*(ytops|stress)" | grep -v grep || echo "No related python processes"
  changed_when: false
  register: status_output
|
|
|
|
- name: Display status
  when: action == "status"
  # Prints the report captured in status_output, one list entry per output line.
  ansible.builtin.debug:
    msg: "{{ status_output.stdout_lines }}"
|
|
|
|
- name: Check enforcer status (check-enforcer action)
  when: action == "check-enforcer"
  # Executed a single time, on the first host of the 'master' group.
  delegate_to: "{{ groups['master'][0] }}"
  run_once: true
  block:
    - name: Check for enforcer tmux session and process
      # Reports whether the 'stress-enforcer' tmux session exists and, if so,
      # lists the matching policy-enforcer process or warns that none was found.
      ansible.builtin.shell:
        cmd: |
          echo "Enforcer status on {{ inventory_hostname }}:"
          if tmux has-session -t stress-enforcer 2>/dev/null; then
            echo " - Tmux session 'stress-enforcer' is RUNNING."
            ps aux | grep -E "ytops-client.*policy-enforcer" | grep -v grep || echo " - WARNING: Tmux session exists, but no matching process found."
          else
            echo " - Tmux session 'stress-enforcer' is NOT RUNNING."
          fi
      changed_when: false
      register: enforcer_status

    - name: Display enforcer status
      ansible.builtin.debug:
        msg: "{{ enforcer_status.stdout_lines }}"
|
|
|
|
- name: Run ytops-client profile list on master (profile-status action)
  # Lists profiles from the master directory, once, on the first 'master' host.
  ansible.builtin.shell:
    # chdir replaces an unquoted, unchecked `cd`: a path containing spaces works
    # and a missing directory fails the task before anything is executed.
    chdir: "{{ airflow_master_dir }}"
    cmd: |
      # Export every variable from .env (if present) into the command's environment.
      if [ -f .env ]; then
        set -a && . ./.env && set +a
      fi
      # Bound the call to 10 seconds and merge stderr into the captured output.
      timeout 10 ./bin/ytops-client profile list 2>&1
  register: profile_list_output
  changed_when: false
  when:
    - action == "profile-status"
    - inventory_hostname in groups['master']
  delegate_to: "{{ groups['master'][0] }}"
  run_once: true
|
|
|
|
- name: Show profile list output lines
  # Prints the registered output of the profile list command on 'master' hosts.
  when: >-
    action == "profile-status"
    and profile_list_output is defined
    and inventory_hostname in groups['master']
  ansible.builtin.debug:
    var: profile_list_output.stdout_lines
|
|
|
|
- name: Run custom ytops-client command on master (run-command action)
  # Runs the caller-supplied `command_to_run` verbatim through the shell, once,
  # on the first 'master' host. Skipped when `command_to_run` is not defined.
  ansible.builtin.shell:
    # chdir replaces an unchecked `cd`: previously, if the directory change
    # failed, the supplied command still ran in whatever directory the shell
    # started in. Now a missing directory fails the task up front.
    chdir: "{{ airflow_master_dir }}"
    cmd: |
      # Export every variable from .env (if present) into the command's environment.
      if [ -f .env ]; then
        set -a && . ./.env && set +a
      fi
      {{ command_to_run }}
  register: command_output
  changed_when: false
  when:
    - action == "run-command"
    - inventory_hostname in groups['master']
    - command_to_run is defined
  delegate_to: "{{ groups['master'][0] }}"
  run_once: true
|
|
|
|
- name: Display command output
  # Header line printed before the command's output lines.
  ansible.builtin.debug:
    msg: "Command output:"
  when:
    - action == "run-command"
    - command_output is defined
    # A registered variable is also defined when its task was skipped (e.g. no
    # command_to_run was given); only a task that ran provides stdout_lines.
    - command_output.stdout_lines is defined
|
|
|
|
- name: Show command output lines
  # Prints the captured output of the custom command, one entry per line.
  ansible.builtin.debug:
    var: command_output.stdout_lines
  when:
    - action == "run-command"
    - command_output is defined
    # A registered variable is also defined when its task was skipped (e.g. no
    # command_to_run was given); only a task that ran provides stdout_lines.
    - command_output.stdout_lines is defined
|
|
|
|
- name: "Display policy being used for profile cleanup"
  # Announces which cleanup variant is about to run and with which policy file.
  ansible.builtin.debug:
    # `| bool` is required on dry_run: a value passed as `-e dry_run=false`
    # arrives as the non-empty string 'false', which is truthy without the cast.
    msg: >-
      {% if cleanup_mode | default('ungrouped') == 'full' %}
      Performing a FULL cleanup of all profiles using policy: {{ setup_policy }}
      {% else %}
      Cleaning up UNGROUPED profiles using policy: {{ setup_policy }}{% if dry_run | default(false) | bool %} (DRY RUN){% endif %}
      {% endif %}
  when:
    - action == "cleanup-profiles"
    - inventory_hostname in groups['master']
  delegate_to: "{{ groups['master'][0] }}"
  run_once: true
|
|
|
|
- name: Cleanup profiles on master (cleanup-profiles action)
  # cleanup_mode == 'full' runs `setup-profiles --cleanup-all`; any other value
  # (default 'ungrouped') runs `profile cleanup-ungrouped`, optionally with --dry-run.
  # Runs once, on the first 'master' host.
  ansible.builtin.shell:
    # chdir replaces an unquoted, unchecked `cd`: a missing directory now fails
    # the task before any cleanup command is attempted.
    chdir: "{{ airflow_master_dir }}"
    cmd: |
      # Export every variable from .env (if present) into the command's environment.
      if [ -f .env ]; then
        set -a && . ./.env && set +a
      fi
      {% if cleanup_mode | default('ungrouped') == 'full' %}
      ./bin/ytops-client setup-profiles \
        --policy {{ setup_policy }} \
        --cleanup-all
      {% else %}
      # `| bool` makes `-e dry_run=false` (a truthy string) behave as false. The
      # flag sits on the same line so no line continuation is left dangling.
      ./bin/ytops-client profile cleanup-ungrouped \
        --policy-file {{ setup_policy }} {{ '--dry-run' if dry_run | default(false) | bool else '' }}
      {% endif %}
  register: cleanup_output
  changed_when: false
  when:
    - action == "cleanup-profiles"
    - inventory_hostname in groups['master']
  delegate_to: "{{ groups['master'][0] }}"
  run_once: true
|
|
|
|
- name: Display cleanup output
  # Header line printed before the cleanup command's output lines.
  when: action == "cleanup-profiles" and cleanup_output is defined
  ansible.builtin.debug:
    msg: "Cleanup output:"
|
|
|
|
- name: Show cleanup output lines
  # Prints the captured output of the cleanup command, one entry per line.
  when: action == "cleanup-profiles" and cleanup_output is defined
  ansible.builtin.debug:
    var: cleanup_output.stdout_lines
|
|
|
|
- name: Stop all stress test processes on all nodes (stop-all action)
  # Sequence: collect PIDs, plain kill, wait, kill -9 whatever is left, then
  # remove 'stress-' tmux sessions.
  vars:
    # The [y] / [p] character classes match 'y' / 'p' but keep the literal
    # pattern text from matching itself in the process list.
    process_pattern: "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)|[p]ython.*ytops"
  block:
    - name: "Get PIDs of running stress test processes"
      ansible.builtin.shell:
        cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}'"
      register: pids_to_kill
      changed_when: false

    - name: "Gracefully terminate stress test processes"
      ansible.builtin.shell:
        cmd: "kill {{ pids_to_kill.stdout_lines | join(' ') }}"
      when: pids_to_kill.stdout | length > 0
      ignore_errors: true
      changed_when: false

    - name: "Wait for graceful shutdown"
      # wait_for with only `timeout` sleeps on each host. The previous `pause`
      # runs once for the whole batch and evaluates `when` against the first
      # host only, so the grace period was skipped everywhere if that host had
      # no PIDs.
      ansible.builtin.wait_for:
        timeout: "{{ graceful_shutdown_timeout_seconds }}"
      when: pids_to_kill.stdout | length > 0

    - name: "Force kill any lingering stress test processes"
      ansible.builtin.shell:
        cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true"
      ignore_errors: true
      changed_when: false

    - name: "Kill all stress-related tmux sessions as a failsafe"
      ansible.builtin.shell:
        cmd: |
          for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
            tmux kill-session -t "$session"
          done || true
          # Failsafe: kill any lingering tmux server processes for stress sessions
          TMUX_PIDS_TO_KILL=$(ps aux | grep '[t]mux' | grep 'stress-' | grep -v 'grep' | grep -v 'ansible' | awk '{print $2}')
          if [ -n "$TMUX_PIDS_TO_KILL" ]; then
            kill -9 $TMUX_PIDS_TO_KILL >/dev/null 2>&1 || true
          fi
      ignore_errors: true
      changed_when: false
  when: action == "stop-all"
|
|
|
|
- name: Stop processes on targeted nodes only (stop-nodes action)
  # Sequence: collect PIDs, plain kill, wait, kill -9 whatever is left, then
  # remove 'stress-' tmux sessions.
  vars:
    # The [y] / [p] character classes match 'y' / 'p' but keep the literal
    # pattern text from matching itself in the process list.
    process_pattern: "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)|[p]ython.*ytops"
  block:
    - name: "Get PIDs of running stress test processes"
      ansible.builtin.shell:
        cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}'"
      register: pids_to_kill
      changed_when: false

    - name: "Gracefully terminate stress test processes"
      ansible.builtin.shell:
        cmd: "kill {{ pids_to_kill.stdout_lines | join(' ') }}"
      when: pids_to_kill.stdout | length > 0
      ignore_errors: true
      changed_when: false

    - name: "Wait for graceful shutdown"
      # wait_for with only `timeout` sleeps on each host. The previous `pause`
      # runs once for the whole batch and evaluates `when` against the first
      # host only, so the grace period was skipped everywhere if that host had
      # no PIDs.
      ansible.builtin.wait_for:
        timeout: "{{ graceful_shutdown_timeout_seconds }}"
      when: pids_to_kill.stdout | length > 0

    - name: "Force kill any lingering stress test processes"
      ansible.builtin.shell:
        cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true"
      ignore_errors: true
      changed_when: false

    - name: "Kill all stress-related tmux sessions as a failsafe"
      ansible.builtin.shell:
        cmd: |
          for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
            tmux kill-session -t "$session"
          done || true
          # Failsafe: kill any lingering tmux server processes for stress sessions
          TMUX_PIDS_TO_KILL=$(ps aux | grep '[t]mux' | grep 'stress-' | grep -v 'grep' | grep -v 'ansible' | awk '{print $2}')
          if [ -n "$TMUX_PIDS_TO_KILL" ]; then
            kill -9 $TMUX_PIDS_TO_KILL >/dev/null 2>&1 || true
          fi
      ignore_errors: true
      changed_when: false
  when: action == "stop-nodes"
|
|
|
|
- name: Restart monitoring and enforcer (restart-monitoring action)
  # Only the first host of the 'master' group runs this block.
  when:
    - action == "restart-monitoring"
    - inventory_hostname == groups['master'][0]
  # Each step includes manage-processes-tasks.yml with its own variables:
  # both processes are stopped first, then both are started again.
  block:
    - name: Stop monitor process
      vars:
        stop_process: true
        tmux_session_name: "stress-monitor"
        process_grep_pattern: "ytops-client.*profile.*list"
      ansible.builtin.include_tasks:
        file: manage-processes-tasks.yml

    - name: Stop enforcer process
      vars:
        stop_process: true
        tmux_session_name: "stress-enforcer"
        process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
      ansible.builtin.include_tasks:
        file: manage-processes-tasks.yml

    - name: Start monitor process
      vars:
        start_process: true
        tmux_session_name: "stress-monitor"
        working_dir: "{{ airflow_master_dir }}"
        process_grep_pattern: "ytops-client.*profile.*list"
        # Folded scalar: the lines below are joined into one command line.
        command_to_run: >
          ./bin/ytops-client profile list
          --live
          --no-blink
          --show-reasons
      ansible.builtin.include_tasks:
        file: manage-processes-tasks.yml

    - name: Start enforcer process
      vars:
        start_process: true
        tmux_session_name: "stress-enforcer"
        working_dir: "{{ airflow_master_dir }}"
        process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
        # Folded scalar: the lines below are joined into one command line.
        command_to_run: >
          ./bin/ytops-client policy-enforcer
          --policy {{ enforcer_policy }}
          --live
      ansible.builtin.include_tasks:
        file: manage-processes-tasks.yml
|