---
# Unified control playbook for the stress-test processes.
#
# The behaviour is selected with the `action` variable (default: "status"):
#   status              - list tmux sessions and ytops/python processes
#   check-enforcer      - report whether the `stress-enforcer` tmux session runs
#   profile-status      - run `ytops-client profile list` on the master
#   run-command         - run the shell snippet given in `command_to_run`
#   cleanup-profiles    - clean up profiles; honours `cleanup_mode` ('full' or
#                         anything else = ungrouped) and `dry_run`
#   stop-all/stop-nodes - terminate stress-test processes and tmux sessions on
#                         every host targeted by the play
#   restart-monitoring  - stop and start the monitor and enforcer sessions
#
# Actions listed in `master_only_actions` restrict the play to the `master`
# group; every other action targets `all`.
#
# `airflow_master_dir` / `airflow_worker_dir` are not defined in this file;
# presumably they come from the generated vars loaded in pre_tasks - confirm.
- name: "STRESS-SETUP: Unified control for stress test processes"
  hosts: "{{ 'master' if action in master_only_actions else 'all' }}"
  gather_facts: false

  vars:
    # Default action is status check
    action: "status"
    graceful_shutdown_timeout_seconds: 30
    setup_policy: "policies/6_profile_setup_policy.yaml"
    enforcer_policy: "policies/8_unified_simulation_enforcer.yaml"
    master_only_actions:
      - "check-enforcer"
      - "profile-status"
      - "run-command"
      - "cleanup-profiles"
      - "restart-monitoring"

  vars_files:
    - "group_vars/all/vault.yml"

  pre_tasks:
    # Derive the environment name from the inventory file name by dropping the
    # extension and the 'inventory.' prefix.
    - name: Set inventory_env fact
      ansible.builtin.set_fact:
        inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
      when: action not in master_only_actions or inventory_hostname in groups['master']

    # with_fileglob yields no items when the file does not exist, so a missing
    # environment-specific vars file is skipped rather than failing the play.
    - name: Load environment-specific variables
      ansible.builtin.include_vars: "{{ item }}"
      with_fileglob:
        - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
      when: action not in master_only_actions or inventory_hostname in groups['master']

  tasks:
    - name: Define base directory for node
      ansible.builtin.set_fact:
        base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
      when: action not in master_only_actions or inventory_hostname in groups['master']

    # ------------------------------------------------------------------ status
    - name: Check running processes (status action)
      ansible.builtin.shell:
        cmd: |
          echo "=== Process status on {{ inventory_hostname }} ==="
          echo "1. Tmux sessions:"
          tmux list-sessions 2>/dev/null || echo "No tmux sessions"
          echo ""
          echo "2. ytops-client processes:"
          ps aux | grep -E "ytops-client.*(profile|policy-enforcer|stress-policy)" | grep -v grep || echo "No ytops-client processes"
          echo ""
          echo "3. Python processes related to stress test:"
          ps aux | grep -E "python.*(ytops|stress)" | grep -v grep || echo "No related python processes"
      register: status_output
      changed_when: false
      when: action == "status"

    - name: Display status
      ansible.builtin.debug:
        msg: "{{ status_output.stdout_lines }}"
      when: action == "status"

    # ---------------------------------------------------------- check-enforcer
    - name: Check enforcer status (check-enforcer action)
      block:
        - name: Check for enforcer tmux session and process
          ansible.builtin.shell:
            cmd: |
              echo "Enforcer status on {{ inventory_hostname }}:"
              if tmux has-session -t stress-enforcer 2>/dev/null; then
                echo " - Tmux session 'stress-enforcer' is RUNNING."
                ps aux | grep -E "ytops-client.*policy-enforcer" | grep -v grep || echo " - WARNING: Tmux session exists, but no matching process found."
              else
                echo " - Tmux session 'stress-enforcer' is NOT RUNNING."
              fi
          register: enforcer_status
          changed_when: false

        - name: Display enforcer status
          ansible.builtin.debug:
            msg: "{{ enforcer_status.stdout_lines }}"
      when: action == "check-enforcer"
      delegate_to: "{{ groups['master'][0] }}"
      run_once: true

    # ---------------------------------------------------------- profile-status
    - name: Run ytops-client profile list on master (profile-status action)
      ansible.builtin.shell:
        # `set -a` exports every variable sourced from .env to the client;
        # `timeout 10` bounds the run time of the listing command.
        cmd: |
          cd {{ airflow_master_dir }}
          if [ -f .env ]; then
            set -a && . ./.env && set +a
          fi
          timeout 10 ./bin/ytops-client profile list 2>&1
      register: profile_list_output
      changed_when: false
      when:
        - action == "profile-status"
        - inventory_hostname in groups['master']
      delegate_to: "{{ groups['master'][0] }}"
      run_once: true

    - name: Show profile list output lines
      ansible.builtin.debug:
        var: profile_list_output.stdout_lines
      when:
        - action == "profile-status"
        - profile_list_output is defined
        - inventory_hostname in groups['master']

    # ------------------------------------------------------------- run-command
    # `command_to_run` is inserted into the shell script verbatim, so it is
    # executed as arbitrary shell on the master. The task is skipped when the
    # variable is not supplied.
    - name: Run custom ytops-client command on master (run-command action)
      ansible.builtin.shell:
        cmd: |
          cd {{ airflow_master_dir }}
          if [ -f .env ]; then
            set -a && . ./.env && set +a
          fi
          {{ command_to_run }}
      register: command_output
      changed_when: false
      when:
        - action == "run-command"
        - inventory_hostname in groups['master']
        - command_to_run is defined
      delegate_to: "{{ groups['master'][0] }}"
      run_once: true

    - name: Display command output
      ansible.builtin.debug:
        msg: "Command output:"
      when:
        - action == "run-command"
        - command_output is defined

    - name: Show command output lines
      ansible.builtin.debug:
        var: command_output.stdout_lines
      when:
        - action == "run-command"
        - command_output is defined

    # -------------------------------------------------------- cleanup-profiles
    # `dry_run` is passed through `| bool` because a value given on the command
    # line (`-e dry_run=false`) arrives as a non-empty string, which Jinja
    # would otherwise treat as true.
    - name: "Display policy being used for profile cleanup"
      ansible.builtin.debug:
        msg: >-
          {% if cleanup_mode | default('ungrouped') == 'full' %}
          Performing a FULL cleanup of all profiles using policy: {{ setup_policy }}
          {% else %}
          Cleaning up UNGROUPED profiles using policy: {{ setup_policy }}{% if dry_run | default(false) | bool %} (DRY RUN){% endif %}
          {% endif %}
      when:
        - action == "cleanup-profiles"
        - inventory_hostname in groups['master']
      delegate_to: "{{ groups['master'][0] }}"
      run_once: true

    - name: Cleanup profiles on master (cleanup-profiles action)
      ansible.builtin.shell:
        # The optional --dry-run flag sits on the same line as --policy-file so
        # that no line-continuation backslash is left dangling when it is off.
        # `dry_run` only affects the ungrouped cleanup branch.
        cmd: |
          cd {{ airflow_master_dir }}
          if [ -f .env ]; then
            set -a && . ./.env && set +a
          fi
          {% if cleanup_mode | default('ungrouped') == 'full' %}
          ./bin/ytops-client setup-profiles \
            --policy {{ setup_policy }} \
            --cleanup-all
          {% else %}
          ./bin/ytops-client profile cleanup-ungrouped \
            --policy-file {{ setup_policy }}{% if dry_run | default(false) | bool %} --dry-run{% endif %}

          {% endif %}
      register: cleanup_output
      changed_when: false
      when:
        - action == "cleanup-profiles"
        - inventory_hostname in groups['master']
      delegate_to: "{{ groups['master'][0] }}"
      run_once: true

    - name: Display cleanup output
      ansible.builtin.debug:
        msg: "Cleanup output:"
      when:
        - action == "cleanup-profiles"
        - cleanup_output is defined

    - name: Show cleanup output lines
      ansible.builtin.debug:
        var: cleanup_output.stdout_lines
      when:
        - action == "cleanup-profiles"
        - cleanup_output is defined

    # ---------------------------------------------------- stop-all / stop-nodes
    # Both actions run the same sequence on every host the play targets:
    # SIGTERM, wait, SIGKILL, then remove the stress-* tmux sessions.
    # Presumably `stop-nodes` is meant to be combined with `--limit` - confirm.
    - name: Stop stress test processes on targeted nodes (stop-all / stop-nodes actions)
      vars:
        # The bracketed first letters ([y], [p]) keep grep from matching its
        # own command line in the `ps` output.
        process_pattern: "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)|[p]ython.*ytops"
      block:
        - name: "Get PIDs of running stress test processes"
          ansible.builtin.shell:
            cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}'"
          register: pids_to_kill
          changed_when: false

        - name: "Gracefully terminate stress test processes"
          ansible.builtin.shell:
            cmd: "kill {{ pids_to_kill.stdout_lines | join(' ') }}"
          when: pids_to_kill.stdout | length > 0
          # A process may exit between the listing and the kill; best effort.
          ignore_errors: true
          changed_when: false

        # NOTE(review): ansible.builtin.pause may not be evaluated per host the
        # way ordinary tasks are - confirm the `when` behaves as intended when
        # only some hosts have PIDs to wait for.
        - name: "Wait for graceful shutdown"
          ansible.builtin.pause:
            seconds: "{{ graceful_shutdown_timeout_seconds }}"
          when: pids_to_kill.stdout | length > 0

        - name: "Force kill any lingering stress test processes"
          ansible.builtin.shell:
            cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true"
          ignore_errors: true
          changed_when: false

        - name: "Kill all stress-related tmux sessions as a failsafe"
          ansible.builtin.shell:
            cmd: |
              for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
                tmux kill-session -t "$session"
              done || true
              # Failsafe: kill any lingering tmux server processes for stress sessions
              TMUX_PIDS_TO_KILL=$(ps aux | grep '[t]mux' | grep 'stress-' | grep -v 'grep' | grep -v 'ansible' | awk '{print $2}')
              if [ -n "$TMUX_PIDS_TO_KILL" ]; then
                kill -9 $TMUX_PIDS_TO_KILL >/dev/null 2>&1 || true
              fi
          ignore_errors: true
          changed_when: false
      when: action in ['stop-all', 'stop-nodes']

    # ------------------------------------------------------ restart-monitoring
    # Stops both sessions first, then starts them again. The actual stop/start
    # logic lives in manage-processes-tasks.yml, which is not part of this file.
    - name: Restart monitoring and enforcer (restart-monitoring action)
      block:
        - name: Stop monitor process
          ansible.builtin.include_tasks:
            file: manage-processes-tasks.yml
          vars:
            tmux_session_name: "stress-monitor"
            process_grep_pattern: "ytops-client.*profile.*list"
            stop_process: true

        - name: Stop enforcer process
          ansible.builtin.include_tasks:
            file: manage-processes-tasks.yml
          vars:
            tmux_session_name: "stress-enforcer"
            process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
            stop_process: true

        - name: Start monitor process
          ansible.builtin.include_tasks:
            file: manage-processes-tasks.yml
          vars:
            tmux_session_name: "stress-monitor"
            working_dir: "{{ airflow_master_dir }}"
            command_to_run: >
              ./bin/ytops-client profile list --live --no-blink --show-reasons
            process_grep_pattern: "ytops-client.*profile.*list"
            start_process: true

        - name: Start enforcer process
          ansible.builtin.include_tasks:
            file: manage-processes-tasks.yml
          vars:
            tmux_session_name: "stress-enforcer"
            working_dir: "{{ airflow_master_dir }}"
            command_to_run: >
              ./bin/ytops-client policy-enforcer --policy {{ enforcer_policy }} --live
            process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
            start_process: true
      when:
        - action == "restart-monitoring"
        - inventory_hostname == groups['master'][0]