Use a single source for the profile list in cluster.yaml, fix single tmux worker naming, and update ytops to support workers without a profile list

aperez 2026-01-16 11:01:35 +03:00
parent bf12118b2b
commit 4fd9217c6d
24 changed files with 2794 additions and 1112 deletions
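The profile list now has a single source: the cluster config consumed by `tools/generate-profile-setup-policy.py` and the playbooks below. Based on the fields they read (`workers`, `profile_pools`, `prefixes`), a minimal excerpt might look like the following sketch; the worker names and prefixes are illustrative placeholders, not values from this commit.

```yaml
# Hypothetical cluster.green.yml excerpt (worker names and prefixes are examples only)
workers:
  worker-01:
    profile_pools:
      - prefixes: ["user1", "user2"]
  worker-02:
    profile_pools:
      - prefixes: ["user3"]
```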

View File

@ -108,12 +108,20 @@ ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.gree
### Profile Management
The `cleanup-profiles` action can be used to remove profiles from Redis. By default, it cleans up "ungrouped" profiles.
```bash
# Perform a dry run of cleaning up ungrouped profiles
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "dry_run=true"
# Clean up ungrouped profiles
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles"
# To clean up ALL profiles (destructive), set cleanup_mode=full
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "cleanup_mode=full"
# You can specify a custom setup policy file for cleanup operations
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "setup_policy=policies/my_custom_setup_policy.yaml"
```
## Monitoring and Inspection
@ -157,7 +165,19 @@ Then, from the jump host, you can sync code or policies to the cluster nodes:
ansible-playbook ansible/playbook-stress-sync-code.yml -i ansible/inventory.green.ini
# Sync only policies and CLI configs
ansible-playbook ansible/playbook-stress-sync-policies.yml -i ansible/inventory.green.ini
# To sync files from a custom source directory on the Ansible controller, use the 'source_base_dir' extra variable:
ansible-playbook ansible/playbook-stress-sync-policies.yml -i ansible/inventory.green.ini -e "source_base_dir=/path/to/my-custom-source"
```
### Docker Image Updates
To update the `yt-dlp` docker image used by download simulators, run the following playbook. This builds the image locally on each worker node.
```bash
# Build the yt-dlp docker image locally on each worker node
ansible-playbook ansible/playbook-update-yt-dlp-docker.yml -i ansible/inventory.green.ini
```
### Adding a New Worker
@ -225,7 +245,10 @@ ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.
### Restart Enforcer and Monitoring
```bash
# Restart monitoring and enforcer on master using default policies
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=restart-monitoring"
# Restart using a custom enforcer policy
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=restart-monitoring" -e "enforcer_policy=policies/my_other_enforcer.yaml"
```

View File

@ -1,4 +1,35 @@
---
# -------------------------------------------------------------------------------------------------
# PHASE 0: Fix Python Dependencies
# Ensures remote hosts have compatible Python libraries to prevent module failures.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 0: Upgrade Python SSL libraries"
hosts: all
gather_facts: no
tasks:
- name: Attempt to upgrade pyOpenSSL, cryptography and urllib3
ansible.builtin.shell:
cmd: "pip3 install --upgrade pyOpenSSL cryptography urllib3"
become: yes
register: pip_upgrade_result
changed_when: "'Successfully installed' in pip_upgrade_result.stdout"
failed_when: false
- name: Retry upgrade with --break-system-packages on specific error
ansible.builtin.shell:
cmd: "pip3 install --upgrade pyOpenSSL cryptography urllib3 --break-system-packages"
become: yes
register: pip_upgrade_result_retry
changed_when: "'Successfully installed' in pip_upgrade_result_retry.stdout"
when: pip_upgrade_result.rc != 0 and 'externally-managed-environment' in pip_upgrade_result.stderr
- name: Fail if package upgrade did not succeed
ansible.builtin.fail:
msg: "Failed to upgrade Python packages after retry. Last error: {{ pip_upgrade_result_retry.stderr | default(pip_upgrade_result.stderr) }}"
when: >
pip_upgrade_result.rc != 0 and
(pip_upgrade_result_retry is not defined or pip_upgrade_result_retry.rc != 0)
# This playbook provides a complete installation for fresh nodes.
# It can install either master or worker roles, or both on the same machine.
#
@ -55,57 +86,10 @@
# -------------------------------------------------------------------------------------------------
# PHASE 4: Build yt-dlp Docker Image
# Builds the yt-dlp container on each worker node using the dedicated playbook.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 4: Build yt-dlp Docker image on workers"
import_playbook: playbook-update-yt-dlp-docker.yml
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if (inventory_hostname in groups['master'] and not (install_worker | default(false) | bool)) else airflow_worker_dir }}"
- name: Ensure bin directory exists
ansible.builtin.file:
path: "{{ base_dir }}/bin"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Check if Dockerfile exists in bin directory
ansible.builtin.stat:
path: "{{ base_dir }}/bin/Dockerfile"
register: dockerfile_stat
- name: Build yt-dlp Docker image if Dockerfile exists
community.docker.docker_image:
name: yt-dlp-custom
tag: latest
source: build
build:
path: "{{ base_dir }}/bin"
pull: yes
state: present
force_source: yes
become: yes
when: dockerfile_stat.stat.exists
- name: Display message if Dockerfile not found
ansible.builtin.debug:
msg: "Dockerfile not found at {{ base_dir }}/bin/Dockerfile - skipping yt-dlp image build"
when: not dockerfile_stat.stat.exists
# -------------------------------------------------------------------------------------------------
# PHASE 5: Sync Code and Install Dependencies
@ -148,6 +132,12 @@
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Install redis-tools
ansible.builtin.apt:
name: redis-tools
state: present
become: yes
- name: Configure system performance and kernel settings
ansible.builtin.copy:
src: "configs/etc/sysctl.d/99-system-limits.conf"
@ -174,7 +164,7 @@
- name: Template Docker Compose file for master services
ansible.builtin.template:
src: docker-compose.stress-master.j2
dest: "{{ airflow_master_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
@ -184,9 +174,7 @@
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -a --filter "name=bgutil-provider" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f
docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
@ -200,6 +188,14 @@
remove_orphans: true
become: yes
- name: Wait for Redis service to be ready
ansible.builtin.wait_for:
host: localhost
port: "{{ redis_port }}"
delay: 5
timeout: 60
delegate_to: "{{ inventory_hostname }}"
- name: Wait for MinIO service to be ready
ansible.builtin.wait_for:
host: "{{ hostvars[inventory_hostname].ansible_host }}"
@ -240,7 +236,7 @@
register: mc_mb_result
failed_when: >
mc_mb_result.rc != 0 and
"already own it" not in mc_mb_result.stderr
changed_when: mc_mb_result.rc == 0
environment:
HOME: "/home/{{ ansible_user }}"
@ -264,7 +260,7 @@
tasks:
- name: Template Docker Compose file for worker services
ansible.builtin.template:
src: docker-compose.stress-master.j2
dest: "{{ airflow_worker_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
@ -274,9 +270,9 @@
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -a --filter "name=bgutil-provider" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f
docker ps -a --filter "name=redis-stress" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f
docker ps -a --filter "name=minio-stress" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes

View File

@ -0,0 +1,41 @@
---
- name: "STRESS-CLEANUP: Remove info.json task files from workers"
hosts: workers
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define the directory to be cleaned
ansible.builtin.set_fact:
target_dir: "{{ airflow_worker_dir }}/run/docker_mount/info_json_tasks/direct_docker_simulation"
- name: "Display directory being cleaned"
ansible.builtin.debug:
msg: "Cleaning directory: {{ target_dir }} on {{ inventory_hostname }}"
- name: Remove the info_json_tasks directory
ansible.builtin.file:
path: "{{ target_dir }}"
state: absent
become: yes
- name: Recreate the info_json_tasks directory
ansible.builtin.file:
path: "{{ target_dir }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: "Display cleanup completion"
ansible.builtin.debug:
msg: "Successfully cleaned and recreated {{ target_dir }} on {{ inventory_hostname }}"

View File

@ -1,10 +1,11 @@
---
- name: "STRESS-SETUP: Unified control for stress test processes"
hosts: "{{ 'master' if action in master_only_actions else 'all' }}"
gather_facts: no
vars:
# Default action is status check
action: "status"
graceful_shutdown_timeout_seconds: 30
setup_policy: "policies/6_profile_setup_policy.yaml"
enforcer_policy: "policies/8_unified_simulation_enforcer.yaml"
master_only_actions:
@ -80,9 +81,7 @@
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
timeout 10 ./bin/ytops-client profile list 2>&1
--auth-env sim_auth \
--download-env sim_download 2>&1
register: profile_list_output
changed_when: false
when:
@ -132,7 +131,12 @@
- name: "Display policy being used for profile cleanup" - name: "Display policy being used for profile cleanup"
ansible.builtin.debug: ansible.builtin.debug:
msg: "Using setup policy for cleanup: {{ setup_policy }}" msg: >-
{% if cleanup_mode | default('ungrouped') == 'full' %}
Performing a FULL cleanup of all profiles using policy: {{ setup_policy }}
{% else %}
Cleaning up UNGROUPED profiles using policy: {{ setup_policy }}{% if dry_run | default(false) %} (DRY RUN){% endif %}
{% endif %}
when:
- action == "cleanup-profiles"
- inventory_hostname in groups['master']
@ -146,9 +150,15 @@
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
{% if cleanup_mode | default('ungrouped') == 'full' %}
./bin/ytops-client setup-profiles \
--policy {{ setup_policy }} \
--cleanup-all
{% else %}
./bin/ytops-client profile cleanup-ungrouped \
--policy-file {{ setup_policy }} \
{% if dry_run | default(false) %}--dry-run{% endif %}
{% endif %}
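# Illustration (not part of the command): with the default setup_policy this branch renders to
#   ./bin/ytops-client profile cleanup-ungrouped --policy-file policies/6_profile_setup_policy.yaml
# (plus --dry-run when -e dry_run=true is passed), while -e cleanup_mode=full renders to
#   ./bin/ytops-client setup-profiles --policy policies/6_profile_setup_policy.yaml --cleanup-all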
register: cleanup_output
changed_when: false
when:
@ -172,8 +182,34 @@
- cleanup_output is defined
- name: Stop all stress test processes on all nodes (stop-all action)
vars:
process_pattern: "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)|[p]ython.*ytops"
block:
- name: "Get PIDs of running stress test processes"
ansible.builtin.shell:
cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}'"
register: pids_to_kill
changed_when: false
- name: "Gracefully terminate stress test processes"
ansible.builtin.shell:
cmd: "kill {{ pids_to_kill.stdout_lines | join(' ') }}"
when: pids_to_kill.stdout | length > 0
ignore_errors: true
changed_when: false
- name: "Wait for graceful shutdown"
ansible.builtin.pause:
seconds: "{{ graceful_shutdown_timeout_seconds }}"
when: pids_to_kill.stdout | length > 0
- name: "Force kill any lingering stress test processes"
ansible.builtin.shell:
cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true"
ignore_errors: true
changed_when: false
- name: "Kill all stress-related tmux sessions as a failsafe"
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
@ -186,27 +222,37 @@
fi
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client and related python processes
ansible.builtin.shell:
cmd: |
# Gracefully terminate processes by pattern
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 1 # Wait for graceful shutdown
# Force kill any remaining processes
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
when: action == "stop-all"
- name: Stop processes on targeted nodes only (stop-nodes action)
vars:
process_pattern: "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)|[p]ython.*ytops"
block:
- name: "Get PIDs of running stress test processes"
ansible.builtin.shell:
cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}'"
register: pids_to_kill
changed_when: false
- name: "Gracefully terminate stress test processes"
ansible.builtin.shell:
cmd: "kill {{ pids_to_kill.stdout_lines | join(' ') }}"
when: pids_to_kill.stdout | length > 0
ignore_errors: true
changed_when: false
- name: "Wait for graceful shutdown"
ansible.builtin.pause:
seconds: "{{ graceful_shutdown_timeout_seconds }}"
when: pids_to_kill.stdout | length > 0
- name: "Force kill any lingering stress test processes"
ansible.builtin.shell:
cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true"
ignore_errors: true
changed_when: false
- name: "Kill all stress-related tmux sessions as a failsafe"
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
@ -219,22 +265,6 @@
fi
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client and related python processes on this node
ansible.builtin.shell:
cmd: |
# Gracefully terminate processes by pattern
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 1 # Wait for graceful shutdown
# Force kill any remaining processes
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
when: action == "stop-nodes"
- name: Restart monitoring and enforcer (restart-monitoring action)
@ -263,8 +293,6 @@
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client profile list
--auth-env sim_auth
--download-env sim_download
--live
--no-blink
--show-reasons

View File

@ -3,7 +3,7 @@
hosts: workers
gather_facts: no
vars:
tmux_session_download: "{{ 'stress-download-worker-' + (worker_num | string) if (profile_prefix | default('') == 'worker' and worker_num is defined) else 'stress-download-' + (profile_prefix | default('default')) | replace(',', '-') }}"
download_policy: "policies/11_direct_docker_download_simulation.yaml"
vars_files:
- "group_vars/all/vault.yml"
@ -41,13 +41,14 @@
command_to_run: >
./bin/ytops-client stress-policy
--policy {{ download_policy }}
{# {% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %} #}
{% if download_min_seconds is defined %}--set 'settings.dummy_simulation_settings.download_min_seconds={{ download_min_seconds }}'{% endif %}
{% if download_max_seconds is defined %}--set 'settings.dummy_simulation_settings.download_max_seconds={{ download_max_seconds }}'{% endif %}
{# --set 'execution_control.worker_pools=[{"profile_prefix": "{% if profile_prefix == 'worker' %}{{ display_prefix }}{% else %}{{ profile_prefix }}{% endif %}", "workers": 1}]' #}
{% for setting in (extra_set_args | default('[]')) | from_yaml %}--set '{{ setting }}' {% endfor %}
{% if profile_prefix == 'worker' %}--workers 1{% endif %}
--profile-prefix {% if profile_prefix == 'worker' %}{{ display_prefix }}{% else %}{{ profile_prefix }}{% endif %}
process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ download_policy }}{% if profile_prefix == 'worker' %}.*--workers 1{% endif %}.*--profile-prefix {% if profile_prefix == 'worker' %}{{ display_prefix }}{% else %}{{ profile_prefix }}{% endif %}"
start_process: "{{ start_download | default(false) | bool }}"
stop_process: "{{ stop_download | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
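# Illustration of the two naming modes above (values are assumptions, not defaults):
#   profile_prefix=worker, worker_num=2, display_prefix="user1,user2"
#     -> tmux session "stress-download-worker-2", command gets "--workers 1 --profile-prefix user1,user2"
#   profile_prefix=user1
#     -> tmux session "stress-download-user1", command gets "--profile-prefix user1"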

View File

@ -15,16 +15,28 @@
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Wait for Redis to be available
ansible.builtin.wait_for:
host: localhost
port: "{{ redis_port }}"
timeout: 60
delay: 5
register: redis_port_check
ignore_errors: yes
- name: Check if Redis is running and responding to commands
ansible.builtin.shell:
cmd: "redis-cli -h localhost -p {{ redis_port }} {% if use_redis_password | default(true) | string | lower == 'true' %}-a {{ vault_redis_password }}{% endif %} ping 2>&1 | grep -q PONG"
register: redis_check
ignore_errors: yes
changed_when: false
retries: 3
delay: 5
until: redis_check.rc == 0
- name: Ensure Redis is accessible
ansible.builtin.fail:
msg: "Redis is not accessible on master node. Please ensure Redis service is running on localhost:{{ redis_port }}"
when: redis_check.rc != 0
- name: Stop any running ytops-client processes on master
@ -44,7 +56,7 @@
--policy {{ setup_policy }} \
--cleanup-all
environment:
REDIS_HOST: "localhost"
REDIS_PORT: "{{ redis_port }}"
REDIS_PASSWORD: "{{ vault_redis_password if use_redis_password | default(true) | string | lower == 'true' else '' }}"
register: init_result

View File

@ -37,18 +37,52 @@
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Attempt to install required Python packages from requirements.txt
ansible.builtin.pip:
requirements: "{{ base_dir }}/ytops_client/requirements.txt"
extra_args: "--ignore-installed"
become: yes
register: pip_reqs_result
failed_when: false
- name: Retry installing requirements with break-system-packages
ansible.builtin.pip:
requirements: "{{ base_dir }}/ytops_client/requirements.txt"
extra_args: "--ignore-installed"
become: yes
environment:
PIP_BREAK_SYSTEM_PACKAGES: "1"
register: pip_reqs_result_retry
when: pip_reqs_result.failed and 'externally-managed-environment' in pip_reqs_result.msg
- name: Fail if requirements installation did not succeed
ansible.builtin.fail:
msg: "Failed to install requirements after retry. Last error: {{ pip_reqs_result_retry.msg | default(pip_reqs_result.msg) }}"
when: >
pip_reqs_result.failed and
(pip_reqs_result_retry is not defined or pip_reqs_result_retry.failed)
- name: Attempt to explicitly install the thrift package
ansible.builtin.pip:
name: thrift
extra_args: "--ignore-installed"
become: yes
register: pip_thrift_result
failed_when: false
- name: Retry installing thrift with break-system-packages
ansible.builtin.pip:
name: thrift
extra_args: "--ignore-installed"
become: yes
environment:
PIP_BREAK_SYSTEM_PACKAGES: "1"
register: pip_thrift_result_retry
when: pip_thrift_result.failed and 'externally-managed-environment' in pip_thrift_result.msg
- name: Fail if thrift installation did not succeed
ansible.builtin.fail:
msg: "Failed to install thrift after retry. Last error: {{ pip_thrift_result_retry.msg | default(pip_thrift_result.msg) }}"
when: >
pip_thrift_result.failed and
(pip_thrift_result_retry is not defined or pip_thrift_result_retry.failed)

View File

@ -4,9 +4,15 @@
gather_facts: no
vars:
# Default action
action: "status" # Available actions: start, stop, status, start-auth, stop-auth, start-download, stop-download
graceful_shutdown_timeout_seconds: 30
tasks:
- name: "Ensure profile_prefixes is a flat list of all prefixes from profile_pools"
ansible.builtin.set_fact:
profile_prefixes: "{{ profile_pools | map(attribute='prefixes') | flatten }}"
when: profile_pools is defined
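# Illustration (assumed host vars, not part of the play): with
#   profile_pools:
#     - prefixes: ["user1", "user2"]
#     - prefixes: ["user3"]
# the set_fact above yields profile_prefixes == ["user1", "user2", "user3"].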
- name: "Start all configured generators and simulators" - name: "Start all configured generators and simulators"
when: action == "start" when: action == "start"
block: block:
@ -25,7 +31,7 @@
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ combined_prefixes }}"
-e "dummy_batch={{ dummy_batch | default(false) }}"
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
@ -42,7 +48,7 @@
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ item }}"
-e "dummy_batch={{ dummy_batch | default(false) }}"
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
@ -56,42 +62,14 @@
label: "profile: {{ item }}" label: "profile: {{ item }}"
when: auth_workers_per_profile | default(0) | int > 0 when: auth_workers_per_profile | default(0) | int > 0
- name: "Start download simulator(s)" - name: "WORKAROUND: Align download worker config with auth worker config to bypass inventory bug"
when: profile_prefixes is defined and profile_prefixes | length > 0 ansible.builtin.set_fact:
block: download_workers_total: "{{ auth_workers_total | default(0) }}"
- name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}" download_workers_per_profile: "{{ auth_workers_per_profile | default(0) }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ combined_prefixes }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
when: (download_workers_per_profile | default(0) | int == 0) and (download_workers_total | default(0) | int > 0)
- name: "Start parallel download simulators for each profile" - name: "Start download simulator(s)"
ansible.builtin.command: >- ansible.builtin.include_tasks: tasks/start-download-simulators.yml
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml when: profile_prefixes is defined and profile_prefixes | length > 0
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ item }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
loop: "{{ profile_prefixes }}"
loop_control:
loop_var: item
label: "profile: {{ item }}"
when: download_workers_per_profile | default(0) | int > 0
- name: "Start only auth generators on workers" - name: "Start only auth generators on workers"
when: action == "start-auth" when: action == "start-auth"
@ -111,7 +89,7 @@
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ combined_prefixes }}"
-e "dummy_batch={{ dummy_batch | default(false) }}"
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
@ -128,7 +106,7 @@
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ item }}"
-e "dummy_batch={{ dummy_batch | default(false) }}"
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
@ -145,57 +123,33 @@
- name: "Start only download simulators on workers" - name: "Start only download simulators on workers"
when: action == "start-download" when: action == "start-download"
block: block:
- name: "WORKAROUND: Align download worker config with auth worker config to bypass inventory bug"
ansible.builtin.set_fact:
download_workers_total: "{{ auth_workers_total | default(0) }}"
download_workers_per_profile: "{{ auth_workers_per_profile | default(0) }}"
- name: "Set combined profile prefixes string" - name: "Set combined profile prefixes string"
ansible.builtin.set_fact: ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}" combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0 when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Start download simulator(s)" - name: "Start download simulator(s)"
ansible.builtin.include_tasks: tasks/start-download-simulators.yml
when: profile_prefixes is defined and profile_prefixes | length > 0
block:
- name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ combined_prefixes }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
when: (download_workers_per_profile | default(0) | int == 0) and (download_workers_total | default(0) | int > 0)
- name: "Start parallel download simulators for each profile"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ item }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
loop: "{{ profile_prefixes }}"
loop_control:
loop_var: item
label: "profile: {{ item }}"
when: download_workers_per_profile | default(0) | int > 0
- name: "Stop only auth generators on workers (via playbook call)" - name: "Stop only auth generators on workers"
when: action == "stop-generator" when: action == "stop-auth"
block: block:
- name: "Set combined profile prefixes string" - name: "Set combined profile prefixes string"
ansible.builtin.set_fact: ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}" combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0 when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Stop single auth generator for profiles: {{ combined_prefixes | default('none') }}" - name: "Gracefully stop auth generator(s) via playbook call"
when: profile_prefixes is defined and profile_prefixes | length > 0
block:
- name: "Stop single auth generator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml
-i {{ inventory_file }}
@ -204,53 +158,55 @@
-e "profile_prefix={{ combined_prefixes }}" -e "profile_prefix={{ combined_prefixes }}"
delegate_to: localhost delegate_to: localhost
changed_when: true changed_when: true
when: profile_prefixes is defined and profile_prefixes | length > 0 when: (auth_workers_per_profile | default(0) | int == 0) and (auth_workers_total | default(0) | int > 0)
- name: "Stop only auth generators on workers" - name: "Stop parallel auth generators for each profile"
when: action == "stop-auth" ansible.builtin.command: >-
block: ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml
- name: Kill all auth generator tmux sessions on this worker -i {{ inventory_file }}
ansible.builtin.shell: --limit {{ inventory_hostname }}
cmd: | -e "stop_generator=true"
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-auth-"); do -e "profile_prefix={{ item }}"
tmux kill-session -t "$session" delegate_to: localhost
done || true changed_when: true
ignore_errors: yes loop: "{{ profile_prefixes }}"
changed_when: false loop_control:
loop_var: item
- name: Kill all ytops-client auth generator processes on this worker label: "profile: {{ item }}"
ansible.builtin.shell: when: auth_workers_per_profile | default(0) | int > 0
cmd: |
# Gracefully terminate
ps aux | grep "[y]tops-client.*stress-policy.*12_queue_auth_simulation" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 0.5
# Force kill
ps aux | grep "[y]tops-client.*stress-policy.*12_queue_auth_simulation" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
- name: "Stop only download simulators on workers" - name: "Stop only download simulators on workers"
when: action == "stop-download" when: action == "stop-download"
block: block:
- name: Kill all download simulator tmux sessions on this worker - name: "WORKAROUND: Align download worker config with auth worker config to bypass inventory bug"
ansible.builtin.set_fact:
download_workers_total: "{{ auth_workers_total | default(0) }}"
download_workers_per_profile: "{{ auth_workers_per_profile | default(0) }}"
- name: "Set combined profile prefixes string"
ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Stop single download simulator group"
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F '#{session_name}' 2>/dev/null | grep -E '^stress-download-worker-[0-9]+$'); do
tmux kill-session -t "$session"
done || true
when: download_workers_total | default(0) | int > 0
changed_when: true
ignore_errors: yes
changed_when: false
- name: "Stop parallel download simulators for each profile"
ansible.builtin.command: "tmux kill-session -t stress-download-{{ item }}"
loop: "{{ profile_prefixes }}"
loop_control:
loop_var: item
label: "profile: {{ item }}"
when: (download_workers_total | default(0) | int == 0) and (download_workers_per_profile | default(0) | int > 0)
changed_when: true
ignore_errors: yes
changed_when: false
- name: "Stop all worker generators and simulators" - name: "Stop all worker generators and simulators"
when: action == "stop" when: action == "stop"

View File

@ -3,7 +3,7 @@
hosts: all
gather_facts: no
vars:
ytops_source_dir: "{{ source_base_dir | default(playbook_dir + '/../ytops_client-source') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:

View File

@ -0,0 +1,30 @@
---
- name: "STRESS-SETUP: Build and push yt-dlp docker image"
hosts: workers
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: "Build and push yt-dlp image on worker"
ansible.builtin.shell:
cmd: |
cd {{ airflow_worker_dir }}
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
./bin/build-yt-dlp-image
register: build_output
changed_when: true
- name: "Display build output"
ansible.builtin.debug:
var: build_output.stdout_lines
when: build_output.stdout_lines is defined

View File

@ -0,0 +1,105 @@
---
- name: "Scale down excess download workers if necessary"
when: (download_workers_per_profile | default(0) | int == 0) and (download_workers_total | default(0) | int > 0)
block:
- name: "Find running download simulator tmux sessions for this group"
ansible.builtin.shell:
cmd: "tmux list-sessions -F '#{session_name}' 2>/dev/null | grep -E '^stress-download-worker-[0-9]+$' || true"
register: running_sessions
changed_when: false
ignore_errors: yes
- name: "Identify excess download simulator sessions to stop"
ansible.builtin.set_fact:
excess_sessions_to_stop: "{{ excess_sessions_to_stop | default([]) + [item] }}"
vars:
worker_num_str: "{{ item | regex_replace('^stress-download-worker-', '') }}"
when: worker_num_str is number and (worker_num_str | int > (download_workers_total | int))
loop: "{{ running_sessions.stdout_lines }}"
loop_control:
label: "Identifying excess session: {{ item }}"
- name: "Get PIDs for excess download workers"
ansible.builtin.shell:
cmd: |
PANE_PID=$(tmux list-panes -s "{{ item }}" -F '#{pane_pid}' | head -n 1)
if [ -n "$PANE_PID" ]; then
pgrep -P "$PANE_PID" || true
fi
register: excess_pids_raw
loop: "{{ excess_sessions_to_stop | default([]) }}"
changed_when: false
ignore_errors: true
- name: "Set fact for PIDs to kill"
ansible.builtin.set_fact:
pids_to_kill_gracefully: "{{ excess_pids_raw.results | map(attribute='stdout') | reject('==', '') | list }}"
- name: "Gracefully terminate excess download workers"
ansible.builtin.shell:
cmd: "kill {{ item }}"
loop: "{{ pids_to_kill_gracefully | default([]) }}"
when: pids_to_kill_gracefully is defined and pids_to_kill_gracefully | length > 0
ignore_errors: true
changed_when: false
- name: "Wait for graceful shutdown of excess workers"
ansible.builtin.pause:
seconds: "{{ graceful_shutdown_timeout_seconds }}"
when: pids_to_kill_gracefully is defined and pids_to_kill_gracefully | length > 0
- name: "Force kill any lingering excess workers"
ansible.builtin.shell:
cmd: "kill -9 {{ item }}"
loop: "{{ pids_to_kill_gracefully | default([]) }}"
when: pids_to_kill_gracefully is defined and pids_to_kill_gracefully | length > 0
ignore_errors: true
changed_when: false
- name: "Kill tmux sessions for excess workers"
ansible.builtin.shell:
cmd: "tmux kill-session -t {{ item }}"
loop: "{{ excess_sessions_to_stop | default([]) }}"
when: excess_sessions_to_stop is defined and excess_sessions_to_stop | length > 0
ignore_errors: true
changed_when: false
- name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix=worker"
-e "display_prefix={{ combined_prefixes }}"
-e "worker_num={{ worker_num }}"
{% if dummy_batch | default(false) %}-e "dummy_batch=true"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
when: download_workers_total | default(0) | int > 0
loop: "{{ range(1, (download_workers_total | default(1) | int) + 1) | list }}"
loop_control:
loop_var: worker_num
label: "worker {{ worker_num }}"
- name: "Start parallel download simulators for each profile"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ item }}"
{% if dummy_batch | default(false) %}-e "dummy_batch=true"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
loop: "{{ profile_prefixes }}"
loop_control:
loop_var: item
label: "profile: {{ item }}"
when: (download_workers_total | default(0) | int == 0) and (download_workers_per_profile | default(0) | int > 0)
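# Illustration of the scale-down path above (assumed values, not part of the file): with
# download_workers_total=2 and sessions stress-download-worker-1 .. stress-download-worker-4 running,
# worker-3 and worker-4 have numeric suffixes above the target count, so they are the ones the
# identification step is meant to select; their child processes are killed (gracefully, then -9 after
# graceful_shutdown_timeout_seconds), their tmux sessions removed, and workers 1..2 are then started
# via playbook-stress-download-simulation.yml with profile_prefix=worker.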

View File

@ -15,3 +15,7 @@ AWS_REGION={{ vault_s3_delivery_aws_region }}
ACCOUNT_ACTIVE_DURATION_MIN=7
ACCOUNT_COOLDOWN_DURATION_MIN=30
STRESS_POLICY_INBOX_QUEUE=dev_stress_inbox
# --- Stress Test Environment Names ---
YTOPS_AUTH_ENV={{ stress_auth_env | default('sim_auth') }}
YTOPS_DOWNLOAD_ENV={{ stress_download_env | default('sim_download') }}
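# Illustration: with no overrides these two template lines render to
#   YTOPS_AUTH_ENV=sim_auth
#   YTOPS_DOWNLOAD_ENV=sim_download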

View File

@ -121,20 +121,214 @@ def generate_policy(cluster_config, output_path):
print(f"Successfully generated profile setup policy at: {output_path}") print(f"Successfully generated profile setup policy at: {output_path}")
def generate_enforcer_policy(cluster_config, output_path):
"""Generate the enforcer policy file."""
all_workers = cluster_config.get('workers', {})
enforcement_pools = []
for worker_name, worker_config in sorted(all_workers.items()):
all_prefixes = []
for pool in worker_config.get('profile_pools', []):
all_prefixes.extend(pool.get('prefixes', []))
if not all_prefixes:
continue
pool_entry = OrderedDict([
('name', f"server_{worker_name}"),
('profile_group_patterns', sorted(list(set(all_prefixes)))),
('max_active_profiles', 1)
])
enforcement_pools.append(pool_entry)
with open(output_path, 'w') as f:
f.write("# Policy for the unified simulation enforcer.\n")
f.write("# This file is used by `bin/ytops-client policy-enforcer --live` to manage\n")
f.write("# both the authentication and download simulation environments from a single process.\n\n")
f.write("# !!! THIS FILE IS AUTO-GENERATED by tools/generate-profile-setup-policy.py !!!\n")
f.write("# !!! DO NOT EDIT. Your changes will be overwritten. !!!\n")
f.write("# !!! Edit cluster.green.yml and re-run the generator instead. !!!\n\n")
f.write("simulation_parameters:\n")
f.write(" # --- Common Redis settings for all tools ---\n")
f.write(" # The enforcer will connect to two different Redis environments (key prefixes)\n")
f.write(" # based on these settings, applying the corresponding policies to each.\n")
f.write(' env_file: ".env"\n')
f.write(' auth_env: "sim_auth"\n')
f.write(' download_env: "sim_download"\n')
f.write(" \n")
f.write(" # How often the enforcer should wake up and apply all policies.\n")
f.write(" interval_seconds: 2\n\n")
f.write("# --- Common & Pool-specific Settings ---\n")
f.write("# Common settings are applied to all profile groups discovered via the pools below.\n")
f.write("# A pool can optionally override these settings by defining its own 'group_settings' block.\n")
f.write("common_group_settings:\n")
f.write(" auth:\n")
f.write(" max_active_profiles: 1\n")
f.write(" rotate_after_requests: 5\n")
f.write(" rest_duration_minutes_on_rotation: 0.20\n")
f.write(" wait_download_finish_per_group: true\n")
f.write(" max_wait_for_downloads_minutes: 240\n")
f.write(" download:\n")
f.write(" max_active_profiles: 1\n")
f.write(" rotate_after_requests: 0\n")
f.write(" rest_duration_minutes_on_rotation: 0.2\n\n")
f.write("# Defines pools of profile groups with their own concurrency limits.\n")
f.write("enforcement_pools:\n")
for pool in enforcement_pools:
f.write(f' - name: "{pool["name"]}"\n')
patterns_str = ", ".join([f'"{p}"' for p in pool['profile_group_patterns']])
f.write(f' profile_group_patterns: [{patterns_str}]\n')
f.write(f' max_active_profiles: {pool["max_active_profiles"]}\n')
rest_of_file = """
# --- Policies for the Authentication Simulation ---
auth_policy_enforcer_config:
# Ban if 2 failures occur within a 1-minute window.
#ban_on_failures: 2
#ban_on_failures_window_minutes: 1
# The standard rest policy is disabled, as rotation is handled by the profile group.
# New rate limit policy to enforce requests-per-hour limits.
# For guest sessions, the limit is ~300 videos/hour.
rate_limit_requests: 0
rate_limit_window_minutes: 60
rate_limit_rest_duration_minutes: 5
rest_after_requests: 0
rest_duration_minutes: 10
# NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
# sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
# For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
# The settings below should be configured to respect these limits.
# New setting for load balancing across profile groups.
# "longest_idle": Activates the profile that has been idle the longest across all groups (based on last_used time).
# This is a global FIFO strategy that effectively cycles through profiles regardless of their group.
# "least_loaded": Prioritizes activating a profile from the group with the fewest pending downloads.
# If multiple groups have zero pending downloads, it acts as a FIFO queue, activating
# the one that finished its last download batch the earliest. This is useful when you want
# to ensure a group finishes its entire workload before another group starts.
profile_selection_strategy: "longest_idle"
# The 'global_max_active_profiles' setting is now superseded by the per-pool limits
# defined in the 'enforcement_pools' section.
# The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
# The enforcer logic should be updated to read from there.
proxy_work_minutes: 0
proxy_rest_duration_minutes: 0
# Global maximum time a proxy can be active before being rested, regardless of
# other rules. Acts as a safety net. Set to 0 to disable.
max_global_proxy_active_minutes: 0
rest_duration_on_max_active: 10
# Proxy-level ban on failure burst is disabled.
proxy_ban_on_failures: 0
proxy_ban_window_minutes: 2
# Clean up locks held for more than 16 minutes (960s) to prevent stuck workers.
# This should be longer than the docker container timeout (15m).
unlock_stale_locks_after_seconds: 960
# A short post-task cooldown for auth simulation profiles. When a batch is finished,
# the profile is put into COOLDOWN briefly. This prevents a worker from immediately
# re-locking the same profile, giving the policy enforcer a window to perform rotation.
unlock_cooldown_seconds: 0
# --- Cross-simulation synchronization ---
# This section is simplified because the link between auth and download profiles
# is now defined in the `profile_group_definitions`.
cross_simulation_sync:
# Which states to synchronize from auth to download.
sync_states:
- "BANNED"
# If true, a BANNED state on an auth profile will force the download profile to also be BANNED.
enforce_auth_lead: true
# CRITICAL: Ensures the correct download profile GROUP is active.
sync_active_profile: true
# When an auth profile is in the 'waiting_downloads' state, ensure the matching download profile is active.
sync_waiting_downloads: true
# --- Policies for the Download Simulation ---
download_policy_enforcer_config:
# Ban if 1 failure occurs within a 1-minute window.
ban_on_failures: 1
ban_on_failures_window_minutes: 1
# Standard rest policy is disabled in favor of group rotation.
# New rate limit policy to enforce requests-per-hour limits.
# For guest sessions, the limit is ~300 videos/hour. We set it slightly lower to be safe.
rate_limit_requests: 280
rate_limit_window_minutes: 60
rate_limit_rest_duration_minutes: 5
rest_after_requests: 0
rest_duration_minutes: 20
# NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
# sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
# For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
# The settings below should be configured to respect these limits.
# The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
# The enforcer logic should be updated to read from there.
# Time-based proxy rules are disabled.
proxy_work_minutes: 0
proxy_rest_duration_minutes: 10
# Global maximum time a proxy can be active before being rested, regardless of
# other rules. Acts as a safety net. Set to 0 to disable.
max_global_proxy_active_minutes: 0
rest_duration_on_max_active: 10
# Proxy-level ban on failure burst is disabled.
proxy_ban_on_failures: 3
proxy_ban_window_minutes: 1
# Clean up download locks held for more than 16 minutes (960s) to allow for long downloads.
# This should be longer than the docker container timeout (15m).
unlock_stale_locks_after_seconds: 960
# After a profile is used for a download, unlock it but put it in COOLDOWN
# state for 2-3s. This is enforced by the worker, which reads this config from Redis.
unlock_cooldown_seconds: [2, 3]
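# Illustrative: with [2, 3] the worker applies a cooldown of roughly 2-3 seconds
# (presumably a value chosen within that range) after each unlock, so the profile it
# just released is not immediately re-locked by the next task on the same worker.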
"""
    with open(output_path, 'a') as f:
        f.write(rest_of_file)
    print(f"Successfully generated enforcer policy at: {output_path}")

def main():
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print("Usage: ./tools/generate-profile-setup-policy.py <cluster-config-file> <output-profile-policy-file> [<output-enforcer-policy-file>]")
        sys.exit(1)
    config_path = sys.argv[1]
    profile_output_path = sys.argv[2]
    if not os.path.exists(config_path):
        print(f"Error: Cluster configuration file not found at '{config_path}'", file=sys.stderr)
        sys.exit(1)
    cluster_config = load_cluster_config(config_path)
    generate_policy(cluster_config, profile_output_path)
    if len(sys.argv) == 4:
        enforcer_output_path = sys.argv[3]
        generate_enforcer_policy(cluster_config, enforcer_output_path)

if __name__ == "__main__":
    main()
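The two profile selection strategies described in the generated enforcer policy above can be pictured with a small, self-contained sketch. The record fields (`last_used`, `group`) and the `pending_per_group` mapping are illustrative stand-ins, not the enforcer's actual data structures.

```python
import time

# Illustrative profile records; the field names are assumptions for this sketch only.
profiles = [
    {"name": "user31_1", "group": "user31", "last_used": time.time() - 600},
    {"name": "user32_1", "group": "user32", "last_used": time.time() - 60},
]
# Pending downloads per group, as the enforcer might see them.
pending_per_group = {"user31": 5, "user32": 0}

def longest_idle(candidates):
    # Global FIFO: the profile that has been idle the longest comes first.
    return sorted(candidates, key=lambda p: p["last_used"])

def least_loaded(candidates):
    # Prefer profiles whose group has the fewest pending downloads;
    # break ties by idle time so drained groups still rotate fairly.
    return sorted(candidates, key=lambda p: (pending_per_group.get(p["group"], 0), p["last_used"]))

print([p["name"] for p in longest_idle(profiles)])  # ['user31_1', 'user32_1']
print([p["name"] for p in least_loaded(profiles)])  # ['user32_1', 'user31_1']
```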

View File

@ -206,6 +206,18 @@ queue_policy:
  # Example: formats_to_download: ["140-dashy", "299-dashy"]
  formats_to_download: "from_download_policy"
  # Whether to report completion back to a queue. Always reported for auth.
  report_completion: true
  # Queue to report completion to
  completion_queue: "queue2_auth_completed"
  # Queue to report failures to
  failure_queue: "queue2_auth_fail"
  # Queue to report skipped tasks to
  skipped_queue: "queue2_auth_skipped"

simulation_parameters:
  auth_env: "sim_auth"
  download_env: "sim_download"
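A minimal sketch of how a worker might report a finished or failed task to the queues configured above, assuming the queues are plain Redis lists and that a small JSON payload is acceptable; neither assumption is guaranteed by this policy file alone.

```python
import json
import time

import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

def report_result(task_id: str, ok: bool, detail: str = "") -> None:
    # Push a small JSON payload onto the queue named in the policy above.
    queue = "queue2_auth_completed" if ok else "queue2_auth_fail"
    payload = {"task_id": task_id, "ok": ok, "detail": detail, "ts": time.time()}
    r.lpush(queue, json.dumps(payload))

report_result("task-0001", ok=True)  # the task id is a placeholder
```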

View File

@ -13,13 +13,10 @@ simulation_parameters:
# How often the enforcer should wake up and apply all policies.
interval_seconds: 2

# --- Common & Pool-specific Settings ---
# Common settings are applied to all profile groups discovered via the pools below.
# A pool can optionally override these settings by defining its own 'group_settings' block.
common_group_settings:
  auth:
    max_active_profiles: 1
    rotate_after_requests: 5
@ -27,9 +24,19 @@ profile_group_templates:
    wait_download_finish_per_group: true
    max_wait_for_downloads_minutes: 240
  download:
    max_active_profiles: 1
    rotate_after_requests: 0
    rest_duration_minutes_on_rotation: 0.2

# Defines pools of profile groups with their own concurrency limits.
enforcement_pools:
  - name: "server_dl003_pool"
    profile_group_patterns: ["user31", "user32"]
    max_active_profiles: 1
  - name: "server_dl006_pool"
    profile_group_patterns: ["user61", "user62"]
    max_active_profiles: 1

# --- Policies for the Authentication Simulation ---
auth_policy_enforcer_config:
@ -62,9 +69,8 @@ auth_policy_enforcer_config:
  # to ensure a group finishes its entire workload before another group starts.
  profile_selection_strategy: "longest_idle"

  # The 'global_max_active_profiles' setting is now superseded by the per-pool limits
  # defined in the 'enforcement_pools' section.

  # The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
  # The enforcer logic should be updated to read from there.
@ -106,6 +112,7 @@ cross_simulation_sync:
# --- Policies for the Download Simulation ---
download_policy_enforcer_config:
  # Ban if 1 failure occurs within a 1-minute window.
  ban_on_failures: 1
  ban_on_failures_window_minutes: 1
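How a discovered profile prefix ends up in one of these pools can be sketched with `fnmatch`, mirroring the glob-style `profile_group_patterns` used above; the prefix list in the example is illustrative.

```python
import fnmatch

enforcement_pools = [
    {"name": "server_dl003_pool", "profile_group_patterns": ["user31", "user32"], "max_active_profiles": 1},
    {"name": "server_dl006_pool", "profile_group_patterns": ["user61", "user62"], "max_active_profiles": 1},
]

def pool_for_prefix(prefix):
    # Return the name of the first pool whose patterns match the discovered prefix.
    for pool in enforcement_pools:
        if any(fnmatch.fnmatch(prefix, pat) for pat in pool.get("profile_group_patterns", [])):
            return pool["name"]
    return None

for prefix in ["user31", "user61", "user99"]:
    print(prefix, "->", pool_for_prefix(prefix))
# user31 -> server_dl003_pool, user61 -> server_dl006_pool, user99 -> None
```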

View File

@ -68,6 +68,9 @@ class PolicyEnforcer:
all_profiles_list = self.manager.list_profiles() all_profiles_list = self.manager.list_profiles()
all_profiles_map = {p['name']: p for p in all_profiles_list} all_profiles_map = {p['name']: p for p in all_profiles_list}
# Sync profile states from their assigned proxy's state (e.g., if proxy is BANNED, ban profile).
self.enforce_proxy_state_on_profiles(all_profiles_list, all_profiles_map)
# Apply profile group policies (rotation, max_active). This will modify the local `all_profiles_map`. # Apply profile group policies (rotation, max_active). This will modify the local `all_profiles_map`.
self.enforce_profile_group_policies(getattr(args, 'profile_groups', []), all_profiles_map, args) self.enforce_profile_group_policies(getattr(args, 'profile_groups', []), all_profiles_map, args)
@ -197,12 +200,26 @@ class PolicyEnforcer:
live_active_counts[group_name] = count live_active_counts[group_name] = count
logger.debug(f"Initial live active counts: {live_active_counts}") logger.debug(f"Initial live active counts: {live_active_counts}")
# --- New Enforcement Pool and Global Max Active Logic ---
enforcement_pools = getattr(args, 'enforcement_pools', [])
live_pool_active_counts = {}
if enforcement_pools:
for i, pool in enumerate(enforcement_pools):
pool_name = pool.get('name', f'pool_{i}')
live_pool_active_counts[pool_name] = 0
for group_name, count in live_active_counts.items():
group_policy = next((g for g in profile_groups if g.get('name') == group_name), {})
pool_name = group_policy.get('pool_name')
if pool_name:
live_pool_active_counts[pool_name] = live_pool_active_counts.get(pool_name, 0) + count
logger.debug(f"Initial live pool active counts: {live_pool_active_counts}")
global_max_active = getattr(args, 'global_max_active_profiles', 0)
live_global_active_count = sum(live_active_counts.values())
if global_max_active > 0:
    logger.debug(f"Enforcing global max active profiles limit of {global_max_active}. Current global active: {live_global_active_count}")
# --- End New Logic ---
# --- End group logic setup --- # --- End group logic setup ---
@ -278,7 +295,7 @@ class PolicyEnforcer:
# --- End new logic --- # --- End new logic ---
# --- New Sorting Logic based on Profile Selection Strategy --- # --- New Sorting Logic based on Profile Selection Strategy ---
strategy = getattr(args, 'profile_selection_strategy', None)
if strategy == 'least_loaded' and profile_groups: if strategy == 'least_loaded' and profile_groups:
logger.debug("Applying 'least_loaded' profile selection strategy.") logger.debug("Applying 'least_loaded' profile selection strategy.")
# Separate profiles that are ready from those that are not # Separate profiles that are ready from those that are not
@ -345,10 +362,8 @@ class PolicyEnforcer:
profiles_to_check = sorted_ready_profiles + not_ready_profiles profiles_to_check = sorted_ready_profiles + not_ready_profiles
logger.debug(f"Activation candidates for 'least_loaded' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}") logger.debug(f"Activation candidates for 'least_loaded' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}")
elif strategy == 'longest_idle':
    logger.debug("Applying 'longest_idle' profile selection strategy.")
# Separate profiles that are ready to be activated from those still resting. # Separate profiles that are ready to be activated from those still resting.
# A profile waiting for downloads is NOT considered ready for activation. # A profile waiting for downloads is NOT considered ready for activation.
ready_profiles = [ ready_profiles = [
@ -369,6 +384,11 @@ class PolicyEnforcer:
# The final list to check will process all ready profiles first, then wait for the not-ready ones. # The final list to check will process all ready profiles first, then wait for the not-ready ones.
profiles_to_check = ready_profiles + not_ready_profiles profiles_to_check = ready_profiles + not_ready_profiles
logger.debug(f"Activation candidates for 'longest_idle' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}") logger.debug(f"Activation candidates for 'longest_idle' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}")
else: # Default sort (no strategy)
if strategy: # Log a warning if an unknown strategy was provided
logger.warning(f"Unknown profile_selection_strategy '{strategy}'. Using default FIFO sort by rest time.")
# Default to a simple FIFO sort based on when their rest period ends.
profiles_to_check.sort(key=lambda p: (p.get('rest_until', 0), natural_sort_key(p.get('name', ''))))
# --- End New Sorting Logic --- # --- End New Sorting Logic ---
# --- New logic: Identify groups with waiting profiles --- # --- New logic: Identify groups with waiting profiles ---
@ -490,13 +510,30 @@ class PolicyEnforcer:
profile_name = profile['name'] profile_name = profile['name']
group_name = profile_to_group_map.get(profile_name) group_name = profile_to_group_map.get(profile_name)
# --- New Pool and Global Max Active Check ---
is_new_activation = profile['state'] == ProfileState.RESTING.value
if is_new_activation:
    # Check pool limits first
if enforcement_pools and group_name:
group_policy = next((g for g in profile_groups if g.get('name') == group_name), {})
pool_name = group_policy.get('pool_name')
pool_config = None
for i, p in enumerate(enforcement_pools):
if p.get('name', f'pool_{i}') == pool_name:
pool_config = p
break
if pool_config:
pool_max_active = pool_config.get('max_active_profiles', 0)
current_pool_active = live_pool_active_counts.get(pool_name, 0)
if pool_max_active > 0 and current_pool_active >= pool_max_active:
logger.debug(f"Profile '{profile_name}' rest ended, but pool '{pool_name}' max active limit ({pool_max_active}) has been reached. Deferring activation.")
continue
# Then check global limit if it's still configured (for backward compatibility)
if global_max_active > 0 and live_global_active_count >= global_max_active:
logger.debug(f"Profile '{profile_name}' rest ended, but global max active limit ({global_max_active}) has been reached. Deferring activation.") logger.debug(f"Profile '{profile_name}' rest ended, but global max active limit ({global_max_active}) has been reached. Deferring activation.")
continue continue
# --- End New Global Check --- # --- End New Check ---
# --- Group-aware unrest check --- # --- Group-aware unrest check ---
if group_name: if group_name:
@ -577,11 +614,13 @@ class PolicyEnforcer:
continue # Skip activation for this profile continue # Skip activation for this profile
# --- End group check --- # --- End group check ---
# Before activating, ensure the profile's proxy is not resting or banned.
proxy_url = profile.get('proxy') proxy_url = profile.get('proxy')
if proxy_url: if proxy_url:
proxy_state_data = proxy_states.get(proxy_url, {}) proxy_state_data = proxy_states.get(proxy_url, {})
proxy_state = proxy_state_data.get('state')
if proxy_state == ProfileState.RESTING.value:
logger.debug(f"Profile '{profile['name']}' rest period ended, but its proxy '{proxy_url}' is still resting. Deferring activation.") logger.debug(f"Profile '{profile['name']}' rest period ended, but its proxy '{proxy_url}' is still resting. Deferring activation.")
# Update reason for clarity in the UI when a profile is blocked by its proxy. # Update reason for clarity in the UI when a profile is blocked by its proxy.
@ -596,10 +635,32 @@ class PolicyEnforcer:
continue # Do not activate this profile yet. continue # Do not activate this profile yet.
elif proxy_state == ProfileState.BANNED.value and profile['state'] != ProfileState.BANNED.value:
# This profile is about to be activated, but its proxy is banned. Ban it.
reason = f"Proxy '{proxy_url}' is BANNED"
logger.warning(f"Banning profile '{profile['name']}' because its proxy is banned: {reason}")
self.actions_taken_this_cycle += 1
if not self.dry_run:
sm = self.manager.get_state_machine(profile_name)
if sm:
sm.ban(reason=reason)
# Update local map
all_profiles_map[profile_name]['state'] = ProfileState.BANNED.value
all_profiles_map[profile_name]['reason'] = reason
continue # Skip activation; it is now banned.
# Update group and pool counters BEFORE making any changes, so subsequent checks in this cycle use the updated count
if group_name and profile['state'] == ProfileState.RESTING.value: if group_name and profile['state'] == ProfileState.RESTING.value:
# For RESTING profiles, they're becoming active, so increment the count # For RESTING profiles, they're becoming active, so increment the count
live_active_counts[group_name] = live_active_counts.get(group_name, 0) + 1 live_active_counts[group_name] = live_active_counts.get(group_name, 0) + 1
# Also increment the pool counter
if enforcement_pools:
group_policy = next((g for g in profile_groups if g.get('name') == group_name), {})
pool_name = group_policy.get('pool_name')
if pool_name:
live_pool_active_counts[pool_name] = live_pool_active_counts.get(pool_name, 0) + 1
# Also increment the global counter # Also increment the global counter
if global_max_active > 0: if global_max_active > 0:
live_global_active_count += 1 live_global_active_count += 1
@ -903,20 +964,68 @@ class PolicyEnforcer:
if num_active_or_locked == 0: if num_active_or_locked == 0:
logger.debug(f"Group '{group_name}' has no active profiles. `enforce_unrest_policy` will attempt to activate one.") logger.debug(f"Group '{group_name}' has no active profiles. `enforce_unrest_policy` will attempt to activate one.")
# --- 4. Pool and Global Self-Healing ---
enforcement_pools = getattr(args, 'enforcement_pools', [])
if enforcement_pools:
for i, pool in enumerate(enforcement_pools):
pool_name = pool.get('name', f'pool_{i}')
pool_max_active = pool.get('max_active_profiles', 0)
if not pool_max_active or pool_max_active <= 0:
continue
# Get all profile names belonging to this pool
pool_profile_names = set()
for group in profile_groups:
if group.get('pool_name') == pool_name:
prefix = group.get('prefix')
if prefix:
for p_name in all_profiles_map:
if p_name.startswith(prefix):
pool_profile_names.add(p_name)
# Get current active count for this pool from our local map
current_pool_active = [
p for name, p in all_profiles_map.items()
if name in pool_profile_names and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value, ProfileState.COOLDOWN.value]
]
num_pool_active = len(current_pool_active)
if num_pool_active > pool_max_active:
logger.warning(f"Pool Healing ('{pool_name}'): Found {num_pool_active} active profiles, but pool max is {pool_max_active}. Resting excess.")
profiles_that_can_be_rested = [p for p in current_pool_active if p['state'] == ProfileState.ACTIVE.value]
profiles_that_can_be_rested.sort(key=lambda p: natural_sort_key(p.get('name', '')), reverse=True)
profiles_that_can_be_rested.sort(key=lambda p: (
p.get('success_count', 0) + p.get('failure_count', 0) +
p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)
), reverse=True)
num_to_rest = num_pool_active - pool_max_active
for profile in profiles_that_can_be_rested[:num_to_rest]:
logger.warning(f"Pool Healing ('{pool_name}'): Resting profile '{profile['name']}'.")
self.actions_taken_this_cycle += 1
if not self.dry_run:
sm = self.manager.get_state_machine(profile['name'])
if sm:
sm.rest(reason=f"Pool '{pool_name}' max_active healing", duration_minutes=0.02)
all_profiles_map[profile['name']]['state'] = ProfileState.RESTING.value
all_profiles_map[profile['name']]['rest_reason'] = f"Pool '{pool_name}' max_active healing"
all_profiles_map[profile['name']]['rest_until'] = time.time() + (0.02 * 60)
# For backward compatibility, also enforce global_max_active_profiles if it is set.
global_max_active = getattr(args, 'global_max_active_profiles', 0)
if global_max_active > 0:
    # Get all profiles managed by any group
    all_grouped_profiles = set()
    for group in profile_groups:
        if 'prefix' in group:
            prefix = group['prefix']
            for p_name in all_profiles_map:
                if p_name.startswith(prefix):
                    all_grouped_profiles.add(p_name)
        elif 'profiles' in group:
            all_grouped_profiles.update(group['profiles'])

    # Get current active count across all groups from our local map
    current_global_active = [
@ -928,29 +1037,23 @@ class PolicyEnforcer:
if num_global_active > global_max_active:
    logger.warning(f"Global Healing: Found {num_global_active} active profiles across all groups, but global max is {global_max_active}. Resting excess.")
    profiles_that_can_be_rested = [p for p in current_global_active if p['state'] == ProfileState.ACTIVE.value]
    profiles_that_can_be_rested.sort(key=lambda p: natural_sort_key(p.get('name', '')), reverse=True)
    profiles_that_can_be_rested.sort(key=lambda p: (
        p.get('success_count', 0) + p.get('failure_count', 0) +
        p.get('tolerated_error_count', 0) +
        p.get('download_count', 0) + p.get('download_error_count', 0)
    ), reverse=True)
    num_to_rest = num_global_active - global_max_active
    for profile in profiles_that_can_be_rested[:num_to_rest]:
        logger.warning(f"Global Healing: Resting profile '{profile['name']}'.")
        self.actions_taken_this_cycle += 1
        if not self.dry_run:
            sm = self.manager.get_state_machine(profile['name'])
            if sm:
                sm.rest(reason="Global max_active healing", duration_minutes=0.02)
        all_profiles_map[profile['name']]['state'] = ProfileState.RESTING.value
        all_profiles_map[profile['name']]['rest_reason'] = "Global max_active healing"
        all_profiles_map[profile['name']]['rest_until'] = time.time() + (0.02 * 60)
@ -1062,6 +1165,10 @@ class PolicyEnforcer:
for proxy_url, state_data in proxy_states.items(): for proxy_url, state_data in proxy_states.items():
state = state_data.get('state', ProfileState.ACTIVE.value) state = state_data.get('state', ProfileState.ACTIVE.value)
if state == ProfileState.BANNED.value:
logger.debug(f"Proxy '{proxy_url}' is BANNED. Skipping work/rest cycle enforcement.")
continue
# Un-rest logic # Un-rest logic
if state == ProfileState.RESTING.value: if state == ProfileState.RESTING.value:
rest_until = state_data.get('rest_until', 0) rest_until = state_data.get('rest_until', 0)
@ -1257,6 +1364,60 @@ class PolicyEnforcer:
return True # Indicates action was taken return True # Indicates action was taken
return False return False
def enforce_proxy_state_on_profiles(self, all_profiles_list, all_profiles_map):
"""
Enforces the state of a proxy onto all profiles that use it.
- If a proxy is BANNED, any non-banned profile using it will be banned.
- If a proxy is RESTING, any ACTIVE profile using it will be rested.
This is a safeguard that runs after all proxy state changes and before profile
state logic.
"""
unique_proxies = sorted(list(set(p['proxy'] for p in all_profiles_list if p.get('proxy'))))
if not unique_proxies:
return
proxy_states = self.manager.get_proxy_states(unique_proxies)
for profile in all_profiles_list:
proxy_url = profile.get('proxy')
if not proxy_url:
continue
proxy_state_data = proxy_states.get(proxy_url)
if not proxy_state_data:
continue
proxy_state = proxy_state_data.get('state')
profile_name = profile['name']
if proxy_state == ProfileState.BANNED.value and profile['state'] != ProfileState.BANNED.value:
reason = f"Proxy '{proxy_url}' is BANNED"
logger.warning(f"Banning profile '{profile_name}' because its proxy is banned: {reason}")
self.actions_taken_this_cycle += 1
if not self.dry_run:
sm = self.manager.get_state_machine(profile_name)
if sm:
sm.ban(reason=reason)
# Update local map for consistency in this cycle
all_profiles_map[profile_name]['state'] = ProfileState.BANNED.value
all_profiles_map[profile_name]['reason'] = reason
elif proxy_state == ProfileState.RESTING.value and profile['state'] == ProfileState.ACTIVE.value:
logger.info(f"Resting profile '{profile_name}' because its proxy '{proxy_url}' is resting.")
self.actions_taken_this_cycle += 1
if not self.dry_run:
# Rest it for as long as the proxy is resting.
proxy_rest_until = proxy_state_data.get('rest_until', 0)
duration_minutes = max(0, (proxy_rest_until - time.time()) / 60)
sm = self.manager.get_state_machine(profile_name)
if sm:
sm.rest(reason=self.PROXY_REST_REASON, duration_minutes=duration_minutes)
# Update local map for consistency in this cycle
proxy_rest_until = proxy_state_data.get('rest_until', 0)
all_profiles_map[profile_name]['state'] = ProfileState.RESTING.value
all_profiles_map[profile_name]['rest_reason'] = self.PROXY_REST_REASON
all_profiles_map[profile_name]['rest_until'] = proxy_rest_until
def add_policy_enforcer_parser(subparsers): def add_policy_enforcer_parser(subparsers):
"""Adds the parser for the 'policy-enforcer' command.""" """Adds the parser for the 'policy-enforcer' command."""
parser = subparsers.add_parser( parser = subparsers.add_parser(
@ -1421,6 +1582,10 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F
logger.debug("Syncing active profiles from Auth to Download simulation...") logger.debug("Syncing active profiles from Auth to Download simulation...")
# Get all download proxy states once for efficiency
all_dl_proxies = sorted(list(set(p['proxy'] for p in all_download_profiles.values() if p.get('proxy'))))
all_dl_proxy_states = download_manager.get_proxy_states(all_dl_proxies)
# Get profiles that should be active in the download simulation # Get profiles that should be active in the download simulation
target_active_download_profiles = set() target_active_download_profiles = set()
@ -1465,6 +1630,13 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F
logger.warning(f"Auth profile '{target_profile_name}' needs an active download profile, but no corresponding download profile found.") logger.warning(f"Auth profile '{target_profile_name}' needs an active download profile, but no corresponding download profile found.")
continue continue
# Check proxy state before activating
proxy_url = download_profile.get('proxy')
proxy_state = all_dl_proxy_states.get(proxy_url, {}).get('state')
if proxy_state in [ProfileState.BANNED.value, ProfileState.RESTING.value]:
logger.debug(f"Sync: Deferring activation of download profile '{target_profile_name}' because its proxy '{proxy_url}' is {proxy_state}.")
continue
if download_profile['state'] not in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]: if download_profile['state'] not in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
is_from_cooldown = download_profile['state'] == ProfileState.COOLDOWN.value is_from_cooldown = download_profile['state'] == ProfileState.COOLDOWN.value
log_msg_suffix = " (from COOLDOWN)" if is_from_cooldown else "" log_msg_suffix = " (from COOLDOWN)" if is_from_cooldown else ""
@ -1476,24 +1648,12 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F
sm.activate(profile=download_profile) sm.activate(profile=download_profile)
# --- Group-Aware Deactivation ---
# Deactivate any download profiles that are active but are not the target profile for their group.
# This ensures that if auth wants 'user1_1' to be active, the currently active 'user1_0' is rested first.
target_download_groups = set()
for target_profile_name in target_active_download_profiles:
group_info = dl_profile_to_group.get(target_profile_name)
if group_info:
target_download_groups.add(group_info['name'])
logger.debug(f"Target download groups for this sync cycle: {target_download_groups}")
# Deactivate any download profiles that are active but are not in a target group
for dl_profile_name, dl_profile in all_download_profiles.items(): for dl_profile_name, dl_profile in all_download_profiles.items():
if dl_profile['state'] == ProfileState.ACTIVE.value: if dl_profile['state'] == ProfileState.ACTIVE.value:
group_info = dl_profile_to_group.get(dl_profile_name) if dl_profile_name not in target_active_download_profiles:
# If the profile is in a group, and that group is NOT a target group, rest it. logger.info(f"Syncing active state: Resting download profile '{dl_profile_name}' as it is no longer the target active profile for its group.")
if group_info and group_info['name'] not in target_download_groups:
logger.info(f"Syncing active state: Resting download profile '{dl_profile_name}' as its group '{group_info['name']}' is no longer active.")
if not dry_run: if not dry_run:
sm = download_manager.get_state_machine(dl_profile_name) sm = download_manager.get_state_machine(dl_profile_name)
if sm: if sm:
@ -1539,9 +1699,9 @@ def main_policy_enforcer(args):
    'unlock_stale_locks_after_seconds': 120,
    'unlock_cooldown_seconds': 0,
    'max_global_proxy_active_minutes': 0, 'rest_duration_on_max_active': 10,
    'profile_selection_strategy': None,
    'global_max_active_profiles': 0,
    'interval_seconds': 60, 'proxy_groups': [], 'profile_groups': [], 'enforcement_pools': []
}

sim_params = policy.get('simulation_parameters', {})
@ -1586,10 +1746,90 @@ def main_policy_enforcer(args):
logger.info(f"Setting up enforcer for {sim_type} simulation...") logger.info(f"Setting up enforcer for {sim_type} simulation...")
# --- Dynamic Profile Group Discovery --- # --- Hybrid Profile Group Discovery (Static + Dynamic) ---
profile_group_templates = policy.get('profile_group_templates') common_group_settings = policy.get('common_group_settings', {})
# Check if templates exist and if the config block doesn't already have groups (CLI overrides take precedence) enforcement_pools = policy.get('enforcement_pools')
if profile_group_templates and 'profile_groups' not in policy_config: profile_group_templates = policy.get('profile_group_templates') # For backward compatibility
# Start with any statically defined groups from the policy.
final_profile_groups = policy_config.get('profile_groups', [])
# If enforcement_pools are defined, discover dynamic groups and merge them.
if enforcement_pools:
logger.info(f"Found 'enforcement_pools'. Discovering dynamic profile groups to merge with static ones for {sim_type}...")
# Determine key_prefix to connect to the right Redis env
policy_env = sim_params.get(env_policy_key)
default_policy_env = sim_params.get('env')
effective_env = env_cli_arg or args.env or policy_env or default_policy_env or 'dev'
if args.key_prefix: temp_key_prefix = args.key_prefix
elif args.legacy: temp_key_prefix = 'profile_mgmt_'
else: temp_key_prefix = f"{effective_env}_profile_mgmt_"
try:
temp_manager = ProfileManager(redis_host, redis_port, redis_password, temp_key_prefix, redis_db)
all_profiles = temp_manager.list_profiles()
found_prefixes = set(p['name'].rsplit('_', 1)[0] for p in all_profiles)
if not found_prefixes:
logger.warning(f"Dynamic discovery found no profile prefixes for env '{effective_env}'.")
else:
logger.info(f"Discovered {len(found_prefixes)} unique profile prefixes: {sorted(list(found_prefixes))}")
dynamically_generated_groups = []
# Match discovered prefixes against patterns in each enforcement pool
for i, pool in enumerate(enforcement_pools):
pool_name = pool.get('name', f'pool_{i}')
pool_patterns = pool.get('profile_group_patterns', [])
# If a pool has no patterns, it's just for defining concurrency for static groups. Skip discovery.
if not pool_patterns:
continue
# Merge common settings with any pool-specific overrides
group_settings_template = deepcopy(common_group_settings)
pool_specific_settings = pool.get('group_settings', {})
# A simple way to deep merge the two levels (auth/download)
auth_settings = group_settings_template.get('auth', {})
auth_settings.update(pool_specific_settings.get('auth', {}))
group_settings_template['auth'] = auth_settings
download_settings = group_settings_template.get('download', {})
download_settings.update(pool_specific_settings.get('download', {}))
group_settings_template['download'] = download_settings
for prefix in sorted(list(found_prefixes)):
for pattern in pool_patterns:
if fnmatch.fnmatch(prefix, pattern):
sim_settings = group_settings_template.get(sim_type.lower())
if not sim_settings:
logger.debug(f"Pool '{pool_name}' has no settings for '{sim_type}'. Skipping for prefix '{prefix}'.")
continue
new_group = deepcopy(sim_settings)
new_group['prefix'] = prefix
new_group['name'] = prefix
new_group['pool_name'] = pool_name
dynamically_generated_groups.append(new_group)
logger.debug(f"Assigned prefix '{prefix}' to pool '{pool_name}' for {sim_type} simulation.")
break
if dynamically_generated_groups:
logger.info(f"Merging {len(final_profile_groups)} static group(s) with {len(dynamically_generated_groups)} discovered dynamic group(s).")
final_profile_groups.extend(dynamically_generated_groups)
except Exception as e:
logger.error(f"Failed during dynamic profile group discovery: {e}", exc_info=args.verbose)
# Update the policy_config with the final merged list and the pool definitions
policy_config['profile_groups'] = final_profile_groups
# CRITICAL: Deepcopy enforcement_pools to prevent modification in one simulation
# from affecting the other, since the policy object is shared.
policy_config['enforcement_pools'] = deepcopy(enforcement_pools)
# For backward compatibility with the old template format
elif profile_group_templates and 'profile_groups' not in policy_config:
logger.info(f"Found 'profile_group_templates'. Discovering profile groups dynamically for {sim_type}...") logger.info(f"Found 'profile_group_templates'. Discovering profile groups dynamically for {sim_type}...")
# Determine key_prefix to connect to the right Redis env (logic duplicated from below) # Determine key_prefix to connect to the right Redis env (logic duplicated from below)
@ -1640,6 +1880,21 @@ def main_policy_enforcer(args):
except Exception as e: except Exception as e:
logger.error(f"Failed during dynamic profile group discovery: {e}", exc_info=args.verbose) logger.error(f"Failed during dynamic profile group discovery: {e}", exc_info=args.verbose)
# In the download simulation, the active profiles are dictated entirely by the
# cross-simulation sync logic. We must disable the download enforcer's own
# concurrency limits (max_active_profiles) to prevent it from "healing"
# profiles that the sync logic has correctly activated.
if sim_type == 'Download':
logger.info("Disabling max_active_profiles limits for Download simulation. Active profiles will be managed by cross-sim sync.")
if 'profile_groups' in policy_config:
for group in policy_config['profile_groups']:
group['max_active_profiles'] = 0
if 'enforcement_pools' in policy_config:
for pool in policy_config['enforcement_pools']:
pool['max_active_profiles'] = 0
# Also disable the global limit for the download simulation.
policy_config['global_max_active_profiles'] = 0
config = Config(args, policy_config, code_defaults) config = Config(args, policy_config, code_defaults)
# Determine the effective environment name with correct precedence: # Determine the effective environment name with correct precedence:

View File

@ -18,8 +18,10 @@ import time
from datetime import datetime from datetime import datetime
from typing import Dict, List, Optional, Any from typing import Dict, List, Optional, Any
import collections import collections
import fnmatch
import redis import redis
import yaml
from .profile_statemachine import ProfileState, ProfileStateMachine from .profile_statemachine import ProfileState, ProfileStateMachine
@ -199,8 +201,8 @@ class ProfileManager:
# When decrementing, ensure the counter exists to avoid creating negative counters from stray calls. # When decrementing, ensure the counter exists to avoid creating negative counters from stray calls.
if count < 0 and not self.redis.exists(key): if count < 0 and not self.redis.exists(key):
logger.warning(f"Attempted to decrement pending downloads for '{profile_name}' by {abs(count)}, but no counter exists. No action taken.") logger.warning(f"Attempted to decrement pending downloads for '{profile_name}' by {abs(count)}, but no counter exists. This can happen in a race condition. Assuming task is complete and counter is zero.")
return None return 0
new_value = self.redis.incrby(key, count) new_value = self.redis.incrby(key, count)
@ -225,8 +227,8 @@ class ProfileManager:
# Only decrement if the key exists. This prevents stray calls from creating negative counters. # Only decrement if the key exists. This prevents stray calls from creating negative counters.
if not self.redis.exists(key): if not self.redis.exists(key):
logger.warning(f"Attempted to decrement pending downloads for '{profile_name}', but no counter exists. No action taken.") logger.warning(f"Attempted to decrement pending downloads for '{profile_name}', but no counter exists. This can happen in a race condition. Assuming task is complete and counter is zero.")
return None return 0
new_value = self.redis.decr(key) new_value = self.redis.decr(key)
@ -444,7 +446,12 @@ class ProfileManager:
logger.error(f"Invalid state: {new_state}") logger.error(f"Invalid state: {new_state}")
return False return False
profile = self.get_profile(name)
if not profile:
# get_profile already logs an error if the profile is not found.
return False
sm = self.get_state_machine(name, profile=profile)
if not sm: if not sm:
return False # get_state_machine logs the error return False # get_state_machine logs the error
@ -453,14 +460,16 @@ class ProfileManager:
return True return True
try: try:
# Pass the profile object to the transition methods for context,
# which is consistent with the policy enforcer's usage.
if new_state == ProfileState.ACTIVE.value:
    sm.activate(profile=profile)
elif new_state == ProfileState.BANNED.value:
    sm.ban(reason=reason, profile=profile)
elif new_state == ProfileState.RESTING.value:
    sm.rest(reason=reason, profile=profile)
elif new_state == ProfileState.PAUSED.value:
    sm.pause(reason=reason, profile=profile)
# LOCKED and COOLDOWN are not handled here as they are special transitions # LOCKED and COOLDOWN are not handled here as they are special transitions
# from lock_profile and unlock_profile, and should not be set directly. # from lock_profile and unlock_profile, and should not be set directly.
elif new_state in [ProfileState.LOCKED.value, ProfileState.COOLDOWN.value]: elif new_state in [ProfileState.LOCKED.value, ProfileState.COOLDOWN.value]:
@ -720,30 +729,51 @@ class ProfileManager:
logger.info(f"Deleted {deleted_count} global counter key(s).") logger.info(f"Deleted {deleted_count} global counter key(s).")
return deleted_count return deleted_count
def set_proxy_state(self, proxy_url: str, state: str, rest_duration_minutes: Optional[int] = None, reason: Optional[str] = None) -> bool:
    """Set the state of a proxy and propagates it to associated profiles."""
    if state not in [ProfileState.ACTIVE.value, ProfileState.RESTING.value, ProfileState.BANNED.value]:
        logger.error(f"Invalid proxy state: {state}. Only ACTIVE, RESTING, and BANNED are supported for proxies.")
        return False

    proxy_key = self._proxy_state_key(proxy_url)
    now = time.time()
    updates = {'state': state}
    if reason:
        updates['reason'] = reason
    else:
        # Clear reason if not provided
        updates['reason'] = ''

    rest_until = 0
    if state == ProfileState.RESTING.value:
        if rest_duration_minutes is None:
            logger.error("rest_duration_minutes is required when setting proxy state to RESTING.")
            return False
        if rest_duration_minutes == -1:
            # Use a very large number for "indefinite" to avoid special cases later.
            # 10 years should be sufficient.
            rest_until = now + (10 * 365 * 24 * 60 * 60)
        elif rest_duration_minutes > 0:
            rest_until = now + rest_duration_minutes * 60
        else:
            logger.error("rest_duration_minutes must be positive, or -1 for indefinite.")
            return False
        updates['rest_until'] = str(rest_until)
        updates['work_start_timestamp'] = '0'  # Clear work start time
    elif state in [ProfileState.ACTIVE.value, ProfileState.BANNED.value]:
        updates['rest_until'] = '0'
        if state == ProfileState.ACTIVE.value:
            updates['work_start_timestamp'] = str(now)
        else:  # BANNED
            updates['work_start_timestamp'] = '0'

    self.redis.hset(proxy_key, mapping=updates)
    log_msg = f"Set proxy '{proxy_url}' state to {state}."
    if reason:
        log_msg += f" Reason: {reason}"
    logger.info(log_msg)

    # Now, update associated profiles
    profiles_on_proxy = self.list_profiles(proxy_filter=proxy_url)
@ -751,16 +781,31 @@
        return True

    if state == ProfileState.RESTING.value:
        propagate_reason = reason or "Proxy resting"
        logger.info(f"Propagating RESTING state to profiles on proxy '{proxy_url}'. Reason: {propagate_reason}")
        for profile in profiles_on_proxy:
            if profile['state'] == ProfileState.ACTIVE.value:
                self.update_profile_state(profile['name'], ProfileState.RESTING.value, propagate_reason)
                self.update_profile_field(profile['name'], 'rest_until', str(rest_until))
    elif state == ProfileState.BANNED.value:
        propagate_reason = reason or "Proxy banned"
        logger.info(f"Propagating BANNED state to profiles on proxy '{proxy_url}'. Reason: {propagate_reason}")
        for profile in profiles_on_proxy:
            if profile['state'] != ProfileState.BANNED.value:
                self.update_profile_state(profile['name'], ProfileState.BANNED.value, propagate_reason)
    elif state == ProfileState.ACTIVE.value:
        propagate_reason = reason or "Proxy activated"
        logger.info(f"Propagating ACTIVE state to profiles on proxy '{proxy_url}'. Reason: {propagate_reason}")
        for profile in profiles_on_proxy:
            # Check for proxy-related reasons in both rest_reason and ban_reason
            proxy_related_reason = False
            if profile.get('rest_reason', '').startswith("Proxy "):
                proxy_related_reason = True
            if profile.get('ban_reason', '').startswith("Proxy "):
                proxy_related_reason = True
            if (profile['state'] in [ProfileState.RESTING.value, ProfileState.BANNED.value]) and proxy_related_reason:
                self.update_profile_state(profile['name'], ProfileState.ACTIVE.value, propagate_reason)
    return True
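# Illustrative usage (a sketch, not part of this module; connection arguments are shown
# positionally as in the enforcer's own ProfileManager(...) call, and the host, key
# prefix, and proxy URL below are placeholders):
#   mgr = ProfileManager('localhost', 6379, None, 'dev_profile_mgmt_', 0)
#   mgr.set_proxy_state('socks5://gw1.example:1080', 'RESTING',
#                       rest_duration_minutes=-1, reason='maintenance window')
#   mgr.set_proxy_state('socks5://gw1.example:1080', 'ACTIVE', reason='maintenance over')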
@ -1162,8 +1207,8 @@ def add_profile_manager_parser(subparsers):
# List command
list_parser = subparsers.add_parser('list', help='List profiles', parents=[common_parser])
list_parser.add_argument('--auth-env', default=None, help='Environment name for the Auth simulation monitor. Use with --download-env for a merged view. Defaults to YTOPS_AUTH_ENV env var.')
list_parser.add_argument('--download-env', default=None, help='Environment name for the Download simulation monitor. Use with --auth-env for a merged view. Defaults to YTOPS_DOWNLOAD_ENV env var.')
list_parser.add_argument('--separate-views', action='store_true', help='In dual-monitor mode, show two separate reports instead of a single merged view.') list_parser.add_argument('--separate-views', action='store_true', help='In dual-monitor mode, show two separate reports instead of a single merged view.')
list_parser.add_argument('--rest-after-requests', type=int, help='(For display) Show countdown to rest based on this request limit.') list_parser.add_argument('--rest-after-requests', type=int, help='(For display) Show countdown to rest based on this request limit.')
list_parser.add_argument('--state', help='Filter by state') list_parser.add_argument('--state', help='Filter by state')
@ -1177,20 +1222,24 @@ def add_profile_manager_parser(subparsers):
list_parser.add_argument('--no-blink', action='store_true', help='Use ANSI escape codes for smoother screen updates in --live mode (experimental).') list_parser.add_argument('--no-blink', action='store_true', help='Use ANSI escape codes for smoother screen updates in --live mode (experimental).')
list_parser.add_argument('--interval-seconds', type=int, default=5, help='When in --live mode, how often to refresh in seconds. Default: 5.') list_parser.add_argument('--interval-seconds', type=int, default=5, help='When in --live mode, how often to refresh in seconds. Default: 5.')
list_parser.add_argument('--hide-active-state', action='store_true', help="Display 'ACTIVE' state as blank for cleaner UI.") list_parser.add_argument('--hide-active-state', action='store_true', help="Display 'ACTIVE' state as blank for cleaner UI.")
list_parser.add_argument('--hide-ungrouped', action='store_true', help="Hide profiles that do not belong to any configured profile group (e.g., old profiles after a config change). Shown by default.")
# Get command
get_parser = subparsers.add_parser('get', help='Get profile details', parents=[common_parser])
get_parser.add_argument('name', help='Profile name')

# Set proxy state command
set_proxy_state_parser = subparsers.add_parser('set-proxy-state', help='Set the state of a proxy (or proxies) and propagate to its profiles.', parents=[common_parser])
set_proxy_state_parser.add_argument('proxy_urls', help='Proxy URL, or comma-separated list of URLs')
set_proxy_state_parser.add_argument('state', choices=['ACTIVE', 'RESTING', 'BANNED'], help='New state for the proxy')
set_proxy_state_parser.add_argument('--duration-minutes', type=int, help='Duration for the RESTING state. Use -1 for indefinite rest.')
set_proxy_state_parser.add_argument('--reason', help='Reason for the state change. Propagated to profiles.')
set_proxy_state_parser.add_argument('--auth-env', default=None, help='Target the Auth simulation environment. Can be used with --download-env. Defaults to YTOPS_AUTH_ENV env var.')
set_proxy_state_parser.add_argument('--download-env', default=None, help='Target the Download simulation environment. Can be used with --auth-env. Defaults to YTOPS_DOWNLOAD_ENV env var.')
# Update state command
update_state_parser = subparsers.add_parser('update-state', help='Update profile state for one or more profiles.', parents=[common_parser])
update_state_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')
update_state_parser.add_argument('state', choices=ProfileState.values(),
                                 help='New state')
update_state_parser.add_argument('--reason', help='Reason for state change (especially for BAN)')
@ -1202,21 +1251,21 @@ def add_profile_manager_parser(subparsers):
update_field_parser.add_argument('value', help='New value') update_field_parser.add_argument('value', help='New value')
# Pause command (convenience)
pause_parser = subparsers.add_parser('pause', help=f'Pause one or more profiles (sets state to {ProfileState.PAUSED.value}).', parents=[common_parser])
pause_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')

# Activate command (convenience)
activate_parser = subparsers.add_parser('activate', help=f'Activate one or more profiles (sets state to {ProfileState.ACTIVE.value}). Useful for resuming a PAUSED profile or fixing a stale LOCKED one.', parents=[common_parser])
activate_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')

# Ban command (convenience)
ban_parser = subparsers.add_parser('ban', help=f'Ban one or more profiles (sets state to {ProfileState.BANNED.value}).', parents=[common_parser])
ban_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')
ban_parser.add_argument('--reason', required=True, help='Reason for ban')

# Unban command (convenience)
unban_parser = subparsers.add_parser('unban', help=f'Unban one or more profiles (sets state to {ProfileState.ACTIVE.value} and resets session counters).', parents=[common_parser])
unban_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')
# Delete command # Delete command
delete_parser = subparsers.add_parser('delete', help='Delete a profile', parents=[common_parser]) delete_parser = subparsers.add_parser('delete', help='Delete a profile', parents=[common_parser])
@ -1228,6 +1277,15 @@ def add_profile_manager_parser(subparsers):
delete_all_parser = subparsers.add_parser('delete-all', help='(Destructive) Delete all profiles and data under the current key prefix.', parents=[common_parser]) delete_all_parser = subparsers.add_parser('delete-all', help='(Destructive) Delete all profiles and data under the current key prefix.', parents=[common_parser])
delete_all_parser.add_argument('--confirm', action='store_true', help='Confirm this highly destructive action (required)') delete_all_parser.add_argument('--confirm', action='store_true', help='Confirm this highly destructive action (required)')
# Cleanup ungrouped command
cleanup_parser = subparsers.add_parser('cleanup-ungrouped', help='(Safe) Sync profiles in Redis with the setup policy (create missing, delete extra).', parents=[common_parser])
cleanup_parser.add_argument('--policy-file', required=True, help='Path to the profile setup policy YAML file (e.g., policies/6_profile_setup_policy.yaml).')
cleanup_parser.add_argument('--auth-env', default=None, help="Environment name for the Auth simulation to clean. Defaults to YTOPS_AUTH_ENV env var.")
cleanup_parser.add_argument('--download-env', default=None, help="Environment name for the Download simulation to clean. Defaults to YTOPS_DOWNLOAD_ENV env var.")
cleanup_parser.add_argument('--dry-run', action='store_true', help="Only show which profiles would be created or deleted, don't actually change them.")
cleanup_parser.add_argument('--no-create-missing', action='store_true', help="Only delete ungrouped profiles, do not create profiles missing from Redis.")
cleanup_parser.add_argument('--disallow-cleanup-active-downloads', action='store_true', help="In paired auth/download cleanup, PREVENTS deleting a profile if its download side is ACTIVE, even if the auth side is idle. Cleanup of active downloads is allowed by default.")
# Reset global counters command
reset_global_parser = subparsers.add_parser('reset-global-counters', help='Reset global counters (e.g., failed_lock_attempts).', parents=[common_parser])
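# Illustrative: how a wildcard 'names' argument could be expanded against existing
# profiles (expand_names below is a sketch, not a helper defined in this module):
#   import fnmatch
#   def expand_names(names_arg, all_names):
#       out = []
#       for token in (t.strip() for t in names_arg.split(',') if t.strip()):
#           if any(ch in token for ch in '*?['):
#               out.extend(n for n in all_names if fnmatch.fnmatch(n, token))
#           else:
#               out.append(token)
#       return sorted(set(out))
#   expand_names('user31_*', ['user31_0', 'user31_1', 'user32_0'])  # -> ['user31_0', 'user31_1']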
@ -1675,6 +1733,19 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups
return return
profiles = manager.list_profiles(args.state, args.proxy) profiles = manager.list_profiles(args.state, args.proxy)
if getattr(args, 'hide_ungrouped', False):
all_grouped_profile_names = set()
for group in profile_groups_config:
for p_name in group.get('profiles_in_group', []):
all_grouped_profile_names.add(p_name)
original_count = len(profiles)
profiles = [p for p in profiles if p.get('name') in all_grouped_profile_names]
filtered_count = original_count - len(profiles)
if filtered_count > 0 and not args.live:
print(f"NOTE: {filtered_count} ungrouped profiles were hidden via --hide-ungrouped.", file=sys.stderr)
if not profiles: if not profiles:
print("No profiles found matching the criteria.", file=file) print("No profiles found matching the criteria.", file=file)
return return
@ -1832,10 +1903,13 @@ def _render_simulation_view(title, manager, args, file=sys.stdout):
profile_groups_config = _build_profile_groups_config(manager, profiles) profile_groups_config = _build_profile_groups_config(manager, profiles)
# The group summary table is only relevant for the Auth simulation, which has
# selection strategies and rotation policies.
is_auth_sim = 'Auth' in title
if is_auth_sim:
profile_selection_strategy = manager.get_config('profile_selection_strategy') profile_selection_strategy = manager.get_config('profile_selection_strategy')
if profile_selection_strategy: if profile_selection_strategy:
print(f"Profile Selection Strategy: {profile_selection_strategy}", file=file) print(f"Profile Selection Strategy: {profile_selection_strategy}", file=file)
_render_profile_group_summary_table(manager, profiles, profile_groups_config, args, file=file) _render_profile_group_summary_table(manager, profiles, profile_groups_config, args, file=file)
failed_lock_attempts = manager.get_failed_lock_attempts() failed_lock_attempts = manager.get_failed_lock_attempts()
@ -1997,7 +2071,6 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout):
_render_activation_history_table(auth_manager, file=file) _render_activation_history_table(auth_manager, file=file)
print(f"\n--- Download Simulation Profile Details ({args.download_env}) ---", file=file) print(f"\n--- Download Simulation Profile Details ({args.download_env}) ---", file=file)
_render_profile_group_summary_table(download_manager, dl_profiles, dl_groups_config, args, file=file)
_render_profile_details_table(download_manager, args, "Download", dl_groups_config, file=file) _render_profile_details_table(download_manager, args, "Download", dl_groups_config, file=file)
if args.show_activation_history: if args.show_activation_history:
_render_activation_history_table(download_manager, file=file) _render_activation_history_table(download_manager, file=file)
@ -2010,6 +2083,8 @@ def _print_profile_list(manager, args, title="Profile Status"):
return _render_simulation_view(title, manager, args, file=sys.stdout) return _render_simulation_view(title, manager, args, file=sys.stdout)
def main_profile_manager(args): def main_profile_manager(args):
"""Main dispatcher for 'profile' command.""" """Main dispatcher for 'profile' command."""
if load_dotenv: if load_dotenv:
@ -2026,6 +2101,13 @@ def main_profile_manager(args):
print(f"ERROR: The specified --env-file was not found: {args.env_file}", file=sys.stderr) print(f"ERROR: The specified --env-file was not found: {args.env_file}", file=sys.stderr)
return 1 return 1
# After loading .env, populate any args that were not provided on the CLI
# This is necessary because argparse `default=os.getenv(...)` runs before `load_dotenv`.
if hasattr(args, 'auth_env') and args.auth_env is None:
args.auth_env = os.getenv('YTOPS_AUTH_ENV')
if hasattr(args, 'download_env') and args.download_env is None:
args.download_env = os.getenv('YTOPS_DOWNLOAD_ENV')
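As a reference point, a minimal `.env` that this fallback would pick up might contain lines like the following; the values are placeholders.

```bash
# Illustrative .env entries read after load_dotenv()
YTOPS_AUTH_ENV=sim_auth
YTOPS_DOWNLOAD_ENV=sim_download
REDIS_HOST=10.0.0.5
```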
if args.redis_host is None: if args.redis_host is None:
args.redis_host = os.getenv('REDIS_HOST', os.getenv('MASTER_HOST_IP', 'localhost')) args.redis_host = os.getenv('REDIS_HOST', os.getenv('MASTER_HOST_IP', 'localhost'))
if args.redis_port is None: if args.redis_port is None:
@ -2190,8 +2272,46 @@ def main_profile_manager(args):
sys.stdout.flush() sys.stdout.flush()
elif args.profile_command == 'set-proxy-state': elif args.profile_command == 'set-proxy-state':
proxy_urls = [p.strip() for p in args.proxy_urls.split(',') if p.strip()]
if not proxy_urls:
print("Error: No proxy URLs provided.", file=sys.stderr)
return 1
envs_to_process = []
if args.auth_env:
envs_to_process.append(args.auth_env)
if args.download_env:
envs_to_process.append(args.download_env)
if envs_to_process:
# If --auth-env or --download-env are used, operate on them.
all_success = True
for env_name in set(envs_to_process):
# When operating on specific envs, derive prefix from env name, ignoring --legacy and --key-prefix.
# This aligns with the behavior of the 'list' command in dual-mode.
key_prefix_for_env = f"{env_name}_profile_mgmt_"
print(f"--- Setting proxy state for environment: {env_name} (prefix: {key_prefix_for_env}) ---", file=sys.stderr)
env_manager = ProfileManager(
redis_host=args.redis_host,
redis_port=args.redis_port,
redis_password=args.redis_password,
key_prefix=key_prefix_for_env,
redis_db=args.redis_db
)
for proxy_url in proxy_urls:
success = env_manager.set_proxy_state(proxy_url, args.state, args.duration_minutes, args.reason)
if not success:
all_success = False
return 0 if all_success else 1
else:
# Fallback to the single manager created with --env, --legacy, or --key-prefix.
# This maintains backward compatibility and handles single-environment cases.
all_success = True
for proxy_url in proxy_urls:
success = manager.set_proxy_state(proxy_url, args.state, args.duration_minutes, args.reason)
if not success:
all_success = False
return 0 if all_success else 1
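A sketch of how this branch might be driven from the CLI; the flag spellings and the state value are inferred from the argparse attribute names and are not confirmed.

```bash
# Hypothetical: rest two proxies for 30 minutes in both simulation environments
python -m ytops_client.cli profile set-proxy-state \
  --auth-env sim_auth --download-env sim_download \
  --proxy-urls "socks5://proxy1:1080,socks5://proxy2:1080" \
  --state RESTING --duration-minutes 30 --reason "manual rotation"
```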
elif args.profile_command == 'get': elif args.profile_command == 'get':
profile = manager.get_profile(args.name) profile = manager.get_profile(args.name)
@ -2226,31 +2346,211 @@ def main_profile_manager(args):
return 0 return 0
elif args.profile_command == 'update-state': elif args.profile_command == 'update-state':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be updated to state '{args.state}':")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
success = manager.update_profile_state(name, args.state, args.reason or '')
if not success:
all_success = False
return 0 if all_success else 1
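The `pause`, `activate`, `ban`, and `unban` branches below reuse the same name/pattern expansion and confirmation prompt, so one illustrative call covers them all. The flag spellings and the state value are assumptions based on the argument names.

```bash
# Hypothetical bulk state change; glob patterns are expanded against the
# profiles currently in Redis, and the matched set is confirmed with y/N
python -m ytops_client.cli profile update-state \
  --names "userA_0,userB_*" --state PAUSED --reason "maintenance window"
```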
elif args.profile_command == 'update-field': elif args.profile_command == 'update-field':
success = manager.update_profile_field(args.name, args.field, args.value) success = manager.update_profile_field(args.name, args.field, args.value)
return 0 if success else 1 return 0 if success else 1
elif args.profile_command == 'pause': elif args.profile_command == 'pause':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be PAUSED:")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
success = manager.update_profile_state(name, ProfileState.PAUSED.value, 'Manual pause')
if not success:
all_success = False
return 0 if all_success else 1
elif args.profile_command == 'activate': elif args.profile_command == 'activate':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be ACTIVATED:")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
success = manager.update_profile_state(name, ProfileState.ACTIVE.value, 'Manual activation')
if not success:
all_success = False
return 0 if all_success else 1
elif args.profile_command == 'ban': elif args.profile_command == 'ban':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be BANNED:")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
success = manager.update_profile_state(name, ProfileState.BANNED.value, args.reason)
if not success:
all_success = False
return 0 if all_success else 1
elif args.profile_command == 'unban': elif args.profile_command == 'unban':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be UNBANNED:")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
# First activate, then reset session counters. The ban reason is cleared by update_profile_state. # First activate, then reset session counters. The ban reason is cleared by update_profile_state.
success = manager.update_profile_state(name, ProfileState.ACTIVE.value, 'Manual unban')
if success:
manager.reset_profile_counters(name)
if not success:
all_success = False
return 0 if all_success else 1
elif args.profile_command == 'delete': elif args.profile_command == 'delete':
if not args.confirm: if not args.confirm:
@ -2267,6 +2567,9 @@ def main_profile_manager(args):
print(f"Deleted {deleted_count} key(s) with prefix '{manager.key_prefix}'.") print(f"Deleted {deleted_count} key(s) with prefix '{manager.key_prefix}'.")
return 0 return 0
elif args.profile_command == 'cleanup-ungrouped':
return _main_cleanup_ungrouped(args)
elif args.profile_command == 'reset-global-counters': elif args.profile_command == 'reset-global-counters':
manager.reset_global_counters() manager.reset_global_counters()
return 0 return 0
@ -2310,3 +2613,286 @@ def main_profile_manager(args):
return 0 return 0
return 1 # Should not be reached return 1 # Should not be reached
def _main_cleanup_ungrouped(args):
"""Handler for the 'cleanup-ungrouped' command."""
try:
with open(args.policy_file, 'r', encoding='utf-8') as f:
policy = yaml.safe_load(f)
except (IOError, yaml.YAMLError) as e:
print(f"Error: Could not read or parse policy file '{args.policy_file}': {e}", file=sys.stderr)
return 1
common_pools = policy.get('common_pools', [])
if not common_pools:
print(f"Warning: No 'common_pools' found in '{args.policy_file}'. Nothing to do.", file=sys.stderr)
return 0
desired_profiles = {} # name -> proxy
for pool in common_pools:
prefixes = pool.get('prefixes', [])
count = pool.get('count', 0)
proxy = pool.get('proxy')
if not proxy:
print(f"Error: Pool with prefixes {prefixes} is missing a 'proxy' definition.", file=sys.stderr)
return 1
for prefix in prefixes:
for i in range(count):
name = f"{prefix}_{i}"
desired_profiles[name] = proxy
desired_profile_names = set(desired_profiles.keys())
print(f"Loaded setup policy. Found {len(desired_profile_names)} desired profiles across {len(common_pools)} pools.")
total_deleted = 0
total_skipped = 0
total_created = 0
total_updated = 0
# --- Paired Cleanup Mode ---
if args.auth_env and args.download_env:
print("\n--- Running in Paired Cleanup Mode ---")
if not args.disallow_cleanup_active_downloads:
print("Cleanup of active download profiles is ENABLED (default). Auth profiles will be checked for idleness.")
print("If an auth profile is idle, its corresponding download profile will be removed regardless of its state.")
else:
print("Cleanup of active download profiles is DISABLED. Both auth and download profiles must be idle to be removed.")
if args.dry_run:
print("--- DRY RUN MODE: No changes will be made. ---")
def _create_cleanup_manager(env_name):
key_prefix = f"{env_name}_profile_mgmt_"
if args.legacy: key_prefix = 'profile_mgmt_'
if args.key_prefix: key_prefix = args.key_prefix
return ProfileManager(
redis_host=args.redis_host, redis_port=args.redis_port,
redis_password=args.redis_password, key_prefix=key_prefix,
redis_db=args.redis_db
)
auth_manager = _create_cleanup_manager(args.auth_env)
download_manager = _create_cleanup_manager(args.download_env)
auth_profiles_map = {p['name']: p for p in auth_manager.list_profiles()}
download_profiles_map = {p['name']: p for p in download_manager.list_profiles()}
# --- Create Missing Profiles ---
auth_missing = desired_profile_names - set(auth_profiles_map.keys())
download_missing = desired_profile_names - set(download_profiles_map.keys())
if auth_missing or download_missing:
if args.no_create_missing:
if auth_missing: print(f"Found {len(auth_missing)} missing profiles in '{args.auth_env}', creation disabled via --no-create-missing.")
if download_missing: print(f"Found {len(download_missing)} missing profiles in '{args.download_env}', creation disabled via --no-create-missing.")
else:
if auth_missing:
print(f"Found {len(auth_missing)} missing profiles in '{args.auth_env}'. Creating them...")
for name in sorted(list(auth_missing), key=natural_sort_key):
proxy = desired_profiles.get(name)
print(f" - {'[DRY RUN] Would create' if args.dry_run else 'Creating'} profile '{name}' with proxy '{proxy}' in '{args.auth_env}'")
if not args.dry_run:
if auth_manager.create_profile(name, proxy): total_created += 1
if download_missing:
print(f"Found {len(download_missing)} missing profiles in '{args.download_env}'. Creating them...")
for name in sorted(list(download_missing), key=natural_sort_key):
proxy = desired_profiles.get(name)
print(f" - {'[DRY RUN] Would create' if args.dry_run else 'Creating'} profile '{name}' with proxy '{proxy}' in '{args.download_env}'")
if not args.dry_run:
if download_manager.create_profile(name, proxy): total_created += 1
# Refresh maps after creation
auth_profiles_map = {p['name']: p for p in auth_manager.list_profiles()}
download_profiles_map = {p['name']: p for p in download_manager.list_profiles()}
# --- Update Proxies for Existing Profiles ---
auth_existing_policy_names = set(auth_profiles_map.keys()) & desired_profile_names
if auth_existing_policy_names:
print(f"\nChecking for proxy updates in '{args.auth_env}'...")
for name in sorted(list(auth_existing_policy_names), key=natural_sort_key):
current_proxy = auth_profiles_map[name].get('proxy')
desired_proxy = desired_profiles.get(name)
if current_proxy != desired_proxy and desired_proxy:
print(f" - {'[DRY RUN] Would update' if args.dry_run else 'Updating'} proxy for profile '{name}': from '{current_proxy}' to '{desired_proxy}'")
if not args.dry_run:
if auth_manager.update_profile_field(name, 'proxy', desired_proxy): total_updated += 1
download_existing_policy_names = set(download_profiles_map.keys()) & desired_profile_names
if download_existing_policy_names:
print(f"\nChecking for proxy updates in '{args.download_env}'...")
for name in sorted(list(download_existing_policy_names), key=natural_sort_key):
current_proxy = download_profiles_map[name].get('proxy')
desired_proxy = desired_profiles.get(name)
if current_proxy != desired_proxy and desired_proxy:
print(f" - {'[DRY RUN] Would update' if args.dry_run else 'Updating'} proxy for profile '{name}': from '{current_proxy}' to '{desired_proxy}'")
if not args.dry_run:
if download_manager.update_profile_field(name, 'proxy', desired_proxy): total_updated += 1
# --- Delete Ungrouped Profiles ---
auth_ungrouped = set(auth_profiles_map.keys()) - desired_profile_names
download_ungrouped = set(download_profiles_map.keys()) - desired_profile_names
all_ungrouped = sorted(list(auth_ungrouped | download_ungrouped), key=natural_sort_key)
if not all_ungrouped:
print("\nNo ungrouped profiles found in either environment to clean up.")
else:
print(f"\nFound {len(all_ungrouped)} ungrouped profile(s) across both environments to consider for deletion.")
for name in all_ungrouped:
auth_profile = auth_profiles_map.get(name)
download_profile = download_profiles_map.get(name)
can_delete_auth, auth_skip_reasons = False, []
if auth_profile:
state = auth_profile.get('state', 'UNKNOWN')
pending_dls = auth_manager.get_pending_downloads(name)
rest_reason = auth_profile.get('rest_reason')
is_safe = True
if state in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
is_safe = False; auth_skip_reasons.append(f"auth is '{state}'")
if pending_dls > 0:
is_safe = False; auth_skip_reasons.append(f"auth has {pending_dls} pending DLs")
if rest_reason == 'waiting_downloads':
is_safe = False; auth_skip_reasons.append("auth is 'waiting_downloads'")
if is_safe: can_delete_auth = True
can_delete_download, download_skip_reasons = False, []
if download_profile:
state = download_profile.get('state', 'UNKNOWN')
is_safe = state not in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]
if not is_safe: download_skip_reasons.append(f"download is '{state}'")
if is_safe: can_delete_download = True
if auth_profile and not can_delete_auth:
print(f" - SKIPPING '{name}' because its auth profile is busy ({', '.join(auth_skip_reasons)}).")
total_skipped += 1
continue
was_download_busy = download_profile and not can_delete_download
if auth_profile and can_delete_auth and not args.disallow_cleanup_active_downloads:
can_delete_download = True # The override rule
if (auth_profile and can_delete_auth) or (download_profile and can_delete_download):
msgs = []
if auth_profile and can_delete_auth: msgs.append(f"Auth (State: {auth_profile.get('state', 'N/A')})")
if download_profile and can_delete_download:
dl_msg = f"Download (State: {download_profile.get('state', 'N/A')})"
if was_download_busy: dl_msg += " <-- DELETING ACTIVE"
msgs.append(dl_msg)
print(f" - Deleting '{name}': {'; '.join(msgs)}")
if not args.dry_run:
if auth_profile and can_delete_auth: auth_manager.delete_profile(name)
if download_profile and can_delete_download: download_manager.delete_profile(name)
total_deleted += 1
else:
print(f" - SKIPPING '{name}' because its download profile is busy ({', '.join(download_skip_reasons)}) and no idle auth profile exists to override.")
total_skipped += 1
# --- Single Environment Cleanup Mode ---
else:
if args.dry_run:
print("--- DRY RUN MODE: No changes will be made. ---")
envs_to_clean = []
if args.auth_env: envs_to_clean.append(args.auth_env)
if args.download_env: envs_to_clean.append(args.download_env)
if not envs_to_clean:
if args.env: envs_to_clean.append(args.env)
else:
print("Error: You must specify at least one environment to clean up (e.g., --auth-env sim_auth, --download-env sim_download, or --env dev).", file=sys.stderr)
return 1
for env_name in envs_to_clean:
print(f"\n--- Cleaning environment: {env_name} ---")
key_prefix = f"{env_name}_profile_mgmt_"
if args.legacy: key_prefix = 'profile_mgmt_'
if args.key_prefix: key_prefix = args.key_prefix
manager = ProfileManager(
redis_host=args.redis_host, redis_port=args.redis_port,
redis_password=args.redis_password, key_prefix=key_prefix,
redis_db=args.redis_db
)
current_profiles_list = manager.list_profiles()
current_profiles_map = {p['name']: p for p in current_profiles_list}
current_profile_names = set(current_profiles_map.keys())
ungrouped_names = current_profile_names - desired_profile_names
missing_names = desired_profile_names - current_profile_names
existing_policy_names = current_profile_names & desired_profile_names
# --- Update Proxies for Existing Profiles ---
profiles_updated_in_env = 0
if existing_policy_names:
print("Checking for proxy updates for existing profiles...")
for name in sorted(list(existing_policy_names), key=natural_sort_key):
current_proxy = current_profiles_map[name].get('proxy')
desired_proxy = desired_profiles.get(name)
if current_proxy != desired_proxy and desired_proxy:
print(f" - {'[DRY RUN] Would update' if args.dry_run else 'Updating'} proxy for profile '{name}': from '{current_proxy}' to '{desired_proxy}'")
if not args.dry_run:
if manager.update_profile_field(name, 'proxy', desired_proxy):
profiles_updated_in_env += 1
total_updated += profiles_updated_in_env
profiles_created_in_env = 0
if missing_names:
print(f"Found {len(missing_names)} profile(s) defined in the policy that do not exist in Redis.")
if args.no_create_missing:
print("Creation of missing profiles is disabled via --no-create-missing.")
else:
if not args.dry_run:
print("Creating missing profiles...")
for name in sorted(list(missing_names), key=natural_sort_key):
proxy = desired_profiles.get(name)
print(f" - {'[DRY RUN] Would create' if args.dry_run else 'Creating'} profile '{name}' with proxy '{proxy}'")
if not args.dry_run:
if proxy:
if manager.create_profile(name, proxy):
profiles_created_in_env += 1
else:
# This should not happen due to the check at the start
print(f" - SKIPPING '{name}' because its proxy could not be determined from the policy.", file=sys.stderr)
total_created += profiles_created_in_env
if not ungrouped_names:
print("No ungrouped profiles found to clean up.")
if profiles_created_in_env == 0:
continue
print(f"Found {len(ungrouped_names)} ungrouped profile(s) to consider for deletion.")
profiles_to_check = [p for p in current_profiles_list if p['name'] in ungrouped_names]
for profile in sorted(profiles_to_check, key=lambda p: natural_sort_key(p['name'])):
name = profile['name']
state = profile.get('state', 'UNKNOWN')
pending_dls = manager.get_pending_downloads(name)
is_safe, reasons = True, []
if state in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
is_safe = False; reasons.append(f"is in '{state}' state")
if pending_dls > 0:
is_safe = False; reasons.append(f"has {pending_dls} pending download(s)")
if profile.get('rest_reason') == 'waiting_downloads':
is_safe = False; reasons.append("is in 'waiting_downloads' state")
if is_safe:
print(f" - Deleting '{name}' (State: {state}, Pending DLs: {pending_dls})")
if not args.dry_run: manager.delete_profile(name)
total_deleted += 1
else:
print(f" - SKIPPING '{name}' because it {', '.join(reasons)}.")
total_skipped += 1
print("\n--- Cleanup Summary ---")
print(f"Total profiles created: {total_created}")
print(f"Total profiles updated: {total_updated}")
print(f"Total profiles deleted: {total_deleted}")
print(f"Total profiles skipped (still active or have pending work): {total_skipped}")
if total_skipped > 0:
print("Run the cleanup command again later to remove the skipped profiles once they are idle.")
return 0

View File

@ -4,6 +4,7 @@ Redis Queue Management CLI Tool for yt-ops-client.
""" """
import argparse import argparse
import hashlib
import json import json
import logging import logging
import os import os
@ -23,6 +24,15 @@ except ImportError:
print("'tabulate' library not found. Please install it with: pip install tabulate", file=sys.stderr) print("'tabulate' library not found. Please install it with: pip install tabulate", file=sys.stderr)
tabulate = None tabulate = None
try:
import yaml
except ImportError:
print("PyYAML is not installed. Please install it with: pip install PyYAML", file=sys.stderr)
yaml = None
from pathlib import Path
import fnmatch
# Configure logging # Configure logging
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
@ -31,9 +41,53 @@ logging.basicConfig(
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _find_configured_queues(policies_dir="policies", env=None):
"""Scans YAML files in a directory to find configured queue names."""
if not yaml:
return set()
expected_queues = set()
policies_path = Path(policies_dir)
if not policies_path.is_dir():
logger.debug(f"Policies directory '{policies_dir}' not found, cannot find expected queues.")
return set()
for policy_file in policies_path.glob("*.yaml"):
try:
with open(policy_file, 'r', encoding='utf-8') as f:
policy_data = yaml.safe_load(f)
if not isinstance(policy_data, dict):
continue
queue_policy = policy_data.get('queue_policy')
if not isinstance(queue_policy, dict):
continue
use_prefix = queue_policy.get('use_env_prefix', True)
prefix = ""
if use_prefix and env:
prefix = f"{env}_"
for key, value in queue_policy.items():
if key.endswith('_queue') and isinstance(value, str):
expected_queues.add(f"{prefix}{value}")
except (IOError, yaml.YAMLError) as e:
logger.debug(f"Could not parse policy {policy_file} to find queues: {e}")
continue
return expected_queues
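The effect of this helper on the `list` output can be pictured with a call like the one below; the command path is assumed, and `dev` is a placeholder environment name.

```bash
# Hypothetical: queues referenced by any policies/*.yaml 'queue_policy' block are
# listed alongside what actually exists in Redis, showing size 0 if not yet created
python -m ytops_client.cli queue list --env dev
```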
class QueueManager: class QueueManager:
"""Manages Redis lists (queues).""" """Manages Redis lists (queues)."""
def _push_state_key(self, queue_name: str, file_path: str) -> str:
"""Get Redis key for storing the last pushed index for a given queue and file."""
# Use a hash of the absolute file path to create a consistent, safe key.
abs_path = os.path.abspath(file_path)
path_hash = hashlib.sha256(abs_path.encode()).hexdigest()
return f"ytops_client:queue_push_state:{queue_name}:{path_hash}"
def __init__(self, redis_host='localhost', redis_port=6379, redis_password=None): def __init__(self, redis_host='localhost', redis_port=6379, redis_password=None):
"""Initialize Redis connection.""" """Initialize Redis connection."""
logger.info(f"Attempting to connect to Redis at {redis_host}:{redis_port}...") logger.info(f"Attempting to connect to Redis at {redis_host}:{redis_port}...")
@ -70,10 +124,29 @@ class QueueManager:
"""Returns the number of items in a queue.""" """Returns the number of items in a queue."""
return self.redis.llen(queue_name) return self.redis.llen(queue_name)
def push_from_file(self, queue_name: str, file_path: str, wrap_key: Optional[str] = None, limit: Optional[int] = None, start_index: Optional[int] = None, auto_shift: bool = False) -> int:
"""Populates a queue from a file (text with one item per line, or JSON with an array of items)."""
count = 0
# --- State management for file position ---
state_key = None
current_start_index = 0 # 0-based index
if auto_shift:
state_key = self._push_state_key(queue_name, file_path)
last_index_str = self.redis.get(state_key)
if last_index_str:
current_start_index = int(last_index_str)
logger.info(f"Auto-shift enabled. Resuming from line {current_start_index + 1}.")
elif start_index is not None:
# CLI provides 1-based index, convert to 0-based.
current_start_index = max(0, start_index - 1)
logger.info(f"Starting from line {current_start_index + 1} as requested.")
# ---
items_to_add = []
total_items_in_file = 0
if file_path.lower().endswith('.json'): if file_path.lower().endswith('.json'):
if wrap_key: if wrap_key:
logger.warning("--wrap-file-line-in-json is ignored for JSON files, as they are expected to contain complete items.") logger.warning("--wrap-file-line-in-json is ignored for JSON files, as they are expected to contain complete items.")
@ -85,25 +158,23 @@ class QueueManager:
logger.error("JSON file must contain a list/array.") logger.error("JSON file must contain a list/array.")
return 0 return 0
total_items_in_file = len(data)
if current_start_index >= total_items_in_file:
logger.info(f"Start index ({current_start_index + 1}) is past the end of the file ({total_items_in_file} items). Nothing to push.")
return 0
items_to_process = data[current_start_index:]
if limit is not None and limit >= 0:
items_to_process = items_to_process[:limit]
# Items can be strings or objects. If objects, they should be converted to JSON strings. # Items can be strings or objects. If objects, they should be converted to JSON strings.
items_to_add = [] for item in items_to_process:
for item in data:
if isinstance(item, str): if isinstance(item, str):
items_to_add.append(item.strip()) items_to_add.append(item.strip())
else: else:
items_to_add.append(json.dumps(item)) items_to_add.append(json.dumps(item))
items_to_add = [item for item in items_to_add if item] items_to_add = [item for item in items_to_add if item]
pipe = self.redis.pipeline()
for item in items_to_add:
pipe.rpush(queue_name, item)
count += 1
if count > 0 and count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {count} items...")
pipe.execute()
except (IOError, json.JSONDecodeError) as e: except (IOError, json.JSONDecodeError) as e:
logger.error(f"Failed to read or parse JSON file '{file_path}': {e}") logger.error(f"Failed to read or parse JSON file '{file_path}': {e}")
return 0 return 0
@ -111,24 +182,51 @@ class QueueManager:
logger.info("Reading items from text file (one per line).") logger.info("Reading items from text file (one per line).")
try: try:
with open(file_path, 'r', encoding='utf-8') as f: with open(file_path, 'r', encoding='utf-8') as f:
pipe = self.redis.pipeline() all_lines = f.readlines()
for line in f:
total_items_in_file = len(all_lines)
if current_start_index >= total_items_in_file:
logger.info(f"Start index ({current_start_index + 1}) is past the end of the file ({total_items_in_file} lines). Nothing to push.")
return 0
lines_to_process = all_lines[current_start_index:]
if limit is not None and limit >= 0:
lines_to_process = lines_to_process[:limit]
for line in lines_to_process:
item = line.strip() item = line.strip()
if item: if item:
if wrap_key: if wrap_key:
payload = json.dumps({wrap_key: item}) payload = json.dumps({wrap_key: item})
else: else:
payload = item payload = item
pipe.rpush(queue_name, payload) items_to_add.append(payload)
count += 1
if count > 0 and count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {count} items...")
pipe.execute()
except IOError as e: except IOError as e:
logger.error(f"Failed to read file '{file_path}': {e}") logger.error(f"Failed to read file '{file_path}': {e}")
return 0 return 0
if items_to_add:
pipe = self.redis.pipeline()
for item in items_to_add:
pipe.rpush(queue_name, item)
count += 1
if count > 0 and count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {count} of {len(items_to_add)} items...")
pipe.execute()
if auto_shift and state_key:
new_index = current_start_index + count
# Don't save a new index if we've reached the end of the file.
# This allows re-running the command to start from the beginning again.
if new_index >= total_items_in_file:
self.redis.delete(state_key)
logger.info(f"Auto-shift: Reached end of file. Cleared saved position for '{os.path.basename(file_path)}'. Next run will start from the beginning.")
else:
self.redis.set(state_key, new_index)
logger.info(f"Auto-shift: Saved next start position for '{os.path.basename(file_path)}' as line {new_index + 1}.")
logger.info(f"Finished. Pushed a total of {count} items to '{queue_name}'.") logger.info(f"Finished. Pushed a total of {count} items to '{queue_name}'.")
return count return count
@ -230,7 +328,11 @@ def add_queue_manager_parser(subparsers):
# Push command # Push command
push_parser = subparsers.add_parser('push', help='Push items to a queue from a file, a generator, or a static payload.', parents=[common_parser]) push_parser = subparsers.add_parser('push', help='Push items to a queue from a file, a generator, or a static payload.', parents=[common_parser])
push_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '<env>_stress_inbox'.") push_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '<env>_stress_inbox'.")
push_parser.add_argument('--count', type=int, default=None, help='Number of items to push. For --from-file, limits the number of lines pushed. For other sources, specifies how many items to generate/push (defaults to 1).')
shift_group = push_parser.add_mutually_exclusive_group()
shift_group.add_argument('--start', type=int, help='For --from-file, start pushing from this line number (1-based).')
shift_group.add_argument('--auto-shift', action='store_true', help="For --from-file, automatically resume from where the last push left off. State is stored in Redis.")
source_group = push_parser.add_mutually_exclusive_group(required=True) source_group = push_parser.add_mutually_exclusive_group(required=True)
source_group.add_argument('--from-file', dest='file_path', help='Path to a file containing items to add (one per line, or a JSON array).') source_group.add_argument('--from-file', dest='file_path', help='Path to a file containing items to add (one per line, or a JSON array).')
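The new flags are intended for incremental feeding of a large URL file. A sketch, assuming the tool is invoked as `python -m ytops_client.cli queue push` (the sub-command path is not shown in this hunk):

```bash
# Push 500 lines per run and let --auto-shift remember the next start line in Redis
python -m ytops_client.cli queue push dev_stress_inbox \
  --from-file urls.txt --count 500 --auto-shift

# One-off run starting at a specific line instead (1-based); --start and
# --auto-shift are mutually exclusive
python -m ytops_client.cli queue push dev_stress_inbox \
  --from-file urls.txt --start 1001 --count 500
```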
@ -285,10 +387,26 @@ def main_queue_manager(args):
print(f"INFO: No queue name specified, defaulting to '{default_queue_name}' based on --env='{args.env}'.", file=sys.stderr) print(f"INFO: No queue name specified, defaulting to '{default_queue_name}' based on --env='{args.env}'.", file=sys.stderr)
if args.queue_command == 'list': if args.queue_command == 'list':
queues_from_redis = manager.list_queues(args.pattern)
# Discover queues from policy files
expected_queues_from_policies = _find_configured_queues(env=args.env)
# Merge Redis results with policy-defined queues
all_queues_map = {q['name']: q for q in queues_from_redis}
for q_name in expected_queues_from_policies:
if q_name not in all_queues_map:
# Only add if it matches the pattern filter
if fnmatch.fnmatch(q_name, args.pattern):
all_queues_map[q_name] = {'name': q_name, 'size': 0}
queues = sorted(list(all_queues_map.values()), key=lambda x: x['name'])
if not queues: if not queues:
print(f"No queues found matching pattern '{args.pattern}'.") print(f"No queues found matching pattern '{args.pattern}'.")
return 0 return 0
if tabulate: if tabulate:
print(tabulate(queues, headers='keys', tablefmt='grid')) print(tabulate(queues, headers='keys', tablefmt='grid'))
else: else:
@ -309,16 +427,23 @@ def main_queue_manager(args):
if not os.path.exists(args.file_path): if not os.path.exists(args.file_path):
print(f"Error: File not found at '{args.file_path}'", file=sys.stderr) print(f"Error: File not found at '{args.file_path}'", file=sys.stderr)
return 1 return 1
manager.push_from_file(
args.queue_name,
args.file_path,
args.wrap_file_line_in_json,
limit=args.count,
start_index=args.start,
auto_shift=args.auto_shift
)
elif args.payload_json:
count = args.count if args.count is not None else 1
manager.push_static(args.queue_name, args.payload_json, count)
elif args.generate_payload_prefix:
count = args.count if args.count is not None else 1
if count <= 0:
print("Error: --count must be 1 or greater for --generate-payload-prefix.", file=sys.stderr)
return 1
manager.push_generated(args.queue_name, args.generate_payload_prefix, count)
return 0 return 0
elif args.queue_command == 'clear': elif args.queue_command == 'clear':

View File

@ -167,6 +167,7 @@ Overridable Policy Parameters via --set:
parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.') parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.')
parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.') parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.')
parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)") parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)")
parser.add_argument('--workers', type=int, help='Shortcut to override the total number of workers, capping any discovery logic.')
parser.add_argument('--profile-prefix', '--user-prefix', dest='profile_prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages. Can be a comma-separated list.") parser.add_argument('--profile-prefix', '--user-prefix', dest='profile_prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages. Can be a comma-separated list.")
parser.add_argument('--start-from-url-index', type=int, help='Start processing from this line number (1-based) in the urls_file. Overrides saved state.') parser.add_argument('--start-from-url-index', type=int, help='Start processing from this line number (1-based) in the urls_file. Overrides saved state.')
parser.add_argument('--expire-time-shift-minutes', type=int, help="Consider URLs expiring in N minutes as expired. Overrides policy.") parser.add_argument('--expire-time-shift-minutes', type=int, help="Consider URLs expiring in N minutes as expired. Overrides policy.")
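A hedged sketch of how these shortcuts compose; `<stress-policy-tool>` stands in for the real entry point, which is not shown in this diff.

```bash
# Hypothetical: cap discovery at 4 workers, restrict profile prefixes, resume
# from line 101 of the urls_file, and override one policy value inline
<stress-policy-tool> --workers 4 --profile-prefix "userA,userB" \
  --start-from-url-index 101 --set queue_policy.batch_size=5
```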

View File

@ -952,23 +952,41 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
failure_rate = dummy_settings.get('download_failure_rate', 0.0) failure_rate = dummy_settings.get('download_failure_rate', 0.0)
skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0)
# In dummy mode, prioritize the format from the task file, then from the policy. # In dummy mode, prioritize the format from the task metadata.
format_selection = info_data.get('_ytops_download_format') formats_to_test = None
source_of_format = "task file" source_of_format = "unknown"
if not format_selection:
format_selection = d_policy.get('formats', '') # Prioritize format from task metadata, which supports per-format and per-url tasks.
metadata = info_data.get('_ytops_metadata', {})
formats_requested = metadata.get('formats_requested')
if formats_requested is not None:
formats_to_test = formats_requested
source_of_format = "task file metadata (_ytops_metadata.formats_requested)"
if formats_to_test is None:
# Fallback for older task formats or different workflows
format_selection_str = info_data.get('_ytops_download_format')
if format_selection_str:
source_of_format = "task file (_ytops_download_format)"
formats_to_test = [f.strip() for f in format_selection_str.split(',') if f.strip()]
if formats_to_test is None:
format_selection_str = d_policy.get('formats', '')
if format_selection_str:
source_of_format = "policy (download_policy.formats)" source_of_format = "policy (download_policy.formats)"
formats_to_test = [f.strip() for f in format_selection_str.split(',') if f.strip()]
if not format_selection: if formats_to_test is None:
ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {}) ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {})
format_selection = ytdlp_config_overrides.get('format', '') format_selection_str = ytdlp_config_overrides.get('format', '')
if format_selection_str:
source_of_format = "policy (ytdlp_config_overrides.format)" source_of_format = "policy (ytdlp_config_overrides.format)"
formats_to_test = [f.strip() for f in format_selection_str.split(',') if f.strip()]
if not format_selection: if formats_to_test is None:
logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download.") logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download.")
formats_to_test = ['dummy_format'] formats_to_test = ['dummy_format']
else: else:
formats_to_test = [f.strip() for f in format_selection.split(',') if f.strip()]
logger.info(f"[Worker {worker_id}] DUMMY: Simulating downloads for formats (from {source_of_format}): {', '.join(formats_to_test)}") logger.info(f"[Worker {worker_id}] DUMMY: Simulating downloads for formats (from {source_of_format}): {', '.join(formats_to_test)}")
for format_id in formats_to_test: for format_id in formats_to_test:
@ -1022,14 +1040,47 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========") logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========")
# --- Airflow Directory Logic (Dummy Mode) ---
success = downloads_processed_in_task > 0
if success and d_policy.get('output_to_airflow_ready_dir'):
try: try:
video_id = info_data.get('id')
if not video_id:
logger.error(f"[{profile_name}] DUMMY: Could not find video ID in '{claimed_task_path_host.name}' for moving files.")
else:
# --- Prepare destination directory ---
now = datetime.now()
rounded_minute = (now.minute // 10) * 10
timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
base_path = d_policy.get('airflow_ready_dir_base_path', 'downloadfiles/videos/ready')
if not os.path.isabs(base_path):
base_path = os.path.join(sp_utils._PROJECT_ROOT, base_path)
final_dir_base = os.path.join(base_path, timestamp_str)
final_dir_path = os.path.join(final_dir_base, video_id)
os.makedirs(final_dir_path, exist_ok=True)
# --- Copy info.json ---
new_info_json_name = f"info_{video_id}.json"
dest_info_json_path = os.path.join(final_dir_path, new_info_json_name)
if not os.path.exists(dest_info_json_path):
shutil.copy(str(claimed_task_path_host), dest_info_json_path)
logger.info(f"[{profile_name}] DUMMY: Copied info.json to {dest_info_json_path}")
except Exception as e:
logger.error(f"[{profile_name}] DUMMY: Failed during post-download processing for Airflow: {e}", exc_info=True)
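For clarity on the resulting layout (values illustrative): a dummy task for video `abc123` processed at 11:07 lands in the 10-minute bucket directory under the default base path.

```bash
# Expected location under the default base path (illustrative id and timestamp)
ls downloadfiles/videos/ready/20260116T1100/abc123/
# info_abc123.json
```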
# In dummy mode, we handle file cleanup and continue to the finally block.
try:
if d_policy.get('remove_source_info_json'):
claimed_task_path_host.unlink()
logger.debug(f"DUMMY MODE: Removed processed task file '{claimed_task_path_host.name}'.")
else:
base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0]
processed_path = Path(f"{base_path_str}.processed") processed_path = Path(f"{base_path_str}.processed")
claimed_task_path_host.rename(processed_path) claimed_task_path_host.rename(processed_path)
logger.debug(f"DUMMY MODE: Renamed processed task file to '{processed_path.name}'.") logger.debug(f"DUMMY MODE: Renamed processed task file to '{processed_path.name}'.")
except (OSError, IndexError) as e: except (OSError, IndexError) as e:
logger.error(f"DUMMY MODE: Failed to clean up processed task file '{claimed_task_path_host}': {e}")
continue # Skip to finally block continue # Skip to finally block
@ -1329,15 +1380,19 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
# 6. Clean up task file # 6. Clean up task file
if not queue_policy: if not queue_policy:
# File-based mode: rename to .processed or remove
try: try:
if success and d_policy.get('remove_source_info_json'):
claimed_task_path_host.unlink()
logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Removed processed task file.")
else:
# The claimed_task_path_host has a .LOCKED suffix, remove it before adding .processed # The claimed_task_path_host has a .LOCKED suffix, remove it before adding .processed
base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0]
processed_path = Path(f"{base_path_str}.processed") processed_path = Path(f"{base_path_str}.processed")
claimed_task_path_host.rename(processed_path) claimed_task_path_host.rename(processed_path)
logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Renamed processed task file to '{processed_path.name}'.") logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Renamed processed task file to '{processed_path.name}'.")
except (OSError, IndexError) as e: except (OSError, IndexError) as e:
logger.error(f"Failed to clean up processed task file '{claimed_task_path_host}': {e}")
elif d_policy.get('rename_source_info_json_on_success'): elif d_policy.get('rename_source_info_json_on_success'):
# Queue-based mode: respect rename policy # Queue-based mode: respect rename policy
source_path_to_rename = task.get('info_json_path') source_path_to_rename = task.get('info_json_path')
@ -1372,11 +1427,25 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
# but the task is being finalized, we must assume all potential downloads for this task # but the task is being finalized, we must assume all potential downloads for this task
# are "processed" to prevent the auth profile from getting stuck. # are "processed" to prevent the auth profile from getting stuck.
if downloads_processed_in_task == 0: if downloads_processed_in_task == 0:
logger.warning(f"[Worker {worker_id}] No downloads were counted for this task. Using task metadata to determine decrement count to avoid stuck profile.")
ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {})
decrement_count = 1 # Default to 1 to be safe
metadata = info_data.get('_ytops_metadata', {})
granularity = metadata.get('download_task_granularity')
formats_requested = metadata.get('formats_requested') # Can be None
if granularity == 'per_format':
# Each task file represents one format group, so decrement by 1.
decrement_count = 1
elif granularity == 'per_url':
# The task file represents all formats for a URL.
decrement_count = len(formats_requested) if formats_requested else 1
else:
# No granularity info, this may be an older task file.
# Assume it's a single download task.
decrement_count = 1
downloads_processed_in_task = decrement_count
logger.warning(f"[Worker {worker_id}] Decrementing by fallback count: {downloads_processed_in_task}") logger.warning(f"[Worker {worker_id}] Decrementing by fallback count: {downloads_processed_in_task}")
if downloads_processed_in_task > 0: if downloads_processed_in_task > 0:

View File

@ -30,192 +30,18 @@ from .queue_provider import RedisQueueProvider
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock): def _process_single_auth_result(success, info_data, stderr, retcode, task, task_id, profile_name, proxy_url, worker_id, policy, state_manager, args, profile_manager_instance, info_json_path=None):
"""Worker function for processing authentication tasks from a queue in batches.""" """
owner_id = f"queue-auth-worker-{worker_id}" Helper to process the result of a single authentication attempt, whether it was
part of a batch or a single run.
"""
settings = policy.get('settings', {}) settings = policy.get('settings', {})
exec_control = policy.get('execution_control', {})
gen_policy = policy.get('info_json_generation_policy', {})
queue_policy = policy.get('queue_policy', {}) queue_policy = policy.get('queue_policy', {})
gen_policy = policy.get('info_json_generation_policy', {})
task_granularity = gen_policy.get('download_task_granularity', 'per_format') task_granularity = gen_policy.get('download_task_granularity', 'per_format')
profile_prefix = gen_policy.get('profile_prefix')
if not profile_prefix:
logger.error(f"[Worker {worker_id}] Queue auth mode requires 'info_json_generation_policy.profile_prefix'. Worker exiting.")
return []
save_dir = settings.get('save_info_json_dir') save_dir = settings.get('save_info_json_dir')
if not save_dir and queue_policy.get('formats_to_download'):
save_dir = os.path.join('run', 'stress_policy', 'info_jsons')
logger.info(f"[Worker {worker_id}] 'formats_to_download' is set and 'save_info_json_dir' is not, defaulting to '{save_dir}'")
if save_dir:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"[Worker {worker_id}] Will save info.json files to '{save_dir}'")
batch_size = queue_policy.get('batch_size', 1)
logger.info(f"[Worker {worker_id}] Auth worker configured to process tasks in batches of {batch_size}.")
task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
tasks = []
try:
# 1. Get a batch of tasks from the queue FIRST
tasks = state_manager.get_auth_tasks_batch(batch_size)
if not tasks:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
base_log_msg = f"[Worker {worker_id}] No tasks available in queue, polling."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stdout, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stdout)
logger.info(base_log_msg)
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval)
continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
# 2. Lock a profile, trying any of the specified prefixes
locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
# Fallback for empty/no prefix, which means lock any available profile
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if not locked_profile:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.warning(f"[Worker {worker_id}] No profiles available for {len(tasks)} task(s). Re-queueing and sleeping for {polling_interval}s.")
state_manager.add_auth_tasks_batch(tasks)
time.sleep(polling_interval)
continue
profile_name = locked_profile['name']
proxy_url = locked_profile['proxy']
logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' to process a batch of {len(tasks)} tasks.")
# 3. Process each task in the batch
for task in tasks:
if state_manager.shutdown_event.is_set():
logger.info(f"[Worker {worker_id}] Shutdown requested, stopping batch processing for profile '{profile_name}'.")
break
temp_task_dir = None
task_id = None
url = None
try:
temp_task_dir = tempfile.mkdtemp(prefix=f"queue-auth-{worker_id}-")
task_id = task.get('id') or task.get('task_id')
if not task_id:
task_id = f"task_{worker_id}_{task_counter}"
task_counter += 1
task['task_id'] = task_id
url = task.get('url') url = task.get('url')
if not url:
logger.error(f"[Worker {worker_id}] Task {task_id} has no URL. Skipping.")
state_manager.report_auth_skipped(task_id, {"error": "No URL in task", "task": task})
continue
logger.info(f"[Worker {worker_id}] [{profile_name}] Processing task {task_id}: {url}")
state_manager.mark_auth_in_progress(task_id, owner_id)
# --- Main processing logic for a single task ---
success, info_data, stderr, retcode = False, None, "", 0
if args.dummy or args.dummy_batch:
logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY MODE: Simulating auth for {url}")
dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {})
min_seconds = dummy_settings.get('auth_min_seconds', 0.1)
max_seconds = dummy_settings.get('auth_max_seconds', 0.5)
failure_rate = args.dummy_auth_failure_rate or dummy_settings.get('auth_failure_rate', 0.0)
skipped_rate = args.dummy_auth_skipped_failure_rate or dummy_settings.get('auth_skipped_failure_rate', 0.0)
time.sleep(random.uniform(min_seconds, max_seconds))
rand_val = random.random()
if rand_val < skipped_rate:
stderr = "Dummy skipped failure"
elif rand_val < (skipped_rate + failure_rate):
stderr = "Dummy fatal failure"
else:
success = True
# In dummy mode, robustly extract ID from URL to avoid 'unknown_video_id'
try:
# First, try the robust library function
video_id = sp_utils.get_video_id(url)
if not video_id and url:
# Fallback parsing for dummy URLs like "...?v=dummy_0099"
parsed_url = urlparse(url)
video_id = parse_qs(parsed_url.query).get('v', [None])[0]
# Additional fallback for URLs like "youtube.com/watch?v=dummy_0028"
if not video_id and 'dummy_' in url:
dummy_match = re.search(r'dummy_\d+', url)
if dummy_match:
video_id = dummy_match.group(0)
except Exception:
video_id = None # Ensure video_id is None on parsing failure
# If all extraction methods fail, use the task_id as a reliable fallback.
if not video_id:
logger.debug(f"Could not extract video ID from URL '{url}', using task ID '{task_id}' for dummy data.")
video_id = task_id
info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]}
else:
client, req_params = state_manager.get_client_for_request(profile_name, gen_policy)
cmd = [
sys.executable, '-m', 'ytops_client.cli', 'get-info',
'--client', client or '', '--profile', profile_name
]
if proxy_url:
cmd.extend(['--proxy', proxy_url])
if req_params:
cmd.extend(['--request-params', json.dumps(req_params)])
extra_args = gen_policy.get('extra_args')
if extra_args:
cmd.extend(shlex.split(extra_args))
# The URL must be the last positional argument
cmd.append(url)
logger.info(f"[Worker {worker_id}] Running command: {' '.join(shlex.quote(s) for s in cmd)}")
retcode, stdout, stderr = run_command(
cmd, running_processes, process_lock, stream_output=args.verbose,
stream_prefix=f"[Worker {worker_id} | get-info] "
)
success = (retcode == 0)
if success:
info_json_path = next((line.strip() for line in stdout.strip().split('\n') if line.endswith('.json') and os.path.exists(line.strip())), None)
if info_json_path:
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
info_data = json.load(f)
except (IOError, json.JSONDecodeError) as e:
logger.error(f"[Worker {worker_id}] Failed to read/parse info.json from get-info: {e}")
success = False
stderr += f"\nFailed to read/parse info.json: {e}"
else:
logger.error(f"[Worker {worker_id}] Command succeeded but no info.json path found in output.")
success = False
stderr += "\nNo info.json path in output"
# --- Result processing for a single task ---
if success and info_data:
try:
auth_env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '')
@ -360,13 +186,14 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
else:
is_bot_error = "Sign in to confirm you're not a bot" in stderr
is_timeout_error = "Read timed out" in stderr
is_proxy_error = "ProxyError" in stderr or "Unable to connect to proxy" in stderr
is_unavailable = "This video is unavailable" in stderr or "Video unavailable" in stderr
is_private = "This video is private" in stderr or "Private video" in stderr
is_deleted = "This video has been removed" in stderr
is_dummy_skipped = "Dummy skipped failure" in stderr
if is_unavailable or is_private or is_deleted or is_dummy_skipped or is_proxy_error:
reason = "Video unavailable" if is_unavailable else "Private video" if is_private else "Video removed" if is_deleted else "Proxy error" if is_proxy_error else "Dummy skipped"
logger.warning(f"[Worker {worker_id}] [{profile_name}] Auth skipped for {url}: {reason}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_skipped(task_id, {"url": url, "reason": reason, "stderr": stderr})
@ -376,14 +203,309 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_failure(task_id, {"url": url, "error_type": error_type, "stderr": stderr, "exit_code": retcode})
def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock):
"""Worker function for processing authentication tasks from a queue in batches."""
owner_id = f"queue-auth-worker-{worker_id}"
settings = policy.get('settings', {})
exec_control = policy.get('execution_control', {})
gen_policy = policy.get('info_json_generation_policy', {})
queue_policy = policy.get('queue_policy', {})
docker_policy = policy.get('direct_docker_cli_policy', {})
task_granularity = gen_policy.get('download_task_granularity', 'per_format')
profile_prefix = gen_policy.get('profile_prefix')
if not profile_prefix:
logger.error(f"[Worker {worker_id}] Queue auth mode requires 'info_json_generation_policy.profile_prefix'. Worker exiting.")
return []
save_dir = settings.get('save_info_json_dir')
if not save_dir and queue_policy.get('formats_to_download'):
save_dir = os.path.join('run', 'stress_policy', 'info_jsons')
logger.info(f"[Worker {worker_id}] 'formats_to_download' is set and 'save_info_json_dir' is not, defaulting to '{save_dir}'")
if save_dir:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"[Worker {worker_id}] Will save info.json files to '{save_dir}'")
batch_size = queue_policy.get('batch_size', 1)
logger.info(f"[Worker {worker_id}] Auth worker configured to process tasks in batches of {batch_size}.")
task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
tasks = []
try:
# 1. Get a batch of tasks from the queue FIRST
raw_tasks = state_manager.get_auth_tasks_batch(batch_size)
if not raw_tasks:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
base_log_msg = f"[Worker {worker_id}] No tasks available in queue, polling."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stdout, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stdout)
logger.info(base_log_msg)
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval)
continue
# Normalize tasks to handle both raw URL strings and JSON objects
tasks = []
for raw_task in raw_tasks:
if isinstance(raw_task, str):
tasks.append({'url': raw_task})
elif isinstance(raw_task, dict) and raw_task.get('url'):
tasks.append(raw_task)
else:
logger.warning(f"[Worker {worker_id}] Skipping invalid task of type {type(raw_task)}: {raw_task}")
if not tasks:
logger.warning(f"[Worker {worker_id}] All tasks in batch were invalid. Waiting for next batch.")
time.sleep(1)
continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
# 2. Lock a profile, trying any of the specified prefixes
locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
# Fallback for empty/no prefix, which means lock any available profile
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if not locked_profile:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.warning(f"[Worker {worker_id}] No profiles available for {len(tasks)} task(s). Re-queueing and sleeping for {polling_interval}s.")
state_manager.add_auth_tasks_batch(tasks)
time.sleep(polling_interval)
continue
profile_name = locked_profile['name']
proxy_url = locked_profile['proxy']
logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' to process a batch of {len(tasks)} tasks.")
# 3. Process the entire batch
if args.dummy or args.dummy_batch:
# Dummy mode is not optimized for batching, so we process tasks individually.
logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY MODE: Processing {len(tasks)} tasks individually.")
for task in tasks:
if state_manager.shutdown_event.is_set():
break
task_id = task.get('id') or task.get('task_id') or f"task_{worker_id}_{task_counter}"
url = task.get('url')
# Dummy simulation logic copied from single-task implementation
logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY MODE: Simulating auth for {url}")
dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {})
min_seconds = dummy_settings.get('auth_min_seconds', 0.1)
max_seconds = dummy_settings.get('auth_max_seconds', 0.5)
failure_rate = args.dummy_auth_failure_rate or dummy_settings.get('auth_failure_rate', 0.0)
skipped_rate = args.dummy_auth_skipped_failure_rate or dummy_settings.get('auth_skipped_failure_rate', 0.0)
time.sleep(random.uniform(min_seconds, max_seconds))
success, info_data, stderr, retcode = False, None, "", 0
rand_val = random.random()
if rand_val < skipped_rate:
stderr = "Dummy skipped failure"
elif rand_val < (skipped_rate + failure_rate):
stderr = "Dummy fatal failure"
else:
success = True
video_id = sp_utils.get_video_id(url) or task_id
info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]}
_process_single_auth_result(
success, info_data, stderr, retcode, task, task_id, profile_name, proxy_url,
worker_id, policy, state_manager, args, profile_manager_instance
)
else:
# BATCH MODE: Process all tasks in a single Docker run
batch_temp_dir = None
try:
# Create a temporary directory for this batch's files
host_mount_path = docker_policy.get('docker_host_mount_path')
temp_parent_dir = host_mount_path if host_mount_path and os.path.isdir(host_mount_path) else None
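# Creating the batch dir under the docker host mount (when configured) means the same
# files are visible inside the container under docker_container_mount_path.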
batch_temp_dir = tempfile.mkdtemp(prefix=f"queue-auth-batch-{worker_id}-", dir=temp_parent_dir)
task_dir_name = os.path.basename(batch_temp_dir)
task_dir_container = os.path.join(docker_policy.get('docker_container_mount_path'), task_dir_name)
# Map URLs to their original tasks for post-processing
url_to_task_map = {task.get('url'): task for task in tasks if task.get('url')}
# Create a batch file with all URLs
batch_file_host_path = os.path.join(batch_temp_dir, 'batch_urls.txt')
with open(batch_file_host_path, 'w', encoding='utf-8') as f:
for url in url_to_task_map.keys():
f.write(f"{url}\n")
# Mark all tasks in the batch as in-progress
for task in tasks:
task_id = task.get('id') or task.get('task_id')
if not task_id:
# Generate a temporary, worker-unique ID if none exists.
task_id = f"task_{worker_id}_{int(time.time()*1000)}_{task_counter}"
task['task_id'] = task_id # Store it back for later stages
task_counter += 1
logger.warning(f"[Worker {worker_id}] Task missing 'id' or 'task_id'. Generated a temporary one: {task_id}. Task content: {task}")
state_manager.mark_auth_in_progress(task_id, owner_id)
# Docker batch processing logic
image_name = docker_policy.get('docker_image_name')
if not image_name:
raise ValueError("'direct_docker_cli_policy.docker_image_name' is not defined.")
network_name = docker_policy.get('docker_network_name')
container_mount_path = docker_policy.get('docker_container_mount_path')
host_cache_path = docker_policy.get('docker_host_cache_path')
container_cache_path = docker_policy.get('docker_container_cache_path')
volumes = {}
if host_mount_path and container_mount_path:
volumes[str(Path(host_mount_path).resolve())] = {'bind': container_mount_path, 'mode': 'rw'}
if host_cache_path and container_cache_path:
Path(host_cache_path).mkdir(parents=True, exist_ok=True)
volumes[str(Path(host_cache_path).resolve())] = {'bind': container_cache_path, 'mode': 'rw'}
environment = {}
if host_cache_path and container_cache_path:
environment['XDG_CACHE_HOME'] = container_cache_path
# --- Build config file content ---
base_config_content = ""
base_config_file = docker_policy.get('ytdlp_config_file')
if base_config_file:
config_path_to_read = Path(base_config_file)
if not config_path_to_read.exists():
config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file
if config_path_to_read.exists():
try:
with open(config_path_to_read, 'r', encoding='utf-8') as f:
base_config_content = f.read()
except IOError as e:
logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}")
else:
logger.error(f"[Worker {worker_id}] Could not find ytdlp_config_file: '{base_config_file}'")
config_overrides = docker_policy.get('ytdlp_config_overrides', {}).copy()
config_overrides['proxy'] = proxy_url
config_overrides['batch-file'] = os.path.join(task_dir_container, 'batch_urls.txt')
config_overrides['output'] = os.path.join(task_dir_container, '%(id)s')
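# Note: 'batch-file' and 'output' are container-side paths; actually writing the
# .info.json files is assumed to be enabled by the base config or ytdlp_raw_args.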
overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides)
raw_args_from_policy = docker_policy.get('ytdlp_raw_args', [])
raw_args_content = '\n'.join(raw_args_from_policy)
config_content = f"{base_config_content.strip()}\n\n# --- Overrides from policy ---\n{overrides_content}"
if raw_args_content:
config_content += f"\n\n# --- Raw args from policy ---\n{raw_args_content}"
# Write the combined config to a file in the temp batch dir
ytdlp_config_dir_host = os.path.join(batch_temp_dir, 'yt-dlp')
os.makedirs(ytdlp_config_dir_host, exist_ok=True)
temp_config_file_host = os.path.join(ytdlp_config_dir_host, 'config')
with open(temp_config_file_host, 'w', encoding='utf-8') as f:
f.write(config_content)
# The command tells yt-dlp exactly where to find the config file we created.
cmd = ['yt-dlp', '--config-locations', os.path.join(task_dir_container, 'yt-dlp/config')]
processed_urls = set()
activity_lock = threading.Lock()
def log_parser_callback(line):
# Not thread-safe, but only one thread writes to this set
nonlocal processed_urls
if '[info] Writing video metadata as JSON to:' in line:
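# Illustrative example of a matched line (path is container-side):
#   [info] Writing video metadata as JSON to: /data/queue-auth-batch-0-xxxx/VIDEOID.info.json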
try:
path_match = re.search(r"Writing video metadata as JSON to: '?([^']+)'?$", line)
if not path_match:
path_match = re.search(r"Writing video metadata as JSON to: (.*)$", line)
if path_match:
container_file_path = path_match.group(1).strip()
if container_file_path.startswith(container_mount_path):
relative_path = os.path.relpath(container_file_path, container_mount_path)
host_file_path = os.path.join(host_mount_path, relative_path)
# Wait for file to exist
for _ in range(5):
if os.path.exists(host_file_path): break
time.sleep(0.1)
if os.path.exists(host_file_path):
with open(host_file_path, 'r', encoding='utf-8') as f:
info_data = json.load(f)
original_url = info_data.get('original_url') or info_data.get('webpage_url')
if original_url in url_to_task_map:
with activity_lock:
if original_url not in processed_urls:
processed_urls.add(original_url)
task = url_to_task_map[original_url]
task_id = task.get('id') or task.get('task_id')
_process_single_auth_result(
True, info_data, "", 0, task, task_id, profile_name, proxy_url,
worker_id, policy, state_manager, args, profile_manager_instance,
info_json_path=host_file_path
)
else:
logger.warning(f"[Worker {worker_id}] Could not map info.json '{Path(host_file_path).name}' with URL '{original_url}' to an original task. Skipping.")
except (IOError, json.JSONDecodeError, KeyError) as e:
logger.error(f"[Worker {worker_id}] Error during live post-processing from log line: {e}")
return False # Don't stop the container
logger.info(f"[Worker {worker_id}] [{profile_name}] Running Docker container for batch of {len(tasks)} URLs.")
retcode, stdout, stderr, _ = run_docker_container(
image_name=image_name, command=cmd, volumes=volumes,
environment=environment, network_name=network_name,
stream_prefix=f"[Worker {worker_id} | Docker] ",
log_callback=log_parser_callback
)
# --- Post-processing for failed URLs ---
unprocessed_urls = set(url_to_task_map.keys()) - processed_urls
if unprocessed_urls:
logger.warning(f"[Worker {worker_id}] [{profile_name}] {len(unprocessed_urls)} URLs from the batch did not produce an info.json and will be marked as failed.")
for url in unprocessed_urls:
task = url_to_task_map[url]
task_id = task.get('id') or task.get('task_id')
_process_single_auth_result(
False, None, stderr, retcode, task, task_id, profile_name, proxy_url,
worker_id, policy, state_manager, args, profile_manager_instance
)
except Exception as e:
logger.error(f"[Worker {worker_id}] [{profile_name}] Unexpected error during batch processing: {e}", exc_info=True)
for task in tasks:
task_id = task.get('id') or task.get('task_id')
_process_single_auth_result(
False, None, f"Unexpected batch error: {str(e)}", -1, task, task_id, profile_name, proxy_url,
worker_id, policy, state_manager, args, profile_manager_instance
)
finally:
if batch_temp_dir and os.path.exists(batch_temp_dir):
shutil.rmtree(batch_temp_dir)
for task in tasks:
task_id = task.get('id') or task.get('task_id')
if task_id:
state_manager.remove_auth_in_progress(task_id)
@ -437,24 +559,41 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
settings = policy.get('settings', {})
exec_control = policy.get('execution_control', {})
d_policy = policy.get('download_policy', {})
direct_policy = policy.get('direct_docker_cli_policy', {})
queue_policy = policy.get('queue_policy', {})
profile_prefix = d_policy.get('profile_prefix')
if not profile_prefix:
logger.error(f"[Worker {worker_id}] Queue download mode requires a profile prefix, but none was provided to the worker. Check 'execution_control' in policy. Worker exiting.")
return []
# --- Docker specific config for download worker ---
image_name = direct_policy.get('docker_image_name')
host_mount_path = direct_policy.get('docker_host_mount_path')
container_mount_path = direct_policy.get('docker_container_mount_path')
host_download_path = direct_policy.get('docker_host_download_path')
container_download_path = direct_policy.get('docker_container_download_path')
network_name = direct_policy.get('docker_network_name')
if not all([image_name, host_mount_path, container_mount_path, host_download_path, container_download_path]):
logger.error(f"[Worker {worker_id}] Queue docker download mode requires all docker_* keys in 'direct_docker_cli_policy'. Worker exiting.")
return []
try:
os.makedirs(host_mount_path, exist_ok=True)
os.makedirs(host_download_path, exist_ok=True)
except OSError as e:
logger.error(f"[Worker {worker_id}] Could not create required host directories: {e}. Worker exiting.")
return []
task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
temp_task_dir = None # Used by dummy mode
temp_config_dir_host = None # Used by docker mode
was_banned_by_parser = False
task = None
auth_profile_name, auth_env = None, None
@ -594,105 +733,132 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
# race conditions between workers processing different formats for the
# same video. The files can be cleaned up manually between runs.
else:
retcode = -1; stdout = ""; stderr = ""; stop_reason = None
live_success_count, live_failure_count, live_tolerated_count = 0, 0, 0
'--load-info-json', info_json_path,
'-f', format_id
]
if proxy_url:
cmd.extend(['--proxy', proxy_url])
if output_dir:
cmd.extend(['--output-dir', output_dir])
extra_args = d_policy.get('extra_args')
if extra_args:
cmd.extend(shlex.split(extra_args))
logger.info(f"[Worker {worker_id}] Running command: {' '.join(shlex.quote(s) for s in cmd)}") # --- Pre-flight checks ---
retcode, stdout, stderr = run_command( if d_policy.get('check_url_expiration', True):
cmd, running_processes, process_lock, stream_output=args.verbose, try:
stream_prefix=f"[Worker {worker_id} | download] " with open(info_json_path, 'r', encoding='utf-8') as f: info_data = json.load(f)
first_format = next((f for f in info_data.get('formats', []) if 'url' in f), None)
if first_format:
status, _ = sp_utils.check_url_expiry(first_format['url'], d_policy.get('expire_time_shift_minutes', 0))
if status == 'expired': stderr = "Download URL is expired"
except (IOError, json.JSONDecodeError) as e: stderr = f"Could not read info.json: {e}"
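# If stderr was set here (expired URL or unreadable info.json), the docker run below is
# skipped and the task is later reported as skipped/failed based on the stderr text.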
if not stderr:
relative_task_path = os.path.relpath(os.path.abspath(info_json_path), host_mount_path)
task_path_container = os.path.join(container_mount_path, relative_task_path)
temp_config_dir_host = tempfile.mkdtemp(prefix=f"docker-dl-config-{worker_id}-", dir=host_mount_path)
config_dir_container = os.path.join(container_mount_path, os.path.basename(temp_config_dir_host))
base_config_content = ""
base_config_file = direct_policy.get('ytdlp_config_file')
if base_config_file:
config_path_to_read = Path(base_config_file)
if not config_path_to_read.exists(): config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file
if config_path_to_read.exists():
try:
with open(config_path_to_read, 'r', encoding='utf-8') as f: base_config_content = f.read()
except IOError as e: logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}")
config_overrides = direct_policy.get('ytdlp_config_overrides', {}).copy()
config_overrides.update({
'proxy': proxy_url, 'load-info-json': task_path_container,
'output': os.path.join(container_download_path, '%(id)s.f%(format_id)s.%(ext)s'),
'format': format_id, 'no-cache-dir': True
})
overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides)
raw_args_content = '\n'.join(direct_policy.get('ytdlp_raw_args', []))
config_content = f"{base_config_content.strip()}\n\n# --- Overrides ---\n{overrides_content}\n{raw_args_content}"
ytdlp_config_dir_host = os.path.join(temp_config_dir_host, 'yt-dlp')
os.makedirs(ytdlp_config_dir_host, exist_ok=True)
with open(os.path.join(ytdlp_config_dir_host, 'config'), 'w', encoding='utf-8') as f: f.write(config_content)
volumes = {
os.path.abspath(host_mount_path): {'bind': container_mount_path, 'mode': 'ro'},
os.path.abspath(host_download_path): {'bind': container_download_path, 'mode': 'rw'}
}
command = ['yt-dlp', '--config-locations', os.path.join(config_dir_container, 'yt-dlp/config')]
activity_lock = threading.Lock()
tolerated_error_patterns = direct_policy.get('tolerated_error_patterns', [])
fatal_error_patterns = direct_policy.get('fatal_error_patterns', [])
def log_parser_callback(line):
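# Callback contract as used by run_docker_container here: returning True stops the
# container early (e.g. after banning the profile on a fatal error), False keeps it running.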
nonlocal live_success_count, live_failure_count, live_tolerated_count, was_banned_by_parser
if '[download] 100% of' in line or 'has already been downloaded' in line:
with activity_lock: live_success_count += 1; profile_manager_instance.record_activity(profile_name, 'download', is_dummy=False)
return False
for pattern in fatal_error_patterns:
if re.search(pattern, line, re.IGNORECASE):
with activity_lock:
live_failure_count += 1; profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=False)
if direct_policy.get('ban_on_fatal_error_in_batch'):
profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during download')
was_banned_by_parser = True; return True
return False
if 'ERROR:' not in line: return False
for pattern in tolerated_error_patterns:
if re.search(pattern, line, re.IGNORECASE):
with activity_lock: live_tolerated_count += 1; profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=False)
return False
with activity_lock: live_failure_count += 1; profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=False)
return False
retcode, stdout, stderr, stop_reason = run_docker_container(
image_name=image_name, command=command, volumes=volumes, stream_prefix=f"[Worker {worker_id} | docker-ytdlp] ",
network_name=network_name, log_callback=log_parser_callback, profile_manager=profile_manager_instance,
profile_name=profile_name, environment={}
) )
success = live_success_count > 0 or (retcode == 0 and not stop_reason and live_failure_count == 0)
if success:
downloaded_filepath = None
if d_policy.get('output_to_airflow_ready_dir'):
base_path = d_policy.get('airflow_ready_dir_base_path')
if not base_path:
logger.error(f"[Worker {worker_id}] 'output_to_airflow_ready_dir' is true but 'airflow_ready_dir_base_path' is not set.")
else:
try:
ts = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S_%f')
final_dir = os.path.join(base_path, ts + '_' + (video_id or 'unknown_video'))
os.makedirs(final_dir, exist_ok=True)
if info_json_path and os.path.exists(info_json_path):
shutil.copy(info_json_path, os.path.join(final_dir, os.path.basename(info_json_path)))
downloaded_files = [f for f in os.listdir(host_download_path) if not f.endswith(('.part', '.ytdl'))]
if not downloaded_files: logger.warning(f"No media files found in '{host_download_path}' for video '{video_id}'.")
for filename in downloaded_files:
final_media_path = os.path.join(final_dir, filename)
shutil.move(os.path.join(host_download_path, filename), final_media_path)
if downloaded_filepath is None: downloaded_filepath = final_media_path
except Exception as e: logger.error(f"Failed to move files to Airflow-ready dir: {e}", exc_info=True)
state_manager.report_download_success(task_id, {
"video_id": video_id, "url": url, "format_id": format_id, "profile_name": profile_name, "proxy_url": proxy_url,
"downloaded_filepath": downloaded_filepath, "info_json_path": info_json_path
})
logger.info(f"[Worker {worker_id}] Download successful: {video_id or url} format {format_id}")
if d_policy.get('rename_source_info_json_on_success'):
source_path = task.get('info_json_path')
if source_path and os.path.exists(source_path):
try: shutil.move(source_path, source_path + ".processed")
except Exception as e: logger.warning(f"Could not rename source info.json '{source_path}': {e}")
else:
is_unavailable = "unavailable" in stderr.lower() or "expired" in stderr.lower()
is_format_error = "requested format not available" in stderr
if is_unavailable or is_format_error:
reason = "Video/URL unavailable or expired" if is_unavailable else "Format not available"
state_manager.report_download_skipped(task_id, {"video_id": video_id, "url": url, "format_id": format_id, "reason": reason, "stderr": stderr})
else:
state_manager.report_download_failure(task_id, {
"video_id": video_id, "url": url, "format_id": format_id, "error_type": f"Exit code {retcode}",
"stderr": stderr, "exit_code": retcode, "original_task": task
})
except Exception as e:
@ -722,6 +888,9 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in task metadata. Pending downloads counter will not be decremented.") logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in task metadata. Pending downloads counter will not be decremented.")
if locked_profile: if locked_profile:
if was_banned_by_parser:
logger.info(f"[Worker {worker_id}] Profile '{locked_profile['name']}' was banned by log parser. Skipping unlock.")
else:
cooldown = None cooldown = None
cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds')
if cooldown_config: if cooldown_config:
@ -732,7 +901,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
elif isinstance(val, int):
cooldown = val
except (json.JSONDecodeError, TypeError):
if isinstance(cooldown_config, str) and cooldown_config.isdigit():
cooldown = int(cooldown_config)
if cooldown:
@ -747,6 +916,9 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
if temp_task_dir and os.path.exists(temp_task_dir):
shutil.rmtree(temp_task_dir)
if temp_config_dir_host and os.path.exists(temp_config_dir_host):
shutil.rmtree(temp_config_dir_host)
if task and task_id:
state_manager.remove_download_in_progress(task_id)

View File

@ -130,6 +130,47 @@ process_lock = threading.Lock()
logger = logging.getLogger('stress_policy_tool')
def _discover_worker_pools(discovery_config, manager_for_discovery):
"""
Discovers worker pools by scanning profile prefixes in Redis.
Returns a list of worker pool configurations or None on error.
"""
discovery_pattern = discovery_config.get('profile_prefix_pattern')
workers_per_group = discovery_config.get('workers_per_profile_group', 1)
if not discovery_pattern:
logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.")
return None
logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
try:
all_profiles = manager_for_discovery.list_profiles()
found_prefixes = set()
for profile in all_profiles:
profile_name = profile['name']
if fnmatch.fnmatch(profile_name, discovery_pattern):
# Assuming standard name format like 'user31_001', extract 'user31'
prefix = profile_name.rsplit('_', 1)[0]
found_prefixes.add(prefix)
if not found_prefixes:
logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
return []
else:
worker_pools = []
for prefix in sorted(list(found_prefixes)):
worker_pools.append({
'profile_prefix': prefix,
'workers': workers_per_group
})
logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
logger.info("Note: Profile group discovery runs once at startup. A restart is required to detect new profile groups.")
return worker_pools
except Exception as e:
logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
return None
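# Illustrative policy snippet consumed by _discover_worker_pools (keys are the ones read
# above; the pattern value and nesting are an example only):
#
#   execution_control:
#     worker_pool_discovery:
#       profile_prefix_pattern: "user*_*"
#       workers_per_profile_group: 2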
def main_stress_policy(args):
"""Main logic for the 'stress-policy' command."""
if args.list_policies:
@ -362,10 +403,21 @@ def main_stress_policy(args):
logger.info(f"\nSignal {signum} received, shutting down gracefully...") logger.info(f"\nSignal {signum} received, shutting down gracefully...")
shutdown_event.set() shutdown_event.set()
# Propagate signal to child processes to allow them to shut down gracefully.
with process_lock:
if running_processes:
logger.info(f"Propagating signal to {len(running_processes)} running subprocess(es)...")
for p in running_processes:
try:
# Send the same signal to the entire process group.
os.killpg(os.getpgid(p.pid), signum)
except (ProcessLookupError, PermissionError):
pass # Process already finished or we lack permissions
# Save state immediately to prevent loss on interrupt.
logger.info("Attempting to save state before shutdown...")
state_manager.close()
logger.info("Shutdown requested. Signalling in-progress tasks to terminate gracefully. No new tasks will be started. Press Ctrl+C again to force exit.")
else:
logger.info("Second signal received, forcing exit.")
# On second signal, forcefully terminate subprocesses.
@ -579,6 +631,7 @@ def main_stress_policy(args):
logger.info(f"Starting/resuming from URL index {start_index + 1}.") logger.info(f"Starting/resuming from URL index {start_index + 1}.")
# The worker's get_next_url_batch will respect this starting index. # The worker's get_next_url_batch will respect this starting index.
logger.info(f"Task source file: {os.path.abspath(urls_file)}")
sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list) sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list)
if args.dry_run: return 0 if args.dry_run: return 0
@ -586,13 +639,8 @@ def main_stress_policy(args):
worker_pools = exec_control.get('worker_pools', [])
discovery_config = exec_control.get('worker_pool_discovery')
if discovery_config and not worker_pools:
logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
direct_policy = policy.get('direct_batch_cli_policy', {})
use_env = direct_policy.get('use_profile_env', 'auth')
manager_for_discovery = profile_managers.get(use_env)
@ -601,35 +649,10 @@ def main_stress_policy(args):
logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
return 1 return 1
if not discovery_pattern: discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") if discovered_pools is None:
return 1 return 1 # An error occurred during discovery
worker_pools = discovered_pools
logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
try:
all_profiles = manager_for_discovery.list_profiles()
found_prefixes = set()
for profile in all_profiles:
profile_name = profile['name']
if fnmatch.fnmatch(profile_name, discovery_pattern):
# Assuming standard name format like 'user31_001', extract 'user31'
prefix = profile_name.rsplit('_', 1)[0]
found_prefixes.add(prefix)
if not found_prefixes:
logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
worker_pools = []
else:
worker_pools = []
for prefix in sorted(list(found_prefixes)):
worker_pools.append({
'profile_prefix': prefix,
'workers': workers_per_group
})
logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
except Exception as e:
logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
return 1
if not worker_pools and exec_control.get('workers'):
# Fallback for legacy 'workers: N' config
@ -641,24 +664,60 @@ def main_stress_policy(args):
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.")
cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()}
original_pool_count = len(worker_pools)
filtered_pools = []
for pool in worker_pools:
pool_prefixes_str = pool.get('profile_prefix', '')
if not pool_prefixes_str:
continue
pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()}
if pool_prefixes.intersection(cli_prefixes):
filtered_pools.append(pool)
if len(filtered_pools) < original_pool_count:
logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.")
worker_pools = filtered_pools
if not worker_pools:
logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.")
worker_specs = []
if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1:
logger.info("Single worker mode: aggregating all discovered profile groups for the worker.")
all_prefixes = []
for pool in worker_pools:
prefix_str = pool.get('profile_prefix', '')
if prefix_str:
all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip())
final_prefix_str = ','.join(sorted(list(set(all_prefixes))))
logger.info(f"Single worker will manage profile groups: {final_prefix_str}")
worker_policy = deepcopy(policy)
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = final_prefix_str
worker_specs.append({
'func': run_direct_batch_worker,
'args': (0, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
})
else:
worker_id_counter = 0
for pool in worker_pools:
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
if not prefix_str:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
for i in range(pool_workers):
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
worker_policy = deepcopy(policy)
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_direct_batch_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
@ -742,6 +801,7 @@ def main_stress_policy(args):
if start_index > 0:
logger.info(f"Starting/resuming from URL index {start_index + 1}.")
logger.info(f"Task source file: {os.path.abspath(urls_file)}")
sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list)
if args.dry_run: return 0
@ -749,13 +809,8 @@ def main_stress_policy(args):
worker_pools = exec_control.get('worker_pools', [])
discovery_config = exec_control.get('worker_pool_discovery')
if discovery_config and not worker_pools:
logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
direct_policy = policy.get('direct_docker_cli_policy', {})
use_env = direct_policy.get('use_profile_env', 'auth' if mode == 'fetch_only' else 'download')
manager_for_discovery = profile_managers.get(use_env)
@ -764,35 +819,10 @@ def main_stress_policy(args):
logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
return 1 return 1
if not discovery_pattern: discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") if discovered_pools is None:
return 1 return 1 # An error occurred
worker_pools = discovered_pools
logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
try:
all_profiles = manager_for_discovery.list_profiles()
found_prefixes = set()
for profile in all_profiles:
profile_name = profile['name']
if fnmatch.fnmatch(profile_name, discovery_pattern):
# Assuming standard name format like 'user31_001', extract 'user31'
prefix = profile_name.rsplit('_', 1)[0]
found_prefixes.add(prefix)
if not found_prefixes:
logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
worker_pools = []
else:
worker_pools = []
for prefix in sorted(list(found_prefixes)):
worker_pools.append({
'profile_prefix': prefix,
'workers': workers_per_group
})
logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
except Exception as e:
logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
return 1
if not worker_pools and exec_control.get('workers'):
# Fallback for legacy 'workers: N' config
@ -804,22 +834,59 @@ def main_stress_policy(args):
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.")
cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()}
original_pool_count = len(worker_pools)
filtered_pools = []
for pool in worker_pools:
pool_prefixes_str = pool.get('profile_prefix', '')
if not pool_prefixes_str:
continue
pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()}
if pool_prefixes.intersection(cli_prefixes):
filtered_pools.append(pool)
if len(filtered_pools) < original_pool_count:
logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.")
worker_pools = filtered_pools
if not worker_pools:
logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.")
worker_specs = []
if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1:
logger.info("Single worker mode: aggregating all discovered profile groups for the worker.")
all_prefixes = []
for pool in worker_pools:
prefix_str = pool.get('profile_prefix', '')
if prefix_str:
all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip())
final_prefix_str = ','.join(sorted(list(set(all_prefixes))))
logger.info(f"Single worker will manage profile groups: {final_prefix_str}")
worker_policy = deepcopy(policy)
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = final_prefix_str
worker_specs.append({
'func': run_direct_docker_worker,
'args': (0, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
})
else:
worker_id_counter = 0
for pool in worker_pools:
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
if not prefix_str:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
@ -847,6 +914,7 @@ def main_stress_policy(args):
logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}") logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}")
return 1 return 1
logger.info(f"Task source directory: {os.path.abspath(info_json_dir)}")
sp_utils.display_effective_policy(policy, policy_name, args, sources=[]) sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0 if args.dry_run: return 0
@ -854,13 +922,8 @@ def main_stress_policy(args):
worker_pools = exec_control.get('worker_pools', [])
discovery_config = exec_control.get('worker_pool_discovery')
if discovery_config and not worker_pools:
logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
direct_policy = policy.get('direct_docker_cli_policy', {})
use_env = direct_policy.get('use_profile_env', 'auth' if mode == 'fetch_only' else 'download')
manager_for_discovery = profile_managers.get(use_env)
@ -869,35 +932,10 @@ def main_stress_policy(args):
logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
return 1 return 1
if not discovery_pattern: discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") if discovered_pools is None:
return 1 return 1 # An error occurred
worker_pools = discovered_pools
logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
try:
all_profiles = manager_for_discovery.list_profiles()
found_prefixes = set()
for profile in all_profiles:
profile_name = profile['name']
if fnmatch.fnmatch(profile_name, discovery_pattern):
# Assuming standard name format like 'user31_001', extract 'user31'
prefix = profile_name.rsplit('_', 1)[0]
found_prefixes.add(prefix)
if not found_prefixes:
logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
worker_pools = []
else:
worker_pools = []
for prefix in sorted(list(found_prefixes)):
worker_pools.append({
'profile_prefix': prefix,
'workers': workers_per_group
})
logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
except Exception as e:
logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
return 1
if not worker_pools and exec_control.get('workers'):
# Fallback for legacy 'workers: N' config
@ -909,22 +947,59 @@ def main_stress_policy(args):
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.")
cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()}
original_pool_count = len(worker_pools)
filtered_pools = []
for pool in worker_pools:
pool_prefixes_str = pool.get('profile_prefix', '')
if not pool_prefixes_str:
continue
pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()}
if pool_prefixes.intersection(cli_prefixes):
filtered_pools.append(pool)
if len(filtered_pools) < original_pool_count:
logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.")
worker_pools = filtered_pools
if not worker_pools:
logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.")
worker_specs = []
if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1:
logger.info("Single worker mode: aggregating all discovered profile groups for the worker.")
all_prefixes = []
for pool in worker_pools:
prefix_str = pool.get('profile_prefix', '')
if prefix_str:
all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip())
final_prefix_str = ','.join(sorted(list(set(all_prefixes))))
logger.info(f"Single worker will manage profile groups: {final_prefix_str}")
worker_policy = deepcopy(policy)
worker_policy.setdefault('download_policy', {})['profile_prefix'] = final_prefix_str
worker_specs.append({
'func': run_direct_docker_download_worker,
'args': (0, worker_policy, state_manager, args, profile_manager_instance, running_processes, process_lock)
})
else:
worker_id_counter = 0 worker_id_counter = 0
for pool in worker_pools: for pool in worker_pools:
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
pool_workers = pool.get('workers', 1) pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '') prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] if not prefix_str:
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue continue
# Each worker in the pool gets the full list of prefixes from the pool configuration. # Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers): for i in range(pool_workers):
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
worker_policy = deepcopy(policy) worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes. # The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
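Note the semantic change to `--profile-prefix`: it no longer overwrites every pool's prefix; it now filters out pools whose prefix set does not intersect the CLI value, and a matching pool keeps its full prefix list. A self-contained illustration of the intersection check, using made-up pool data:

```python
pools = [
    {'profile_prefix': 'user31,user32', 'workers': 2},  # invented example pools
    {'profile_prefix': 'user40', 'workers': 1},
]
cli_prefixes = {'user32'}  # e.g. --profile-prefix user32

# A pool survives if any of its prefixes matches a CLI prefix.
filtered = [
    p for p in pools
    if {s.strip() for s in p['profile_prefix'].split(',') if s.strip()} & cli_prefixes
]
print(filtered)  # -> [{'profile_prefix': 'user31,user32', 'workers': 2}]
```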
@@ -968,6 +1043,7 @@ def main_stress_policy(args):
             logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}")
             return 1
+    logger.info(f"Task source directory: {os.path.abspath(info_json_dir)}")
     sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
     if args.dry_run: return 0
@@ -975,48 +1051,18 @@ def main_stress_policy(args):
     worker_pools = exec_control.get('worker_pools', [])
     discovery_config = exec_control.get('worker_pool_discovery')
-    if discovery_config:
-        if worker_pools:
-            logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.")
-        discovery_pattern = discovery_config.get('profile_prefix_pattern')
-        workers_per_group = discovery_config.get('workers_per_profile_group', 1)
+    if discovery_config and not worker_pools:
+        logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
         manager_for_discovery = profile_managers.get('download')
         if not manager_for_discovery:
             logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
             return 1
-        if not discovery_pattern:
-            logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.")
-            return 1
-        logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
-        try:
-            all_profiles = manager_for_discovery.list_profiles()
-            found_prefixes = set()
-            for profile in all_profiles:
-                profile_name = profile['name']
-                if fnmatch.fnmatch(profile_name, discovery_pattern):
-                    # Assuming standard name format like 'user31_001', extract 'user31'
-                    prefix = profile_name.rsplit('_', 1)[0]
-                    found_prefixes.add(prefix)
-            if not found_prefixes:
-                logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
-                worker_pools = []
-            else:
-                worker_pools = []
-                for prefix in sorted(list(found_prefixes)):
-                    worker_pools.append({
-                        'profile_prefix': prefix,
-                        'workers': workers_per_group
-                    })
-                logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
-        except Exception as e:
-            logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
-            return 1
+        discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
+        if discovered_pools is None:
+            return 1  # An error occurred
+        worker_pools = discovered_pools
     if not worker_pools and exec_control.get('workers'):
         # Fallback for legacy 'workers: N' config
@@ -1028,22 +1074,59 @@ def main_stress_policy(args):
             return 1
     if args.profile_prefix:
-        logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
+        logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.")
+        cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()}
+        original_pool_count = len(worker_pools)
+        filtered_pools = []
         for pool in worker_pools:
-            pool['profile_prefix'] = args.profile_prefix
+            pool_prefixes_str = pool.get('profile_prefix', '')
+            if not pool_prefixes_str:
+                continue
+            pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()}
+            if pool_prefixes.intersection(cli_prefixes):
+                filtered_pools.append(pool)
+        if len(filtered_pools) < original_pool_count:
+            logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.")
+        worker_pools = filtered_pools
+        if not worker_pools:
+            logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.")
     worker_specs = []
+    if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1:
+        logger.info("Single worker mode: aggregating all discovered profile groups for the worker.")
+        all_prefixes = []
+        for pool in worker_pools:
+            prefix_str = pool.get('profile_prefix', '')
+            if prefix_str:
+                all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip())
+        final_prefix_str = ','.join(sorted(list(set(all_prefixes))))
+        logger.info(f"Single worker will manage profile groups: {final_prefix_str}")
+        worker_policy = deepcopy(policy)
+        worker_policy.setdefault('download_policy', {})['profile_prefix'] = final_prefix_str
+        worker_specs.append({
+            'func': run_direct_download_worker,
+            'args': (0, worker_policy, state_manager, args, download_manager, running_processes, process_lock)
+        })
+    else:
         worker_id_counter = 0
         for pool in worker_pools:
+            if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+                break
             pool_workers = pool.get('workers', 1)
             prefix_str = pool.get('profile_prefix', '')
-            prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
-            if not prefixes:
+            if not prefix_str:
                 logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
                 continue
             # Each worker in the pool gets the full list of prefixes from the pool configuration.
             for i in range(pool_workers):
+                if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+                    break
                 worker_policy = deepcopy(policy)
                 # The worker functions will now handle a comma-separated list of prefixes.
                 worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
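The single-worker branch above merges every pool's prefixes into one comma-separated string for the lone worker. A small illustration of the aggregation (profile group names are invented):

```python
pools = [{'profile_prefix': 'user31'}, {'profile_prefix': 'user32,user33'}]

all_prefixes = []
for pool in pools:
    s = pool.get('profile_prefix', '')
    if s:
        all_prefixes.extend(p.strip() for p in s.split(',') if p.strip())

# Deduplicate and sort so the worker gets a stable, canonical prefix list.
final_prefix_str = ','.join(sorted(set(all_prefixes)))
print(final_prefix_str)  # -> 'user31,user32,user33'
```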
@@ -1129,48 +1212,18 @@ def main_stress_policy(args):
     worker_pools = exec_control.get('worker_pools', [])
     discovery_config = exec_control.get('worker_pool_discovery')
-    if discovery_config:
-        if worker_pools:
-            logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.")
-        discovery_pattern = discovery_config.get('profile_prefix_pattern')
-        workers_per_group = discovery_config.get('workers_per_profile_group', 1)
+    if discovery_config and not worker_pools:
+        logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
         manager_for_discovery = profile_managers.get('auth')
         if not manager_for_discovery:
             logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
             return 1
-        if not discovery_pattern:
-            logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.")
-            return 1
-        logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
-        try:
-            all_profiles = manager_for_discovery.list_profiles()
-            found_prefixes = set()
-            for profile in all_profiles:
-                profile_name = profile['name']
-                if fnmatch.fnmatch(profile_name, discovery_pattern):
-                    # Assuming standard name format like 'user31_001', extract 'user31'
-                    prefix = profile_name.rsplit('_', 1)[0]
-                    found_prefixes.add(prefix)
-            if not found_prefixes:
-                logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
-                worker_pools = []
-            else:
-                worker_pools = []
-                for prefix in sorted(list(found_prefixes)):
-                    worker_pools.append({
-                        'profile_prefix': prefix,
-                        'workers': workers_per_group
-                    })
-                logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
-        except Exception as e:
-            logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
-            return 1
+        discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
+        if discovered_pools is None:
+            return 1  # An error occurred
+        worker_pools = discovered_pools
     if not worker_pools and exec_control.get('workers'):
         # Fallback for legacy 'workers: N' config
@@ -1189,6 +1242,8 @@ def main_stress_policy(args):
     worker_specs = []
     worker_id_counter = 0
     for pool in worker_pools:
+        if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+            break
         pool_workers = pool.get('workers', 1)
         prefix_str = pool.get('profile_prefix', '')
         prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
@@ -1198,6 +1253,8 @@ def main_stress_policy(args):
         # Each worker in the pool gets the full list of prefixes from the pool configuration.
         for i in range(pool_workers):
+            if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+                break
             worker_policy = deepcopy(policy)
             # The worker functions will now handle a comma-separated list of prefixes.
             worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
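The repeated `args.workers` guard caps the total number of worker specs both at pool boundaries and before each worker inside a pool. A toy model of the capping logic (not the real spec-building code) shows the effect: three pools of two workers with a cap of four yield specs from only the first two pools:

```python
def build_specs(pools, max_workers=None):
    """Toy model of the capping behaviour: stop once max_workers specs exist."""
    specs = []
    for pool in pools:
        if max_workers is not None and len(specs) >= max_workers:
            break
        for _ in range(pool.get('workers', 1)):
            if max_workers is not None and len(specs) >= max_workers:
                break
            specs.append(pool['profile_prefix'])
    return specs


print(build_specs([{'profile_prefix': 'a', 'workers': 2},
                   {'profile_prefix': 'b', 'workers': 2},
                   {'profile_prefix': 'c', 'workers': 2}], max_workers=4))
# -> ['a', 'a', 'b', 'b']
```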
@@ -1285,48 +1342,18 @@ def main_stress_policy(args):
     worker_pools = exec_control.get('worker_pools', [])
     discovery_config = exec_control.get('worker_pool_discovery')
-    if discovery_config:
-        if worker_pools:
-            logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.")
-        discovery_pattern = discovery_config.get('profile_prefix_pattern')
-        workers_per_group = discovery_config.get('workers_per_profile_group', 1)
+    if discovery_config and not worker_pools:
+        logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
         manager_for_discovery = profile_managers.get('download')
         if not manager_for_discovery:
             logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
             return 1
-        if not discovery_pattern:
-            logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.")
-            return 1
-        logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
-        try:
-            all_profiles = manager_for_discovery.list_profiles()
-            found_prefixes = set()
-            for profile in all_profiles:
-                profile_name = profile['name']
-                if fnmatch.fnmatch(profile_name, discovery_pattern):
-                    # Assuming standard name format like 'user31_001', extract 'user31'
-                    prefix = profile_name.rsplit('_', 1)[0]
-                    found_prefixes.add(prefix)
-            if not found_prefixes:
-                logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
-                worker_pools = []
-            else:
-                worker_pools = []
-                for prefix in sorted(list(found_prefixes)):
-                    worker_pools.append({
-                        'profile_prefix': prefix,
-                        'workers': workers_per_group
-                    })
-                logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
-        except Exception as e:
-            logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
-            return 1
+        discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
+        if discovered_pools is None:
+            return 1  # An error occurred
+        worker_pools = discovered_pools
     if not worker_pools and exec_control.get('workers'):
         # Fallback for legacy 'workers: N' config
@@ -1345,6 +1372,8 @@ def main_stress_policy(args):
     worker_specs = []
     worker_id_counter = 0
     for pool in worker_pools:
+        if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+            break
         pool_workers = pool.get('workers', 1)
         prefix_str = pool.get('profile_prefix', '')
         prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
@@ -1354,6 +1383,8 @@ def main_stress_policy(args):
         # Each worker in the pool gets the full list of prefixes from the pool configuration.
         for i in range(pool_workers):
+            if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+                break
             worker_policy = deepcopy(policy)
             # The worker functions will now handle a comma-separated list of prefixes.
             worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
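For reference, the execution-control keys consumed by these hunks can be combined as follows. The key names come from the code above; the values and the glob pattern are purely illustrative, shown as the Python dict the code reads (in practice this lives in a YAML policy file):

```python
# Illustrative execution-control section, mirroring the keys read above.
exec_control = {
    # Either list pools explicitly ...
    'worker_pools': [
        {'profile_prefix': 'user31', 'workers': 2},
        {'profile_prefix': 'user32,user33', 'workers': 1},
    ],
    # ... or, when 'worker_pools' is empty, fall back to discovery from Redis:
    'worker_pool_discovery': {
        'profile_prefix_pattern': 'user*_*',   # illustrative glob
        'workers_per_profile_group': 1,
    },
    # Legacy fallback used only when neither of the above yields pools:
    'workers': 4,
}
```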