diff --git a/ansible/playbook-README.md b/ansible/playbook-README.md index 4567cf4..506bc69 100644 --- a/ansible/playbook-README.md +++ b/ansible/playbook-README.md @@ -108,12 +108,20 @@ ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.gree ### Profile Management +The `cleanup-profiles` action can be used to remove profiles from Redis. By default, it cleans up "ungrouped" profiles. + ```bash -# Clean up all profiles +# Perform a dry run of cleaning up ungrouped profiles +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "dry_run=true" + +# Clean up ungrouped profiles ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -# Clean up specific profile prefix -ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "profile_prefix=user1" +# To clean up ALL profiles (destructive), set cleanup_mode=full +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "cleanup_mode=full" + +# You can specify a custom setup policy file for cleanup operations +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "setup_policy=policies/my_custom_setup_policy.yaml" ``` ## Monitoring and Inspection @@ -157,7 +165,19 @@ Then, from the jump host, you can sync code or policies to the cluster nodes: ansible-playbook ansible/playbook-stress-sync-code.yml -i ansible/inventory.green.ini # Sync only policies and CLI configs -ansible-playbook ansible/playbook-stress-sync-configs.yml -i ansible/inventory.green.ini +ansible-playbook ansible/playbook-stress-sync-policies.yml -i ansible/inventory.green.ini + +# To sync files from a custom source directory on the Ansible controller, use the 'source_base_dir' extra variable: +ansible-playbook ansible/playbook-stress-sync-policies.yml -i ansible/inventory.green.ini -e "source_base_dir=/path/to/my-custom-source" +``` + +### Docker Image Updates + +To update the `yt-dlp` docker image used by download simulators, run the following playbook. This builds the image locally on each worker node. + +```bash +# Build the yt-dlp docker image locally on each worker node +ansible-playbook ansible/playbook-update-yt-dlp-docker.yml -i ansible/inventory.green.ini ``` ### Adding a New Worker @@ -225,7 +245,10 @@ ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green. ### Restart Enforcer and Monitoring ```bash -# Restart monitoring and enforcer on master +# Restart monitoring and enforcer on master using default policies ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=restart-monitoring" + +# Restart using a custom enforcer policy +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=restart-monitoring" -e "enforcer_policy=policies/my_other_enforcer.yaml" ``` diff --git a/ansible/playbook-full-install.yml b/ansible/playbook-full-install.yml index f6d33d5..d42c326 100644 --- a/ansible/playbook-full-install.yml +++ b/ansible/playbook-full-install.yml @@ -1,4 +1,35 @@ --- +# ------------------------------------------------------------------------------------------------- +# PHASE 0: Fix Python Dependencies +# Ensures remote hosts have compatible Python libraries to prevent module failures. 
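+# The first attempt runs a plain `pip3 install --upgrade`; on PEP 668 "externally managed"
+# systems (e.g. Debian 12 / Ubuntu 23.04+) pip refuses system-wide installs, so a retry
+# with --break-system-packages is issued only when that specific error is reported.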
+# ------------------------------------------------------------------------------------------------- +- name: "PHASE 0: Upgrade Python SSL libraries" + hosts: all + gather_facts: no + tasks: + - name: Attempt to upgrade pyOpenSSL, cryptography and urllib3 + ansible.builtin.shell: + cmd: "pip3 install --upgrade pyOpenSSL cryptography urllib3" + become: yes + register: pip_upgrade_result + changed_when: "'Successfully installed' in pip_upgrade_result.stdout" + failed_when: false + + - name: Retry upgrade with --break-system-packages on specific error + ansible.builtin.shell: + cmd: "pip3 install --upgrade pyOpenSSL cryptography urllib3 --break-system-packages" + become: yes + register: pip_upgrade_result_retry + changed_when: "'Successfully installed' in pip_upgrade_result_retry.stdout" + when: pip_upgrade_result.rc != 0 and 'externally-managed-environment' in pip_upgrade_result.stderr + + - name: Fail if package upgrade did not succeed + ansible.builtin.fail: + msg: "Failed to upgrade Python packages after retry. Last error: {{ pip_upgrade_result_retry.stderr | default(pip_upgrade_result.stderr) }}" + when: > + pip_upgrade_result.rc != 0 and + (pip_upgrade_result_retry is not defined or pip_upgrade_result_retry.rc != 0) + # This playbook provides a complete installation for fresh nodes. # It can install either master or worker roles, or both on the same machine. # @@ -55,57 +86,10 @@ # ------------------------------------------------------------------------------------------------- # PHASE 4: Build yt-dlp Docker Image -# Builds the yt-dlp container from bin/ directory +# Builds the yt-dlp container on each worker node using the dedicated playbook. # ------------------------------------------------------------------------------------------------- -- name: "PHASE 4: Build yt-dlp Docker image" - hosts: all - gather_facts: no - vars_files: - - "group_vars/all/vault.yml" - pre_tasks: - - name: Set inventory_env fact - ansible.builtin.set_fact: - inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" - - name: Load environment-specific variables - ansible.builtin.include_vars: "{{ item }}" - with_fileglob: - - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" - tasks: - - name: Define base directory for node - ansible.builtin.set_fact: - base_dir: "{{ airflow_master_dir if (inventory_hostname in groups['master'] and not (install_worker | default(false) | bool)) else airflow_worker_dir }}" - - - name: Ensure bin directory exists - ansible.builtin.file: - path: "{{ base_dir }}/bin" - state: directory - owner: "{{ ansible_user }}" - group: "{{ deploy_group }}" - mode: '0755' - become: yes - - - name: Check if Dockerfile exists in bin directory - ansible.builtin.stat: - path: "{{ base_dir }}/bin/Dockerfile" - register: dockerfile_stat - - - name: Build yt-dlp Docker image if Dockerfile exists - community.docker.docker_image: - name: yt-dlp-custom - tag: latest - source: build - build: - path: "{{ base_dir }}/bin" - pull: yes - state: present - force_source: yes - become: yes - when: dockerfile_stat.stat.exists - - - name: Display message if Dockerfile not found - ansible.builtin.debug: - msg: "Dockerfile not found at {{ base_dir }}/bin/Dockerfile - skipping yt-dlp image build" - when: not dockerfile_stat.stat.exists +- name: "PHASE 4: Build yt-dlp Docker image on workers" + import_playbook: playbook-update-yt-dlp-docker.yml # ------------------------------------------------------------------------------------------------- # PHASE 5: Sync Code and Install Dependencies @@ -148,6 +132,12 @@ with_fileglob: - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" tasks: + - name: Install redis-tools + ansible.builtin.apt: + name: redis-tools + state: present + become: yes + - name: Configure system performance and kernel settings ansible.builtin.copy: src: "configs/etc/sysctl.d/99-system-limits.conf" @@ -174,7 +164,7 @@ - name: Template Docker Compose file for master services ansible.builtin.template: - src: templates/docker-compose.stress-master.j2 + src: docker-compose.stress-master.j2 dest: "{{ airflow_master_dir }}/docker-compose.stress.yml" owner: "{{ ansible_user }}" group: "{{ deploy_group }}" @@ -184,9 +174,7 @@ - name: Stop and remove existing containers before starting services ansible.builtin.shell: cmd: | - docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f - docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f - docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=bgutil-provider" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f become: yes changed_when: false ignore_errors: yes @@ -200,6 +188,14 @@ remove_orphans: true become: yes + - name: Wait for Redis service to be ready + ansible.builtin.wait_for: + host: localhost + port: "{{ redis_port }}" + delay: 5 + timeout: 60 + delegate_to: "{{ inventory_hostname }}" + - name: Wait for MinIO service to be ready ansible.builtin.wait_for: host: "{{ hostvars[inventory_hostname].ansible_host }}" @@ -240,7 +236,7 @@ register: mc_mb_result failed_when: > mc_mb_result.rc != 0 and - "already exists" not in mc_mb_result.stderr + "already own it" not in mc_mb_result.stderr changed_when: mc_mb_result.rc == 0 environment: HOME: "/home/{{ ansible_user }}" @@ -264,7 +260,7 @@ tasks: - name: Template Docker Compose file for worker services ansible.builtin.template: - src: templates/docker-compose.stress-master.j2 + src: docker-compose.stress-master.j2 dest: "{{ airflow_worker_dir }}/docker-compose.stress.yml" owner: "{{ ansible_user }}" group: "{{ deploy_group }}" @@ -274,9 +270,9 @@ - 
name: Stop and remove existing containers before starting services ansible.builtin.shell: cmd: | - docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f - docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f - docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=bgutil-provider" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f + docker ps -a --filter "name=redis-stress" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f + docker ps -a --filter "name=minio-stress" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f become: yes changed_when: false ignore_errors: yes diff --git a/ansible/playbook-stress-cleanup-info-jsons.yml b/ansible/playbook-stress-cleanup-info-jsons.yml new file mode 100644 index 0000000..52fb599 --- /dev/null +++ b/ansible/playbook-stress-cleanup-info-jsons.yml @@ -0,0 +1,41 @@ +--- +- name: "STRESS-CLEANUP: Remove info.json task files from workers" + hosts: workers + gather_facts: no + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Define the directory to be cleaned + ansible.builtin.set_fact: + target_dir: "{{ airflow_worker_dir }}/run/docker_mount/info_json_tasks/direct_docker_simulation" + + - name: "Display directory being cleaned" + ansible.builtin.debug: + msg: "Cleaning directory: {{ target_dir }} on {{ inventory_hostname }}" + + - name: Remove the info_json_tasks directory + ansible.builtin.file: + path: "{{ target_dir }}" + state: absent + become: yes + + - name: Recreate the info_json_tasks directory + ansible.builtin.file: + path: "{{ target_dir }}" + state: directory + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0755' + become: yes + + - name: "Display cleanup completion" + ansible.builtin.debug: + msg: "Successfully cleaned and recreated {{ target_dir }} on {{ inventory_hostname }}" diff --git a/ansible/playbook-stress-control.yml b/ansible/playbook-stress-control.yml index 0bdfb9f..a548c88 100644 --- a/ansible/playbook-stress-control.yml +++ b/ansible/playbook-stress-control.yml @@ -1,10 +1,11 @@ --- - name: "STRESS-SETUP: Unified control for stress test processes" - hosts: all + hosts: "{{ 'master' if action in master_only_actions else 'all' }}" gather_facts: no vars: # Default action is status check action: "status" + graceful_shutdown_timeout_seconds: 30 setup_policy: "policies/6_profile_setup_policy.yaml" enforcer_policy: "policies/8_unified_simulation_enforcer.yaml" master_only_actions: @@ -80,9 +81,7 @@ if [ -f .env ]; then set -a && . 
./.env && set +a fi - timeout 10 ./bin/ytops-client profile list \ - --auth-env sim_auth \ - --download-env sim_download 2>&1 + timeout 10 ./bin/ytops-client profile list 2>&1 register: profile_list_output changed_when: false when: @@ -132,7 +131,12 @@ - name: "Display policy being used for profile cleanup" ansible.builtin.debug: - msg: "Using setup policy for cleanup: {{ setup_policy }}" + msg: >- + {% if cleanup_mode | default('ungrouped') == 'full' %} + Performing a FULL cleanup of all profiles using policy: {{ setup_policy }} + {% else %} + Cleaning up UNGROUPED profiles using policy: {{ setup_policy }}{% if dry_run | default(false) %} (DRY RUN){% endif %} + {% endif %} when: - action == "cleanup-profiles" - inventory_hostname in groups['master'] @@ -146,9 +150,15 @@ if [ -f .env ]; then set -a && . ./.env && set +a fi + {% if cleanup_mode | default('ungrouped') == 'full' %} ./bin/ytops-client setup-profiles \ --policy {{ setup_policy }} \ - --cleanup-all {% if profile_prefix is defined %}--profile-prefix {{ profile_prefix }}{% endif %} + --cleanup-all + {% else %} + ./bin/ytops-client profile cleanup-ungrouped \ + --policy-file {{ setup_policy }} \ + {% if dry_run | default(false) %}--dry-run{% endif %} + {% endif %} register: cleanup_output changed_when: false when: @@ -172,8 +182,34 @@ - cleanup_output is defined - name: Stop all stress test processes on all nodes (stop-all action) + vars: + process_pattern: "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)|[p]ython.*ytops" block: - - name: Kill all tmux sessions starting with 'stress-' + - name: "Get PIDs of running stress test processes" + ansible.builtin.shell: + cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}'" + register: pids_to_kill + changed_when: false + + - name: "Gracefully terminate stress test processes" + ansible.builtin.shell: + cmd: "kill {{ pids_to_kill.stdout_lines | join(' ') }}" + when: pids_to_kill.stdout | length > 0 + ignore_errors: true + changed_when: false + + - name: "Wait for graceful shutdown" + ansible.builtin.pause: + seconds: "{{ graceful_shutdown_timeout_seconds }}" + when: pids_to_kill.stdout | length > 0 + + - name: "Force kill any lingering stress test processes" + ansible.builtin.shell: + cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true" + ignore_errors: true + changed_when: false + + - name: "Kill all stress-related tmux sessions as a failsafe" ansible.builtin.shell: cmd: | for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do @@ -186,27 +222,37 @@ fi ignore_errors: yes changed_when: false - - - name: Kill all ytops-client and related python processes - ansible.builtin.shell: - cmd: | - # Gracefully terminate processes by pattern - ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true - ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true - - sleep 1 # Wait for graceful shutdown - - # Force kill any remaining processes - ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true - ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true - ignore_errors: yes - changed_when: false - when: action == "stop-all" - name: Stop processes on targeted nodes only (stop-nodes 
action) + vars: + process_pattern: "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)|[p]ython.*ytops" block: - - name: Kill all tmux sessions starting with 'stress-' on this node + - name: "Get PIDs of running stress test processes" + ansible.builtin.shell: + cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}'" + register: pids_to_kill + changed_when: false + + - name: "Gracefully terminate stress test processes" + ansible.builtin.shell: + cmd: "kill {{ pids_to_kill.stdout_lines | join(' ') }}" + when: pids_to_kill.stdout | length > 0 + ignore_errors: true + changed_when: false + + - name: "Wait for graceful shutdown" + ansible.builtin.pause: + seconds: "{{ graceful_shutdown_timeout_seconds }}" + when: pids_to_kill.stdout | length > 0 + + - name: "Force kill any lingering stress test processes" + ansible.builtin.shell: + cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true" + ignore_errors: true + changed_when: false + + - name: "Kill all stress-related tmux sessions as a failsafe" ansible.builtin.shell: cmd: | for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do @@ -219,22 +265,6 @@ fi ignore_errors: yes changed_when: false - - - name: Kill all ytops-client and related python processes on this node - ansible.builtin.shell: - cmd: | - # Gracefully terminate processes by pattern - ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true - ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true - - sleep 1 # Wait for graceful shutdown - - # Force kill any remaining processes - ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true - ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true - ignore_errors: yes - changed_when: false - when: action == "stop-nodes" - name: Restart monitoring and enforcer (restart-monitoring action) @@ -263,8 +293,6 @@ working_dir: "{{ airflow_master_dir }}" command_to_run: > ./bin/ytops-client profile list - --auth-env sim_auth - --download-env sim_download --live --no-blink --show-reasons diff --git a/ansible/playbook-stress-download-simulation.yml b/ansible/playbook-stress-download-simulation.yml index 19134aa..22333be 100644 --- a/ansible/playbook-stress-download-simulation.yml +++ b/ansible/playbook-stress-download-simulation.yml @@ -3,7 +3,7 @@ hosts: workers gather_facts: no vars: - tmux_session_download: "stress-download-{{ (profile_prefix | default('default')) | replace(',', '-') }}" + tmux_session_download: "{{ 'stress-download-worker-' + (worker_num | string) if (profile_prefix | default('') == 'worker' and worker_num is defined) else 'stress-download-' + (profile_prefix | default('default')) | replace(',', '-') }}" download_policy: "policies/11_direct_docker_download_simulation.yaml" vars_files: - "group_vars/all/vault.yml" @@ -41,13 +41,14 @@ command_to_run: > ./bin/ytops-client stress-policy --policy {{ download_policy }} - {% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %} + {# {% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %} #} {% if download_min_seconds is defined %}--set 'settings.dummy_simulation_settings.download_min_seconds={{ download_min_seconds }}'{% endif %} {% if 
download_max_seconds is defined %}--set 'settings.dummy_simulation_settings.download_max_seconds={{ download_max_seconds }}'{% endif %} - {% if profile_prefix is defined %}--set 'execution_control.worker_pools=[{"profile_prefix": "{{ profile_prefix }}", "workers": 1}]'{% endif %} + {# --set 'execution_control.worker_pools=[{"profile_prefix": "{% if profile_prefix == 'worker' %}{{ display_prefix }}{% else %}{{ profile_prefix }}{% endif %}", "workers": 1}]' #} {% for setting in (extra_set_args | default('[]')) | from_yaml %}--set '{{ setting }}' {% endfor %} - --profile-prefix {{ profile_prefix }} - process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ download_policy }}.*--profile-prefix {{ profile_prefix }}" + {% if profile_prefix == 'worker' %}--workers 1{% endif %} + --profile-prefix {% if profile_prefix == 'worker' %}{{ display_prefix }}{% else %}{{ profile_prefix }}{% endif %} + process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ download_policy }}{% if profile_prefix == 'worker' %}.*--workers 1{% endif %}.*--profile-prefix {% if profile_prefix == 'worker' %}{{ display_prefix }}{% else %}{{ profile_prefix }}{% endif %}" start_process: "{{ start_download | default(false) | bool }}" stop_process: "{{ stop_download | default(false) | bool }}" check_status: "{{ vars.check_status | default(false) | bool }}" diff --git a/ansible/playbook-stress-init-redis.yml b/ansible/playbook-stress-init-redis.yml index e4e90f3..e30a7c8 100644 --- a/ansible/playbook-stress-init-redis.yml +++ b/ansible/playbook-stress-init-redis.yml @@ -15,16 +15,28 @@ with_fileglob: - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" tasks: - - name: Check if Redis is running + - name: Wait for Redis to be available + ansible.builtin.wait_for: + host: localhost + port: "{{ redis_port }}" + timeout: 60 + delay: 5 + register: redis_port_check + ignore_errors: yes + + - name: Check if Redis is running and responding to commands ansible.builtin.shell: - cmd: "redis-cli -h {{ hostvars[groups['master'][0]].ansible_host }} -p {{ redis_port }} {% if use_redis_password | default(true) | string | lower == 'true' %}-a {{ vault_redis_password }}{% endif %} ping 2>&1 | grep -q PONG" + cmd: "redis-cli -h localhost -p {{ redis_port }} {% if use_redis_password | default(true) | string | lower == 'true' %}-a {{ vault_redis_password }}{% endif %} ping 2>&1 | grep -q PONG" register: redis_check ignore_errors: yes changed_when: false + retries: 3 + delay: 5 + until: redis_check.rc == 0 - name: Ensure Redis is accessible ansible.builtin.fail: - msg: "Redis is not accessible on master node. Please ensure Redis service is running on {{ hostvars[groups['master'][0]].ansible_host }}:{{ redis_port }}" + msg: "Redis is not accessible on master node. 
Please ensure Redis service is running on localhost:{{ redis_port }}" when: redis_check.rc != 0 - name: Stop any running ytops-client processes on master @@ -44,7 +56,7 @@ --policy {{ setup_policy }} \ --cleanup-all environment: - REDIS_HOST: "{{ hostvars[groups['master'][0]].ansible_host }}" + REDIS_HOST: "localhost" REDIS_PORT: "{{ redis_port }}" REDIS_PASSWORD: "{{ vault_redis_password if use_redis_password | default(true) | string | lower == 'true' else '' }}" register: init_result diff --git a/ansible/playbook-stress-install-deps.yml b/ansible/playbook-stress-install-deps.yml index 663a421..762234e 100644 --- a/ansible/playbook-stress-install-deps.yml +++ b/ansible/playbook-stress-install-deps.yml @@ -37,18 +37,52 @@ ansible.builtin.set_fact: base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" - - name: Install required Python packages from requirements.txt + - name: Attempt to install required Python packages from requirements.txt + ansible.builtin.pip: + requirements: "{{ base_dir }}/ytops_client/requirements.txt" + extra_args: "--ignore-installed" + become: yes + register: pip_reqs_result + failed_when: false + + - name: Retry installing requirements with break-system-packages ansible.builtin.pip: requirements: "{{ base_dir }}/ytops_client/requirements.txt" extra_args: "--ignore-installed" become: yes environment: PIP_BREAK_SYSTEM_PACKAGES: "1" + register: pip_reqs_result_retry + when: pip_reqs_result.failed and 'externally-managed-environment' in pip_reqs_result.msg - - name: Explicitly install the thrift package + - name: Fail if requirements installation did not succeed + ansible.builtin.fail: + msg: "Failed to install requirements after retry. Last error: {{ pip_reqs_result_retry.msg | default(pip_reqs_result.msg) }}" + when: > + pip_reqs_result.failed and + (pip_reqs_result_retry is not defined or pip_reqs_result_retry.failed) + + - name: Attempt to explicitly install the thrift package + ansible.builtin.pip: + name: thrift + extra_args: "--ignore-installed" + become: yes + register: pip_thrift_result + failed_when: false + + - name: Retry installing thrift with break-system-packages ansible.builtin.pip: name: thrift extra_args: "--ignore-installed" become: yes environment: PIP_BREAK_SYSTEM_PACKAGES: "1" + register: pip_thrift_result_retry + when: pip_thrift_result.failed and 'externally-managed-environment' in pip_thrift_result.msg + + - name: Fail if thrift installation did not succeed + ansible.builtin.fail: + msg: "Failed to install thrift after retry. 
Last error: {{ pip_thrift_result_retry.msg | default(pip_thrift_result.msg) }}" + when: > + pip_thrift_result.failed and + (pip_thrift_result_retry is not defined or pip_thrift_result_retry.failed) diff --git a/ansible/playbook-stress-lifecycle.yml b/ansible/playbook-stress-lifecycle.yml index e798e5b..bd3c169 100644 --- a/ansible/playbook-stress-lifecycle.yml +++ b/ansible/playbook-stress-lifecycle.yml @@ -4,9 +4,15 @@ gather_facts: no vars: # Default action - action: "status" # Available actions: start, stop, status, start-auth, stop-auth, start-download, stop-download, stop-generator + action: "status" # Available actions: start, stop, status, start-auth, stop-auth, start-download, stop-download + graceful_shutdown_timeout_seconds: 30 tasks: + - name: "Ensure profile_prefixes is a flat list of all prefixes from profile_pools" + ansible.builtin.set_fact: + profile_prefixes: "{{ profile_pools | map(attribute='prefixes') | flatten }}" + when: profile_pools is defined + - name: "Start all configured generators and simulators" when: action == "start" block: @@ -25,7 +31,7 @@ --limit {{ inventory_hostname }} -e "start_generator=true" -e "profile_prefix={{ combined_prefixes }}" - {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} + -e "dummy_batch={{ dummy_batch | default(false) }}" {% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %} {% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %} {% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %} @@ -42,7 +48,7 @@ --limit {{ inventory_hostname }} -e "start_generator=true" -e "profile_prefix={{ item }}" - {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} + -e "dummy_batch={{ dummy_batch | default(false) }}" {% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %} {% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %} {% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %} @@ -56,42 +62,14 @@ label: "profile: {{ item }}" when: auth_workers_per_profile | default(0) | int > 0 - - name: "Start download simulator(s)" - when: profile_prefixes is defined and profile_prefixes | length > 0 - block: - - name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}" - ansible.builtin.command: >- - ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml - -i {{ inventory_file }} - --limit {{ inventory_hostname }} - -e "start_download=true" - -e "profile_prefix={{ combined_prefixes }}" - {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} - {% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %} - {% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %} - {% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %} - delegate_to: localhost - changed_when: true - when: (download_workers_per_profile | default(0) | int == 0) and (download_workers_total | default(0) | int > 0) + - name: "WORKAROUND: Align download worker config with auth worker config to bypass inventory bug" + ansible.builtin.set_fact: + download_workers_total: "{{ auth_workers_total | default(0) }}" + download_workers_per_profile: "{{ auth_workers_per_profile | default(0) }}" - - name: "Start parallel download simulators for each profile" - 
ansible.builtin.command: >- - ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml - -i {{ inventory_file }} - --limit {{ inventory_hostname }} - -e "start_download=true" - -e "profile_prefix={{ item }}" - {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} - {% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %} - {% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %} - {% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %} - delegate_to: localhost - changed_when: true - loop: "{{ profile_prefixes }}" - loop_control: - loop_var: item - label: "profile: {{ item }}" - when: download_workers_per_profile | default(0) | int > 0 + - name: "Start download simulator(s)" + ansible.builtin.include_tasks: tasks/start-download-simulators.yml + when: profile_prefixes is defined and profile_prefixes | length > 0 - name: "Start only auth generators on workers" when: action == "start-auth" @@ -111,7 +89,7 @@ --limit {{ inventory_hostname }} -e "start_generator=true" -e "profile_prefix={{ combined_prefixes }}" - {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} + -e "dummy_batch={{ dummy_batch | default(false) }}" {% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %} {% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %} {% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %} @@ -128,7 +106,7 @@ --limit {{ inventory_hostname }} -e "start_generator=true" -e "profile_prefix={{ item }}" - {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} + -e "dummy_batch={{ dummy_batch | default(false) }}" {% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %} {% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %} {% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %} @@ -145,112 +123,90 @@ - name: "Start only download simulators on workers" when: action == "start-download" block: + - name: "WORKAROUND: Align download worker config with auth worker config to bypass inventory bug" + ansible.builtin.set_fact: + download_workers_total: "{{ auth_workers_total | default(0) }}" + download_workers_per_profile: "{{ auth_workers_per_profile | default(0) }}" + - name: "Set combined profile prefixes string" ansible.builtin.set_fact: combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}" when: profile_prefixes is defined and profile_prefixes | length > 0 - name: "Start download simulator(s)" + ansible.builtin.include_tasks: tasks/start-download-simulators.yml when: profile_prefixes is defined and profile_prefixes | length > 0 - block: - - name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}" - ansible.builtin.command: >- - ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml - -i {{ inventory_file }} - --limit {{ inventory_hostname }} - -e "start_download=true" - -e "profile_prefix={{ combined_prefixes }}" - {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} - {% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %} - {% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %} - {% if extra_set_args is defined %}-e 
'extra_set_args={{ extra_set_args | to_json }}'{% endif %} - delegate_to: localhost - changed_when: true - when: (download_workers_per_profile | default(0) | int == 0) and (download_workers_total | default(0) | int > 0) - - name: "Start parallel download simulators for each profile" - ansible.builtin.command: >- - ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml - -i {{ inventory_file }} - --limit {{ inventory_hostname }} - -e "start_download=true" - -e "profile_prefix={{ item }}" - {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} - {% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %} - {% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %} - {% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %} - delegate_to: localhost - changed_when: true - loop: "{{ profile_prefixes }}" - loop_control: - loop_var: item - label: "profile: {{ item }}" - when: download_workers_per_profile | default(0) | int > 0 - - name: "Stop only auth generators on workers (via playbook call)" - when: action == "stop-generator" + - name: "Stop only auth generators on workers" + when: action == "stop-auth" block: - name: "Set combined profile prefixes string" ansible.builtin.set_fact: combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}" when: profile_prefixes is defined and profile_prefixes | length > 0 - - name: "Stop single auth generator for profiles: {{ combined_prefixes | default('none') }}" - ansible.builtin.command: >- - ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml - -i {{ inventory_file }} - --limit {{ inventory_hostname }} - -e "stop_generator=true" - -e "profile_prefix={{ combined_prefixes }}" - delegate_to: localhost - changed_when: true + - name: "Gracefully stop auth generator(s) via playbook call" when: profile_prefixes is defined and profile_prefixes | length > 0 + block: + - name: "Stop single auth generator for all profiles: {{ combined_prefixes | default('none') }}" + ansible.builtin.command: >- + ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml + -i {{ inventory_file }} + --limit {{ inventory_hostname }} + -e "stop_generator=true" + -e "profile_prefix={{ combined_prefixes }}" + delegate_to: localhost + changed_when: true + when: (auth_workers_per_profile | default(0) | int == 0) and (auth_workers_total | default(0) | int > 0) - - name: "Stop only auth generators on workers" - when: action == "stop-auth" - block: - - name: Kill all auth generator tmux sessions on this worker - ansible.builtin.shell: - cmd: | - for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-auth-"); do - tmux kill-session -t "$session" - done || true - ignore_errors: yes - changed_when: false - - - name: Kill all ytops-client auth generator processes on this worker - ansible.builtin.shell: - cmd: | - # Gracefully terminate - ps aux | grep "[y]tops-client.*stress-policy.*12_queue_auth_simulation" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true - sleep 0.5 - # Force kill - ps aux | grep "[y]tops-client.*stress-policy.*12_queue_auth_simulation" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true - ignore_errors: yes - changed_when: false + - name: "Stop parallel auth generators for each profile" + ansible.builtin.command: >- + ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml + -i {{ inventory_file }} + 
--limit {{ inventory_hostname }} + -e "stop_generator=true" + -e "profile_prefix={{ item }}" + delegate_to: localhost + changed_when: true + loop: "{{ profile_prefixes }}" + loop_control: + loop_var: item + label: "profile: {{ item }}" + when: auth_workers_per_profile | default(0) | int > 0 - name: "Stop only download simulators on workers" when: action == "stop-download" block: - - name: Kill all download simulator tmux sessions on this worker + - name: "WORKAROUND: Align download worker config with auth worker config to bypass inventory bug" + ansible.builtin.set_fact: + download_workers_total: "{{ auth_workers_total | default(0) }}" + download_workers_per_profile: "{{ auth_workers_per_profile | default(0) }}" + + - name: "Set combined profile prefixes string" + ansible.builtin.set_fact: + combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}" + when: profile_prefixes is defined and profile_prefixes | length > 0 + + - name: "Stop single download simulator group" ansible.builtin.shell: cmd: | - for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-download-"); do + for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E '^stress-download-worker-[0-9]+$'); do tmux kill-session -t "$session" done || true + when: download_workers_total | default(0) | int > 0 + changed_when: true ignore_errors: yes - changed_when: false - - name: Kill all ytops-client download simulator processes on this worker - ansible.builtin.shell: - cmd: | - # Gracefully terminate - ps aux | grep "[y]tops-client.*stress-policy.*11_direct_docker_download_simulation" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true - sleep 0.5 - # Force kill - ps aux | grep "[y]tops-client.*stress-policy.*11_direct_docker_download_simulation" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true + - name: "Stop parallel download simulators for each profile" + ansible.builtin.command: "tmux kill-session -t stress-download-{{ item }}" + loop: "{{ profile_prefixes }}" + loop_control: + loop_var: item + label: "profile: {{ item }}" + when: (download_workers_total | default(0) | int == 0) and (download_workers_per_profile | default(0) | int > 0) + changed_when: true ignore_errors: yes - changed_when: false - name: "Stop all worker generators and simulators" when: action == "stop" diff --git a/ansible/playbook-stress-sync-policies.yml b/ansible/playbook-stress-sync-policies.yml index 6ffa7d5..1dd942f 100644 --- a/ansible/playbook-stress-sync-policies.yml +++ b/ansible/playbook-stress-sync-policies.yml @@ -3,7 +3,7 @@ hosts: all gather_facts: no vars: - ytops_source_dir: "{{ playbook_dir }}/../ytops_client-source" + ytops_source_dir: "{{ source_base_dir | default(playbook_dir + '/../ytops_client-source') }}" vars_files: - "group_vars/all/vault.yml" pre_tasks: diff --git a/ansible/playbook-update-yt-dlp-docker.yml b/ansible/playbook-update-yt-dlp-docker.yml new file mode 100644 index 0000000..e017786 --- /dev/null +++ b/ansible/playbook-update-yt-dlp-docker.yml @@ -0,0 +1,30 @@ +--- +- name: "STRESS-SETUP: Build and push yt-dlp docker image" + hosts: workers + gather_facts: no + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" + tasks: + - name: "Build and push yt-dlp image on worker" + ansible.builtin.shell: + cmd: | + cd {{ airflow_worker_dir }} + if [ -f .env ]; then + set -a && . ./.env && set +a + fi + ./bin/build-yt-dlp-image + register: build_output + changed_when: true + + - name: "Display build output" + ansible.builtin.debug: + var: build_output.stdout_lines + when: build_output.stdout_lines is defined diff --git a/ansible/tasks/start-download-simulators.yml b/ansible/tasks/start-download-simulators.yml new file mode 100644 index 0000000..30b1f0a --- /dev/null +++ b/ansible/tasks/start-download-simulators.yml @@ -0,0 +1,105 @@ +--- +- name: "Scale down excess download workers if necessary" + when: (download_workers_per_profile | default(0) | int == 0) and (download_workers_total | default(0) | int > 0) + block: + - name: "Find running download simulator tmux sessions for this group" + ansible.builtin.shell: + cmd: "tmux list-sessions -F '#{session_name}' 2>/dev/null | grep -E '^stress-download-worker-[0-9]+$' || true" + register: running_sessions + changed_when: false + ignore_errors: yes + + - name: "Identify excess download simulator sessions to stop" + ansible.builtin.set_fact: + excess_sessions_to_stop: "{{ excess_sessions_to_stop | default([]) + [item] }}" + vars: + worker_num_str: "{{ item | regex_replace('^stress-download-worker-', '') }}" + when: worker_num_str is number and (worker_num_str | int > (download_workers_total | int)) + loop: "{{ running_sessions.stdout_lines }}" + loop_control: + label: "Identifying excess session: {{ item }}" + + - name: "Get PIDs for excess download workers" + ansible.builtin.shell: + cmd: | + PANE_PID=$(tmux list-panes -s "{{ item }}" -F '#{pane_pid}' | head -n 1) + if [ -n "$PANE_PID" ]; then + pgrep -P "$PANE_PID" || true + fi + register: excess_pids_raw + loop: "{{ excess_sessions_to_stop | default([]) }}" + changed_when: false + ignore_errors: true + + - name: "Set fact for PIDs to kill" + ansible.builtin.set_fact: + pids_to_kill_gracefully: "{{ excess_pids_raw.results | map(attribute='stdout') | reject('==', '') | list }}" + + - name: "Gracefully terminate excess download workers" + ansible.builtin.shell: + cmd: "kill {{ item }}" + loop: "{{ pids_to_kill_gracefully | default([]) }}" + when: pids_to_kill_gracefully is defined and pids_to_kill_gracefully | length > 0 + ignore_errors: true + changed_when: false + + - name: "Wait for graceful shutdown of excess workers" + ansible.builtin.pause: + seconds: "{{ graceful_shutdown_timeout_seconds }}" + when: pids_to_kill_gracefully is defined and pids_to_kill_gracefully | length > 0 + + - name: "Force kill any lingering excess workers" + ansible.builtin.shell: + cmd: "kill -9 {{ item }}" + loop: "{{ pids_to_kill_gracefully | default([]) }}" + when: pids_to_kill_gracefully is defined and pids_to_kill_gracefully | length > 0 + ignore_errors: true + changed_when: false + + - name: "Kill tmux sessions for excess workers" + ansible.builtin.shell: + cmd: "tmux kill-session -t {{ item }}" + loop: "{{ excess_sessions_to_stop | default([]) }}" + when: excess_sessions_to_stop is defined and excess_sessions_to_stop | length > 0 + ignore_errors: true + changed_when: false + +- name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}" + ansible.builtin.command: >- + ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml + -i {{ inventory_file }} + --limit {{ inventory_hostname }} + -e "start_download=true" + -e 
"profile_prefix=worker" + -e "display_prefix={{ combined_prefixes }}" + -e "worker_num={{ worker_num }}" + {% if dummy_batch | default(false) %}-e "dummy_batch=true"{% endif %} + {% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %} + {% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %} + {% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %} + delegate_to: localhost + changed_when: true + when: download_workers_total | default(0) | int > 0 + loop: "{{ range(1, (download_workers_total | default(1) | int) + 1) | list }}" + loop_control: + loop_var: worker_num + label: "worker {{ worker_num }}" + +- name: "Start parallel download simulators for each profile" + ansible.builtin.command: >- + ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml + -i {{ inventory_file }} + --limit {{ inventory_hostname }} + -e "start_download=true" + -e "profile_prefix={{ item }}" + {% if dummy_batch | default(false) %}-e "dummy_batch=true"{% endif %} + {% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %} + {% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %} + {% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %} + delegate_to: localhost + changed_when: true + loop: "{{ profile_prefixes }}" + loop_control: + loop_var: item + label: "profile: {{ item }}" + when: (download_workers_total | default(0) | int == 0) and (download_workers_per_profile | default(0) | int > 0) diff --git a/ansible/templates/.env.stress.j2 b/ansible/templates/.env.stress.j2 index 7fa113e..fe4608c 100644 --- a/ansible/templates/.env.stress.j2 +++ b/ansible/templates/.env.stress.j2 @@ -15,3 +15,7 @@ AWS_REGION={{ vault_s3_delivery_aws_region }} ACCOUNT_ACTIVE_DURATION_MIN=7 ACCOUNT_COOLDOWN_DURATION_MIN=30 STRESS_POLICY_INBOX_QUEUE=dev_stress_inbox + +# --- Stress Test Environment Names --- +YTOPS_AUTH_ENV={{ stress_auth_env | default('sim_auth') }} +YTOPS_DOWNLOAD_ENV={{ stress_download_env | default('sim_download') }} diff --git a/tools/generate-profile-setup-policy.py b/tools/generate-profile-setup-policy.py index 5757345..4ced957 100755 --- a/tools/generate-profile-setup-policy.py +++ b/tools/generate-profile-setup-policy.py @@ -121,20 +121,214 @@ def generate_policy(cluster_config, output_path): print(f"Successfully generated profile setup policy at: {output_path}") +def generate_enforcer_policy(cluster_config, output_path): + """Generate the enforcer policy file.""" + all_workers = cluster_config.get('workers', {}) + + enforcement_pools = [] + for worker_name, worker_config in sorted(all_workers.items()): + all_prefixes = [] + for pool in worker_config.get('profile_pools', []): + all_prefixes.extend(pool.get('prefixes', [])) + + if not all_prefixes: + continue + + pool_entry = OrderedDict([ + ('name', f"server_{worker_name}"), + ('profile_group_patterns', sorted(list(set(all_prefixes)))), + ('max_active_profiles', 1) + ]) + enforcement_pools.append(pool_entry) + + with open(output_path, 'w') as f: + f.write("# Policy for the unified simulation enforcer.\n") + f.write("# This file is used by `bin/ytops-client policy-enforcer --live` to manage\n") + f.write("# both the authentication and download simulation environments from a single process.\n\n") + f.write("# !!! 
THIS FILE IS AUTO-GENERATED by tools/generate-profile-setup-policy.py !!!\n") + f.write("# !!! DO NOT EDIT. Your changes will be overwritten. !!!\n") + f.write("# !!! Edit cluster.green.yml and re-run the generator instead. !!!\n\n") + + f.write("simulation_parameters:\n") + f.write(" # --- Common Redis settings for all tools ---\n") + f.write(" # The enforcer will connect to two different Redis environments (key prefixes)\n") + f.write(" # based on these settings, applying the corresponding policies to each.\n") + f.write(' env_file: ".env"\n') + f.write(' auth_env: "sim_auth"\n') + f.write(' download_env: "sim_download"\n') + f.write(" \n") + f.write(" # How often the enforcer should wake up and apply all policies.\n") + f.write(" interval_seconds: 2\n\n") + + f.write("# --- Common & Pool-specific Settings ---\n") + f.write("# Common settings are applied to all profile groups discovered via the pools below.\n") + f.write("# A pool can optionally override these settings by defining its own 'group_settings' block.\n") + f.write("common_group_settings:\n") + f.write(" auth:\n") + f.write(" max_active_profiles: 1\n") + f.write(" rotate_after_requests: 5\n") + f.write(" rest_duration_minutes_on_rotation: 0.20\n") + f.write(" wait_download_finish_per_group: true\n") + f.write(" max_wait_for_downloads_minutes: 240\n") + f.write(" download:\n") + f.write(" max_active_profiles: 1\n") + f.write(" rotate_after_requests: 0\n") + f.write(" rest_duration_minutes_on_rotation: 0.2\n\n") + + f.write("# Defines pools of profile groups with their own concurrency limits.\n") + f.write("enforcement_pools:\n") + + for pool in enforcement_pools: + f.write(f' - name: "{pool["name"]}"\n') + patterns_str = ", ".join([f'"{p}"' for p in pool['profile_group_patterns']]) + f.write(f' profile_group_patterns: [{patterns_str}]\n') + f.write(f' max_active_profiles: {pool["max_active_profiles"]}\n') + + rest_of_file = """ +# --- Policies for the Authentication Simulation --- +auth_policy_enforcer_config: + + # Ban if 2 failures occur within a 1-minute window. + #ban_on_failures: 2 + #ban_on_failures_window_minutes: 1 + + # The standard rest policy is disabled, as rotation is handled by the profile group. + + # New rate limit policy to enforce requests-per-hour limits. + # For guest sessions, the limit is ~300 videos/hour. + rate_limit_requests: 0 + rate_limit_window_minutes: 60 + rate_limit_rest_duration_minutes: 5 + + rest_after_requests: 0 + rest_duration_minutes: 10 + + # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest + # sessions is ~300 videos/hour (~1000 webpage/player requests per hour). + # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour). + # The settings below should be configured to respect these limits. + + # New setting for load balancing across profile groups. + # "longest_idle": Activates the profile that has been idle the longest across all groups (based on last_used time). + # This is a global FIFO strategy that effectively cycles through profiles regardless of their group. + # "least_loaded": Prioritizes activating a profile from the group with the fewest pending downloads. + # If multiple groups have zero pending downloads, it acts as a FIFO queue, activating + # the one that finished its last download batch the earliest. This is useful when you want + # to ensure a group finishes its entire workload before another group starts. 
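+  # If this key is left unset, the enforcer falls back to a plain FIFO sort by
+  # rest-period end time (rest_until), breaking ties by profile name.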
+ profile_selection_strategy: "longest_idle" + + # The 'global_max_active_profiles' setting is now superseded by the per-pool limits + # defined in the 'enforcement_pools' section. + + # The 'profile_groups' section is now inherited from 'profile_group_definitions' above. + # The enforcer logic should be updated to read from there. + + proxy_work_minutes: 0 + proxy_rest_duration_minutes: 0 + + # Global maximum time a proxy can be active before being rested, regardless of + # other rules. Acts as a safety net. Set to 0 to disable. + max_global_proxy_active_minutes: 0 + rest_duration_on_max_active: 10 + + # Proxy-level ban on failure burst is disabled. + proxy_ban_on_failures: 0 + proxy_ban_window_minutes: 2 + + # Clean up locks held for more than 16 minutes (960s) to prevent stuck workers. + # This should be longer than the docker container timeout (15m). + unlock_stale_locks_after_seconds: 960 + + # A short post-task cooldown for auth simulation profiles. When a batch is finished, + # the profile is put into COOLDOWN briefly. This prevents a worker from immediately + # re-locking the same profile, giving the policy enforcer a window to perform rotation. + unlock_cooldown_seconds: 0 + +# --- Cross-simulation synchronization --- +# This section is simplified because the link between auth and download profiles +# is now defined in the `profile_group_definitions`. +cross_simulation_sync: + # Which states to synchronize from auth to download. + sync_states: + - "BANNED" + # If true, a BANNED state on an auth profile will force the download profile to also be BANNED. + enforce_auth_lead: true + # CRITICAL: Ensures the correct download profile GROUP is active. + sync_active_profile: true + # When an auth profile is in the 'waiting_downloads' state, ensure the matching download profile is active. + sync_waiting_downloads: true + +# --- Policies for the Download Simulation --- +download_policy_enforcer_config: + + # Ban if 1 failure occurs within a 1-minute window. + ban_on_failures: 1 + ban_on_failures_window_minutes: 1 + + # Standard rest policy is disabled in favor of group rotation. + + # New rate limit policy to enforce requests-per-hour limits. + # For guest sessions, the limit is ~300 videos/hour. We set it slightly lower to be safe. + rate_limit_requests: 280 + rate_limit_window_minutes: 60 + rate_limit_rest_duration_minutes: 5 + rest_after_requests: 0 + rest_duration_minutes: 20 + + # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest + # sessions is ~300 videos/hour (~1000 webpage/player requests per hour). + # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour). + # The settings below should be configured to respect these limits. + + # The 'profile_groups' section is now inherited from 'profile_group_definitions' above. + # The enforcer logic should be updated to read from there. + + # Time-based proxy rules are disabled. + proxy_work_minutes: 0 + proxy_rest_duration_minutes: 10 + + # Global maximum time a proxy can be active before being rested, regardless of + # other rules. Acts as a safety net. Set to 0 to disable. + max_global_proxy_active_minutes: 0 + rest_duration_on_max_active: 10 + + # Proxy-level ban on failure burst is disabled. + proxy_ban_on_failures: 3 + proxy_ban_window_minutes: 1 + + # Clean up download locks held for more than 16 minutes (960s) to allow for long downloads. + # This should be longer than the docker container timeout (15m). 
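+  # (960 s = 16 minutes, matching the stale-lock window configured for the auth simulation above.)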
+ unlock_stale_locks_after_seconds: 960 + + # After a profile is used for a download, unlock it but put it in COOLDOWN + # state for 2-3s. This is enforced by the worker, which reads this config from Redis. + unlock_cooldown_seconds: [2, 3] +""" + with open(output_path, 'a') as f: + f.write(rest_of_file) + + print(f"Successfully generated enforcer policy at: {output_path}") + + def main(): - if len(sys.argv) != 3: - print("Usage: ./tools/generate-profile-setup-policy.py ") + if len(sys.argv) < 3 or len(sys.argv) > 4: + print("Usage: ./tools/generate-profile-setup-policy.py []") sys.exit(1) - + config_path = sys.argv[1] - output_path = sys.argv[2] - + profile_output_path = sys.argv[2] + if not os.path.exists(config_path): print(f"Error: Cluster configuration file not found at '{config_path}'", file=sys.stderr) sys.exit(1) - + cluster_config = load_cluster_config(config_path) - generate_policy(cluster_config, output_path) + generate_policy(cluster_config, profile_output_path) + + if len(sys.argv) == 4: + enforcer_output_path = sys.argv[3] + generate_enforcer_policy(cluster_config, enforcer_output_path) + if __name__ == "__main__": main() diff --git a/ytops_client-source/policies/12_queue_auth_simulation.yaml b/ytops_client-source/policies/12_queue_auth_simulation.yaml index 96b5db4..f6c8dad 100644 --- a/ytops_client-source/policies/12_queue_auth_simulation.yaml +++ b/ytops_client-source/policies/12_queue_auth_simulation.yaml @@ -205,6 +205,18 @@ queue_policy: # Example: formats_to_download: "all" # Example: formats_to_download: ["140-dashy", "299-dashy"] formats_to_download: "from_download_policy" + + # Whether to report completion back to a queue. Always reported for auth. + report_completion: true + + # Queue to report completion to + completion_queue: "queue2_auth_completed" + + # Queue to report failures to + failure_queue: "queue2_auth_fail" + + # Queue to report skipped tasks to + skipped_queue: "queue2_auth_skipped" simulation_parameters: auth_env: "sim_auth" diff --git a/ytops_client-source/policies/8_unified_simulation_enforcer.yaml b/ytops_client-source/policies/8_unified_simulation_enforcer.yaml index 348ec32..b72d153 100644 --- a/ytops_client-source/policies/8_unified_simulation_enforcer.yaml +++ b/ytops_client-source/policies/8_unified_simulation_enforcer.yaml @@ -13,22 +13,29 @@ simulation_parameters: # How often the enforcer should wake up and apply all policies. interval_seconds: 2 -# --- Dynamic Profile Group Templates --- -# The policy enforcer will find all profile prefixes matching a pattern in Redis -# and apply the settings from the matching template. This avoids having to list -# every profile group manually. -# NOTE: The policy enforcer tool must be updated to support this format. -profile_group_templates: - - pattern: "user*" - auth: - max_active_profiles: 1 - rotate_after_requests: 5 - rest_duration_minutes_on_rotation: 0.20 - wait_download_finish_per_group: true - max_wait_for_downloads_minutes: 240 - download: - rotate_after_requests: 0 - rest_duration_minutes_on_rotation: 0.2 +# --- Common & Pool-specific Settings --- +# Common settings are applied to all profile groups discovered via the pools below. +# A pool can optionally override these settings by defining its own 'group_settings' block. 
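+# A sketch of such an override, shown commented out; the values below are illustrative only:
+#   enforcement_pools:
+#     - name: "server_dl003_pool"
+#       profile_group_patterns: ["user31", "user32"]
+#       max_active_profiles: 1
+#       group_settings:
+#         auth:
+#           rotate_after_requests: 10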
+common_group_settings: + auth: + max_active_profiles: 1 + rotate_after_requests: 5 + rest_duration_minutes_on_rotation: 0.20 + wait_download_finish_per_group: true + max_wait_for_downloads_minutes: 240 + download: + max_active_profiles: 1 + rotate_after_requests: 0 + rest_duration_minutes_on_rotation: 0.2 + +# Defines pools of profile groups with their own concurrency limits. +enforcement_pools: + - name: "server_dl003_pool" + profile_group_patterns: ["user31", "user32"] + max_active_profiles: 1 + - name: "server_dl006_pool" + profile_group_patterns: ["user61", "user62"] + max_active_profiles: 1 # --- Policies for the Authentication Simulation --- auth_policy_enforcer_config: @@ -62,9 +69,8 @@ auth_policy_enforcer_config: # to ensure a group finishes its entire workload before another group starts. profile_selection_strategy: "longest_idle" - # Enforce a total limit of active profiles across all groups defined below. - # Set to 1 to ensure only one group's profile is active at any time. - global_max_active_profiles: 1 + # The 'global_max_active_profiles' setting is now superseded by the per-pool limits + # defined in the 'enforcement_pools' section. # The 'profile_groups' section is now inherited from 'profile_group_definitions' above. # The enforcer logic should be updated to read from there. @@ -106,6 +112,7 @@ cross_simulation_sync: # --- Policies for the Download Simulation --- download_policy_enforcer_config: + # Ban if 1 failure occurs within a 1-minute window. ban_on_failures: 1 ban_on_failures_window_minutes: 1 diff --git a/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml b/ytops_client-source/policies/unused/10_direct_docker_auth_simulation.yaml similarity index 100% rename from ytops_client-source/policies/10_direct_docker_auth_simulation.yaml rename to ytops_client-source/policies/unused/10_direct_docker_auth_simulation.yaml diff --git a/ytops_client-source/policies/13_queue_download_simulation.yaml b/ytops_client-source/policies/unused/13_queue_download_simulation.yaml similarity index 100% rename from ytops_client-source/policies/13_queue_download_simulation.yaml rename to ytops_client-source/policies/unused/13_queue_download_simulation.yaml diff --git a/ytops_client-source/ytops_client/policy_enforcer_tool.py b/ytops_client-source/ytops_client/policy_enforcer_tool.py index 4ec1d38..37f1c83 100644 --- a/ytops_client-source/ytops_client/policy_enforcer_tool.py +++ b/ytops_client-source/ytops_client/policy_enforcer_tool.py @@ -68,6 +68,9 @@ class PolicyEnforcer: all_profiles_list = self.manager.list_profiles() all_profiles_map = {p['name']: p for p in all_profiles_list} + # Sync profile states from their assigned proxy's state (e.g., if proxy is BANNED, ban profile). + self.enforce_proxy_state_on_profiles(all_profiles_list, all_profiles_map) + # Apply profile group policies (rotation, max_active). This will modify the local `all_profiles_map`. 
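+        # getattr() falls back to an empty list when no profile groups are configured on args.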
self.enforce_profile_group_policies(getattr(args, 'profile_groups', []), all_profiles_map, args) @@ -197,12 +200,26 @@ class PolicyEnforcer: live_active_counts[group_name] = count logger.debug(f"Initial live active counts: {live_active_counts}") - # --- New Global Max Active Logic --- + # --- New Enforcement Pool and Global Max Active Logic --- + enforcement_pools = getattr(args, 'enforcement_pools', []) + live_pool_active_counts = {} + if enforcement_pools: + for i, pool in enumerate(enforcement_pools): + pool_name = pool.get('name', f'pool_{i}') + live_pool_active_counts[pool_name] = 0 + + for group_name, count in live_active_counts.items(): + group_policy = next((g for g in profile_groups if g.get('name') == group_name), {}) + pool_name = group_policy.get('pool_name') + if pool_name: + live_pool_active_counts[pool_name] = live_pool_active_counts.get(pool_name, 0) + count + logger.debug(f"Initial live pool active counts: {live_pool_active_counts}") + global_max_active = getattr(args, 'global_max_active_profiles', 0) live_global_active_count = sum(live_active_counts.values()) if global_max_active > 0: logger.debug(f"Enforcing global max active profiles limit of {global_max_active}. Current global active: {live_global_active_count}") - # --- End New Global Logic --- + # --- End New Logic --- # --- End group logic setup --- @@ -278,7 +295,7 @@ class PolicyEnforcer: # --- End new logic --- # --- New Sorting Logic based on Profile Selection Strategy --- - strategy = getattr(args, 'profile_selection_strategy', 'longest_idle') + strategy = getattr(args, 'profile_selection_strategy', None) if strategy == 'least_loaded' and profile_groups: logger.debug("Applying 'least_loaded' profile selection strategy.") # Separate profiles that are ready from those that are not @@ -345,10 +362,8 @@ class PolicyEnforcer: profiles_to_check = sorted_ready_profiles + not_ready_profiles logger.debug(f"Activation candidates for 'least_loaded' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}") - else: # Default 'longest_idle' sort - if strategy not in ['longest_idle']: - logger.warning(f"Unknown or unhandled profile_selection_strategy '{strategy}'. Defaulting to 'longest_idle'.") - + elif strategy == 'longest_idle': + logger.debug("Applying 'longest_idle' profile selection strategy.") # Separate profiles that are ready to be activated from those still resting. # A profile waiting for downloads is NOT considered ready for activation. ready_profiles = [ @@ -369,6 +384,11 @@ class PolicyEnforcer: # The final list to check will process all ready profiles first, then wait for the not-ready ones. profiles_to_check = ready_profiles + not_ready_profiles logger.debug(f"Activation candidates for 'longest_idle' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}") + else: # Default sort (no strategy) + if strategy: # Log a warning if an unknown strategy was provided + logger.warning(f"Unknown profile_selection_strategy '{strategy}'. Using default FIFO sort by rest time.") + # Default to a simple FIFO sort based on when their rest period ends. 
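+            # ('longest_idle' and 'least_loaded' are the only strategies handled above;
+            # any other value, or no value at all, falls through to this ordering.)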
+ profiles_to_check.sort(key=lambda p: (p.get('rest_until', 0), natural_sort_key(p.get('name', '')))) # --- End New Sorting Logic --- # --- New logic: Identify groups with waiting profiles --- @@ -490,13 +510,30 @@ class PolicyEnforcer: profile_name = profile['name'] group_name = profile_to_group_map.get(profile_name) - # --- New Global Max Active Check --- - # This check prevents NEW profiles (in RESTING state) from becoming active if the global limit is reached. - # It allows COOLDOWN profiles to become active, as they are already part of the active count. - if global_max_active > 0 and live_global_active_count >= global_max_active and profile['state'] == ProfileState.RESTING.value: - logger.debug(f"Profile '{profile_name}' rest ended, but global max active limit ({global_max_active}) has been reached. Deferring activation.") - continue - # --- End New Global Check --- + # --- New Pool and Global Max Active Check --- + is_new_activation = profile['state'] == ProfileState.RESTING.value + if is_new_activation: + # Check pool limits first + if enforcement_pools and group_name: + group_policy = next((g for g in profile_groups if g.get('name') == group_name), {}) + pool_name = group_policy.get('pool_name') + pool_config = None + for i, p in enumerate(enforcement_pools): + if p.get('name', f'pool_{i}') == pool_name: + pool_config = p + break + if pool_config: + pool_max_active = pool_config.get('max_active_profiles', 0) + current_pool_active = live_pool_active_counts.get(pool_name, 0) + if pool_max_active > 0 and current_pool_active >= pool_max_active: + logger.debug(f"Profile '{profile_name}' rest ended, but pool '{pool_name}' max active limit ({pool_max_active}) has been reached. Deferring activation.") + continue + + # Then check global limit if it's still configured (for backward compatibility) + if global_max_active > 0 and live_global_active_count >= global_max_active: + logger.debug(f"Profile '{profile_name}' rest ended, but global max active limit ({global_max_active}) has been reached. Deferring activation.") + continue + # --- End New Check --- # --- Group-aware unrest check --- if group_name: @@ -577,11 +614,13 @@ class PolicyEnforcer: continue # Skip activation for this profile # --- End group check --- - # Before activating, ensure the profile's proxy is not resting. + # Before activating, ensure the profile's proxy is not resting or banned. proxy_url = profile.get('proxy') if proxy_url: proxy_state_data = proxy_states.get(proxy_url, {}) - if proxy_state_data.get('state') == ProfileState.RESTING.value: + proxy_state = proxy_state_data.get('state') + + if proxy_state == ProfileState.RESTING.value: logger.debug(f"Profile '{profile['name']}' rest period ended, but its proxy '{proxy_url}' is still resting. Deferring activation.") # Update reason for clarity in the UI when a profile is blocked by its proxy. @@ -595,11 +634,33 @@ class PolicyEnforcer: all_profiles_map[profile_name]['rest_reason'] = new_reason continue # Do not activate this profile yet. + + elif proxy_state == ProfileState.BANNED.value and profile['state'] != ProfileState.BANNED.value: + # This profile is about to be activated, but its proxy is banned. Ban it. 
+ reason = f"Proxy '{proxy_url}' is BANNED" + logger.warning(f"Banning profile '{profile['name']}' because its proxy is banned: {reason}") + self.actions_taken_this_cycle += 1 + if not self.dry_run: + sm = self.manager.get_state_machine(profile_name) + if sm: + sm.ban(reason=reason) + # Update local map + all_profiles_map[profile_name]['state'] = ProfileState.BANNED.value + all_profiles_map[profile_name]['reason'] = reason + continue # Skip activation; it is now banned. - # Update group counter BEFORE making any changes, so subsequent checks in this cycle use the updated count + # Update group and pool counters BEFORE making any changes, so subsequent checks in this cycle use the updated count if group_name and profile['state'] == ProfileState.RESTING.value: # For RESTING profiles, they're becoming active, so increment the count live_active_counts[group_name] = live_active_counts.get(group_name, 0) + 1 + + # Also increment the pool counter + if enforcement_pools: + group_policy = next((g for g in profile_groups if g.get('name') == group_name), {}) + pool_name = group_policy.get('pool_name') + if pool_name: + live_pool_active_counts[pool_name] = live_pool_active_counts.get(pool_name, 0) + 1 + # Also increment the global counter if global_max_active > 0: live_global_active_count += 1 @@ -903,20 +964,68 @@ class PolicyEnforcer: if num_active_or_locked == 0: logger.debug(f"Group '{group_name}' has no active profiles. `enforce_unrest_policy` will attempt to activate one.") - # --- 4. Global Self-Healing: Enforce global_max_active_profiles --- - # This runs after all per-group healing and ensures the global limit is respected. + # --- 4. Pool and Global Self-Healing --- + enforcement_pools = getattr(args, 'enforcement_pools', []) + if enforcement_pools: + for i, pool in enumerate(enforcement_pools): + pool_name = pool.get('name', f'pool_{i}') + pool_max_active = pool.get('max_active_profiles', 0) + if not pool_max_active or pool_max_active <= 0: + continue + + # Get all profile names belonging to this pool + pool_profile_names = set() + for group in profile_groups: + if group.get('pool_name') == pool_name: + prefix = group.get('prefix') + if prefix: + for p_name in all_profiles_map: + if p_name.startswith(prefix): + pool_profile_names.add(p_name) + + # Get current active count for this pool from our local map + current_pool_active = [ + p for name, p in all_profiles_map.items() + if name in pool_profile_names and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value, ProfileState.COOLDOWN.value] + ] + + num_pool_active = len(current_pool_active) + if num_pool_active > pool_max_active: + logger.warning(f"Pool Healing ('{pool_name}'): Found {num_pool_active} active profiles, but pool max is {pool_max_active}. 
Resting excess.") + + profiles_that_can_be_rested = [p for p in current_pool_active if p['state'] == ProfileState.ACTIVE.value] + profiles_that_can_be_rested.sort(key=lambda p: natural_sort_key(p.get('name', '')), reverse=True) + profiles_that_can_be_rested.sort(key=lambda p: ( + p.get('success_count', 0) + p.get('failure_count', 0) + + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0) + ), reverse=True) + + num_to_rest = num_pool_active - pool_max_active + for profile in profiles_that_can_be_rested[:num_to_rest]: + logger.warning(f"Pool Healing ('{pool_name}'): Resting profile '{profile['name']}'.") + self.actions_taken_this_cycle += 1 + if not self.dry_run: + sm = self.manager.get_state_machine(profile['name']) + if sm: + sm.rest(reason=f"Pool '{pool_name}' max_active healing", duration_minutes=0.02) + + all_profiles_map[profile['name']]['state'] = ProfileState.RESTING.value + all_profiles_map[profile['name']]['rest_reason'] = f"Pool '{pool_name}' max_active healing" + all_profiles_map[profile['name']]['rest_until'] = time.time() + (0.02 * 60) + + # For backward compatibility, also enforce global_max_active_profiles if it is set. global_max_active = getattr(args, 'global_max_active_profiles', 0) if global_max_active > 0: # Get all profiles managed by any group all_grouped_profiles = set() for group in profile_groups: - profiles_in_group = set() - if 'profiles' in group: - profiles_in_group = set(group['profiles']) - elif 'prefix' in group: + if 'prefix' in group: prefix = group['prefix'] - profiles_in_group = {p['name'] for p in all_profiles_list if p['name'].startswith(prefix)} - all_grouped_profiles.update(profiles_in_group) + for p_name in all_profiles_map: + if p_name.startswith(prefix): + all_grouped_profiles.add(p_name) + elif 'profiles' in group: + all_grouped_profiles.update(group['profiles']) # Get current active count across all groups from our local map current_global_active = [ @@ -928,29 +1037,23 @@ class PolicyEnforcer: if num_global_active > global_max_active: logger.warning(f"Global Healing: Found {num_global_active} active profiles across all groups, but global max is {global_max_active}. Resting excess.") - # We can only rest profiles that are in the ACTIVE state, not LOCKED. profiles_that_can_be_rested = [p for p in current_global_active if p['state'] == ProfileState.ACTIVE.value] - - # Sort to determine which profiles to rest, using the same logic as per-group healing. - profiles_that_can_be_rested.sort(key=lambda p: natural_sort_key(p.get('name', '')), reverse=True) # Higher name first + profiles_that_can_be_rested.sort(key=lambda p: natural_sort_key(p.get('name', '')), reverse=True) profiles_that_can_be_rested.sort(key=lambda p: ( p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0) - ), reverse=True) # Most requests first + ), reverse=True) num_to_rest = num_global_active - global_max_active - profiles_to_rest = profiles_that_can_be_rested[:num_to_rest] - for profile in profiles_to_rest: + for profile in profiles_that_can_be_rested[:num_to_rest]: logger.warning(f"Global Healing: Resting profile '{profile['name']}'.") self.actions_taken_this_cycle += 1 if not self.dry_run: sm = self.manager.get_state_machine(profile['name']) if sm: - # Rest for a minimal duration to prevent immediate re-activation in the same cycle. 
- sm.rest(reason="Global max_active healing", duration_minutes=0.02) # ~1.2 seconds + sm.rest(reason="Global max_active healing", duration_minutes=0.02) - # Update local map to reflect the change for this cycle all_profiles_map[profile['name']]['state'] = ProfileState.RESTING.value all_profiles_map[profile['name']]['rest_reason'] = "Global max_active healing" all_profiles_map[profile['name']]['rest_until'] = time.time() + (0.02 * 60) @@ -1062,6 +1165,10 @@ class PolicyEnforcer: for proxy_url, state_data in proxy_states.items(): state = state_data.get('state', ProfileState.ACTIVE.value) + if state == ProfileState.BANNED.value: + logger.debug(f"Proxy '{proxy_url}' is BANNED. Skipping work/rest cycle enforcement.") + continue + # Un-rest logic if state == ProfileState.RESTING.value: rest_until = state_data.get('rest_until', 0) @@ -1257,6 +1364,60 @@ class PolicyEnforcer: return True # Indicates action was taken return False + def enforce_proxy_state_on_profiles(self, all_profiles_list, all_profiles_map): + """ + Enforces the state of a proxy onto all profiles that use it. + - If a proxy is BANNED, any non-banned profile using it will be banned. + - If a proxy is RESTING, any ACTIVE profile using it will be rested. + This is a safeguard that runs after all proxy state changes and before profile + state logic. + """ + unique_proxies = sorted(list(set(p['proxy'] for p in all_profiles_list if p.get('proxy')))) + if not unique_proxies: + return + + proxy_states = self.manager.get_proxy_states(unique_proxies) + + for profile in all_profiles_list: + proxy_url = profile.get('proxy') + if not proxy_url: + continue + + proxy_state_data = proxy_states.get(proxy_url) + if not proxy_state_data: + continue + + proxy_state = proxy_state_data.get('state') + profile_name = profile['name'] + + if proxy_state == ProfileState.BANNED.value and profile['state'] != ProfileState.BANNED.value: + reason = f"Proxy '{proxy_url}' is BANNED" + logger.warning(f"Banning profile '{profile_name}' because its proxy is banned: {reason}") + self.actions_taken_this_cycle += 1 + if not self.dry_run: + sm = self.manager.get_state_machine(profile_name) + if sm: + sm.ban(reason=reason) + # Update local map for consistency in this cycle + all_profiles_map[profile_name]['state'] = ProfileState.BANNED.value + all_profiles_map[profile_name]['reason'] = reason + + elif proxy_state == ProfileState.RESTING.value and profile['state'] == ProfileState.ACTIVE.value: + logger.info(f"Resting profile '{profile_name}' because its proxy '{proxy_url}' is resting.") + self.actions_taken_this_cycle += 1 + if not self.dry_run: + # Rest it for as long as the proxy is resting. 
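+                    # (If the proxy's rest has already expired, the max(0, ...) below clamps the duration to zero.)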
+ proxy_rest_until = proxy_state_data.get('rest_until', 0) + duration_minutes = max(0, (proxy_rest_until - time.time()) / 60) + sm = self.manager.get_state_machine(profile_name) + if sm: + sm.rest(reason=self.PROXY_REST_REASON, duration_minutes=duration_minutes) + # Update local map for consistency in this cycle + proxy_rest_until = proxy_state_data.get('rest_until', 0) + all_profiles_map[profile_name]['state'] = ProfileState.RESTING.value + all_profiles_map[profile_name]['rest_reason'] = self.PROXY_REST_REASON + all_profiles_map[profile_name]['rest_until'] = proxy_rest_until + def add_policy_enforcer_parser(subparsers): """Adds the parser for the 'policy-enforcer' command.""" parser = subparsers.add_parser( @@ -1421,6 +1582,10 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F logger.debug("Syncing active profiles from Auth to Download simulation...") + # Get all download proxy states once for efficiency + all_dl_proxies = sorted(list(set(p['proxy'] for p in all_download_profiles.values() if p.get('proxy')))) + all_dl_proxy_states = download_manager.get_proxy_states(all_dl_proxies) + # Get profiles that should be active in the download simulation target_active_download_profiles = set() @@ -1465,6 +1630,13 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F logger.warning(f"Auth profile '{target_profile_name}' needs an active download profile, but no corresponding download profile found.") continue + # Check proxy state before activating + proxy_url = download_profile.get('proxy') + proxy_state = all_dl_proxy_states.get(proxy_url, {}).get('state') + if proxy_state in [ProfileState.BANNED.value, ProfileState.RESTING.value]: + logger.debug(f"Sync: Deferring activation of download profile '{target_profile_name}' because its proxy '{proxy_url}' is {proxy_state}.") + continue + if download_profile['state'] not in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]: is_from_cooldown = download_profile['state'] == ProfileState.COOLDOWN.value log_msg_suffix = " (from COOLDOWN)" if is_from_cooldown else "" @@ -1476,24 +1648,12 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F sm.activate(profile=download_profile) # --- Group-Aware Deactivation --- - # Identify the target download groups based on target_active_download_profiles. - # CRITICAL FIX: Directly map individual profiles to their groups instead of relying on name patterns. - target_download_groups = set() - - for target_profile_name in target_active_download_profiles: - group_info = dl_profile_to_group.get(target_profile_name) - if group_info: - target_download_groups.add(group_info['name']) - - logger.debug(f"Target download groups for this sync cycle: {target_download_groups}") - - # Deactivate any download profiles that are active but are not in a target group + # Deactivate any download profiles that are active but are not the target profile for their group. + # This ensures that if auth wants 'user1_1' to be active, the currently active 'user1_0' is rested first. for dl_profile_name, dl_profile in all_download_profiles.items(): if dl_profile['state'] == ProfileState.ACTIVE.value: - group_info = dl_profile_to_group.get(dl_profile_name) - # If the profile is in a group, and that group is NOT a target group, rest it. 
- if group_info and group_info['name'] not in target_download_groups: - logger.info(f"Syncing active state: Resting download profile '{dl_profile_name}' as its group '{group_info['name']}' is no longer active.") + if dl_profile_name not in target_active_download_profiles: + logger.info(f"Syncing active state: Resting download profile '{dl_profile_name}' as it is no longer the target active profile for its group.") if not dry_run: sm = download_manager.get_state_machine(dl_profile_name) if sm: @@ -1539,9 +1699,9 @@ def main_policy_enforcer(args): 'unlock_stale_locks_after_seconds': 120, 'unlock_cooldown_seconds': 0, 'max_global_proxy_active_minutes': 0, 'rest_duration_on_max_active': 10, - 'profile_selection_strategy': 'longest_idle', + 'profile_selection_strategy': None, 'global_max_active_profiles': 0, - 'interval_seconds': 60, 'proxy_groups': [], 'profile_groups': [] + 'interval_seconds': 60, 'proxy_groups': [], 'profile_groups': [], 'enforcement_pools': [] } sim_params = policy.get('simulation_parameters', {}) @@ -1586,10 +1746,90 @@ def main_policy_enforcer(args): logger.info(f"Setting up enforcer for {sim_type} simulation...") - # --- Dynamic Profile Group Discovery --- - profile_group_templates = policy.get('profile_group_templates') - # Check if templates exist and if the config block doesn't already have groups (CLI overrides take precedence) - if profile_group_templates and 'profile_groups' not in policy_config: + # --- Hybrid Profile Group Discovery (Static + Dynamic) --- + common_group_settings = policy.get('common_group_settings', {}) + enforcement_pools = policy.get('enforcement_pools') + profile_group_templates = policy.get('profile_group_templates') # For backward compatibility + + # Start with any statically defined groups from the policy. + final_profile_groups = policy_config.get('profile_groups', []) + + # If enforcement_pools are defined, discover dynamic groups and merge them. + if enforcement_pools: + logger.info(f"Found 'enforcement_pools'. Discovering dynamic profile groups to merge with static ones for {sim_type}...") + + # Determine key_prefix to connect to the right Redis env + policy_env = sim_params.get(env_policy_key) + default_policy_env = sim_params.get('env') + effective_env = env_cli_arg or args.env or policy_env or default_policy_env or 'dev' + if args.key_prefix: temp_key_prefix = args.key_prefix + elif args.legacy: temp_key_prefix = 'profile_mgmt_' + else: temp_key_prefix = f"{effective_env}_profile_mgmt_" + + try: + temp_manager = ProfileManager(redis_host, redis_port, redis_password, temp_key_prefix, redis_db) + all_profiles = temp_manager.list_profiles() + found_prefixes = set(p['name'].rsplit('_', 1)[0] for p in all_profiles) + + if not found_prefixes: + logger.warning(f"Dynamic discovery found no profile prefixes for env '{effective_env}'.") + else: + logger.info(f"Discovered {len(found_prefixes)} unique profile prefixes: {sorted(list(found_prefixes))}") + + dynamically_generated_groups = [] + # Match discovered prefixes against patterns in each enforcement pool + for i, pool in enumerate(enforcement_pools): + pool_name = pool.get('name', f'pool_{i}') + pool_patterns = pool.get('profile_group_patterns', []) + # If a pool has no patterns, it's just for defining concurrency for static groups. Skip discovery. 
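+                # (Statically defined 'profile_groups' entries can still reference such a pool
+                # via a 'pool_name' field, which the activation and healing checks read.)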
+ if not pool_patterns: + continue + + # Merge common settings with any pool-specific overrides + group_settings_template = deepcopy(common_group_settings) + pool_specific_settings = pool.get('group_settings', {}) + + # A simple way to deep merge the two levels (auth/download) + auth_settings = group_settings_template.get('auth', {}) + auth_settings.update(pool_specific_settings.get('auth', {})) + group_settings_template['auth'] = auth_settings + + download_settings = group_settings_template.get('download', {}) + download_settings.update(pool_specific_settings.get('download', {})) + group_settings_template['download'] = download_settings + + for prefix in sorted(list(found_prefixes)): + for pattern in pool_patterns: + if fnmatch.fnmatch(prefix, pattern): + sim_settings = group_settings_template.get(sim_type.lower()) + if not sim_settings: + logger.debug(f"Pool '{pool_name}' has no settings for '{sim_type}'. Skipping for prefix '{prefix}'.") + continue + + new_group = deepcopy(sim_settings) + new_group['prefix'] = prefix + new_group['name'] = prefix + new_group['pool_name'] = pool_name + + dynamically_generated_groups.append(new_group) + logger.debug(f"Assigned prefix '{prefix}' to pool '{pool_name}' for {sim_type} simulation.") + break + + if dynamically_generated_groups: + logger.info(f"Merging {len(final_profile_groups)} static group(s) with {len(dynamically_generated_groups)} discovered dynamic group(s).") + final_profile_groups.extend(dynamically_generated_groups) + + except Exception as e: + logger.error(f"Failed during dynamic profile group discovery: {e}", exc_info=args.verbose) + + # Update the policy_config with the final merged list and the pool definitions + policy_config['profile_groups'] = final_profile_groups + # CRITICAL: Deepcopy enforcement_pools to prevent modification in one simulation + # from affecting the other, since the policy object is shared. + policy_config['enforcement_pools'] = deepcopy(enforcement_pools) + + # For backward compatibility with the old template format + elif profile_group_templates and 'profile_groups' not in policy_config: logger.info(f"Found 'profile_group_templates'. Discovering profile groups dynamically for {sim_type}...") # Determine key_prefix to connect to the right Redis env (logic duplicated from below) @@ -1639,6 +1879,21 @@ def main_policy_enforcer(args): policy_config['profile_groups'] = generated_groups except Exception as e: logger.error(f"Failed during dynamic profile group discovery: {e}", exc_info=args.verbose) + + # In the download simulation, the active profiles are dictated entirely by the + # cross-simulation sync logic. We must disable the download enforcer's own + # concurrency limits (max_active_profiles) to prevent it from "healing" + # profiles that the sync logic has correctly activated. + if sim_type == 'Download': + logger.info("Disabling max_active_profiles limits for Download simulation. Active profiles will be managed by cross-sim sync.") + if 'profile_groups' in policy_config: + for group in policy_config['profile_groups']: + group['max_active_profiles'] = 0 + if 'enforcement_pools' in policy_config: + for pool in policy_config['enforcement_pools']: + pool['max_active_profiles'] = 0 + # Also disable the global limit for the download simulation. 
+ policy_config['global_max_active_profiles'] = 0 config = Config(args, policy_config, code_defaults) diff --git a/ytops_client-source/ytops_client/profile_manager_tool.py b/ytops_client-source/ytops_client/profile_manager_tool.py index 96d1687..f5832e8 100644 --- a/ytops_client-source/ytops_client/profile_manager_tool.py +++ b/ytops_client-source/ytops_client/profile_manager_tool.py @@ -18,8 +18,10 @@ import time from datetime import datetime from typing import Dict, List, Optional, Any import collections +import fnmatch import redis +import yaml from .profile_statemachine import ProfileState, ProfileStateMachine @@ -199,8 +201,8 @@ class ProfileManager: # When decrementing, ensure the counter exists to avoid creating negative counters from stray calls. if count < 0 and not self.redis.exists(key): - logger.warning(f"Attempted to decrement pending downloads for '{profile_name}' by {abs(count)}, but no counter exists. No action taken.") - return None + logger.warning(f"Attempted to decrement pending downloads for '{profile_name}' by {abs(count)}, but no counter exists. This can happen in a race condition. Assuming task is complete and counter is zero.") + return 0 new_value = self.redis.incrby(key, count) @@ -225,8 +227,8 @@ class ProfileManager: # Only decrement if the key exists. This prevents stray calls from creating negative counters. if not self.redis.exists(key): - logger.warning(f"Attempted to decrement pending downloads for '{profile_name}', but no counter exists. No action taken.") - return None + logger.warning(f"Attempted to decrement pending downloads for '{profile_name}', but no counter exists. This can happen in a race condition. Assuming task is complete and counter is zero.") + return 0 new_value = self.redis.decr(key) @@ -444,7 +446,12 @@ class ProfileManager: logger.error(f"Invalid state: {new_state}") return False - sm = self.get_state_machine(name) + profile = self.get_profile(name) + if not profile: + # get_profile already logs an error if the profile is not found. + return False + + sm = self.get_state_machine(name, profile=profile) if not sm: return False # get_state_machine logs the error @@ -453,14 +460,16 @@ class ProfileManager: return True try: + # Pass the profile object to the transition methods for context, + # which is consistent with the policy enforcer's usage. if new_state == ProfileState.ACTIVE.value: - sm.activate() + sm.activate(profile=profile) elif new_state == ProfileState.BANNED.value: - sm.ban(reason=reason) + sm.ban(reason=reason, profile=profile) elif new_state == ProfileState.RESTING.value: - sm.rest(reason=reason) + sm.rest(reason=reason, profile=profile) elif new_state == ProfileState.PAUSED.value: - sm.pause(reason=reason) + sm.pause(reason=reason, profile=profile) # LOCKED and COOLDOWN are not handled here as they are special transitions # from lock_profile and unlock_profile, and should not be set directly. 
elif new_state in [ProfileState.LOCKED.value, ProfileState.COOLDOWN.value]: @@ -720,30 +729,51 @@ class ProfileManager: logger.info(f"Deleted {deleted_count} global counter key(s).") return deleted_count - def set_proxy_state(self, proxy_url: str, state: str, rest_duration_minutes: Optional[int] = None) -> bool: + def set_proxy_state(self, proxy_url: str, state: str, rest_duration_minutes: Optional[int] = None, reason: Optional[str] = None) -> bool: """Set the state of a proxy and propagates it to associated profiles.""" - if state not in [ProfileState.ACTIVE.value, ProfileState.RESTING.value]: - logger.error(f"Invalid proxy state: {state}. Only ACTIVE and RESTING are supported for proxies.") + if state not in [ProfileState.ACTIVE.value, ProfileState.RESTING.value, ProfileState.BANNED.value]: + logger.error(f"Invalid proxy state: {state}. Only ACTIVE, RESTING, and BANNED are supported for proxies.") return False proxy_key = self._proxy_state_key(proxy_url) now = time.time() updates = {'state': state} + if reason: + updates['reason'] = reason + else: + # Clear reason if not provided + updates['reason'] = '' rest_until = 0 if state == ProfileState.RESTING.value: - if not rest_duration_minutes or rest_duration_minutes <= 0: + if rest_duration_minutes is None: logger.error("rest_duration_minutes is required when setting proxy state to RESTING.") return False - rest_until = now + rest_duration_minutes * 60 + + if rest_duration_minutes == -1: + # Use a very large number for "indefinite" to avoid special cases later. + # 10 years should be sufficient. + rest_until = now + (10 * 365 * 24 * 60 * 60) + elif rest_duration_minutes > 0: + rest_until = now + rest_duration_minutes * 60 + else: + logger.error("rest_duration_minutes must be positive, or -1 for indefinite.") + return False + updates['rest_until'] = str(rest_until) updates['work_start_timestamp'] = '0' # Clear work start time - else: # ACTIVE + elif state in [ProfileState.ACTIVE.value, ProfileState.BANNED.value]: updates['rest_until'] = '0' - updates['work_start_timestamp'] = str(now) - + if state == ProfileState.ACTIVE.value: + updates['work_start_timestamp'] = str(now) + else: # BANNED + updates['work_start_timestamp'] = '0' + self.redis.hset(proxy_key, mapping=updates) - logger.info(f"Set proxy '{proxy_url}' state to {state}.") + log_msg = f"Set proxy '{proxy_url}' state to {state}." + if reason: + log_msg += f" Reason: {reason}" + logger.info(log_msg) # Now, update associated profiles profiles_on_proxy = self.list_profiles(proxy_filter=proxy_url) @@ -751,16 +781,31 @@ class ProfileManager: return True if state == ProfileState.RESTING.value: - logger.info(f"Propagating RESTING state to profiles on proxy '{proxy_url}'.") + propagate_reason = reason or "Proxy resting" + logger.info(f"Propagating RESTING state to profiles on proxy '{proxy_url}'. Reason: {propagate_reason}") for profile in profiles_on_proxy: if profile['state'] == ProfileState.ACTIVE.value: - self.update_profile_state(profile['name'], ProfileState.RESTING.value, "Proxy resting") + self.update_profile_state(profile['name'], ProfileState.RESTING.value, propagate_reason) self.update_profile_field(profile['name'], 'rest_until', str(rest_until)) - elif state == ProfileState.ACTIVE.value: - logger.info(f"Propagating ACTIVE state to profiles on proxy '{proxy_url}'.") + elif state == ProfileState.BANNED.value: + propagate_reason = reason or "Proxy banned" + logger.info(f"Propagating BANNED state to profiles on proxy '{proxy_url}'. 
Reason: {propagate_reason}") for profile in profiles_on_proxy: - if profile['state'] == ProfileState.RESTING.value and profile.get('rest_reason') == "Proxy resting": - self.update_profile_state(profile['name'], ProfileState.ACTIVE.value, "Proxy activated") + if profile['state'] != ProfileState.BANNED.value: + self.update_profile_state(profile['name'], ProfileState.BANNED.value, propagate_reason) + elif state == ProfileState.ACTIVE.value: + propagate_reason = reason or "Proxy activated" + logger.info(f"Propagating ACTIVE state to profiles on proxy '{proxy_url}'. Reason: {propagate_reason}") + for profile in profiles_on_proxy: + # Check for proxy-related reasons in both rest_reason and ban_reason + proxy_related_reason = False + if profile.get('rest_reason', '').startswith("Proxy "): + proxy_related_reason = True + if profile.get('ban_reason', '').startswith("Proxy "): + proxy_related_reason = True + + if (profile['state'] in [ProfileState.RESTING.value, ProfileState.BANNED.value]) and proxy_related_reason: + self.update_profile_state(profile['name'], ProfileState.ACTIVE.value, propagate_reason) return True @@ -1162,8 +1207,8 @@ def add_profile_manager_parser(subparsers): # List command list_parser = subparsers.add_parser('list', help='List profiles', parents=[common_parser]) - list_parser.add_argument('--auth-env', help='Environment name for the Auth simulation monitor. Use with --download-env for a merged view.') - list_parser.add_argument('--download-env', help='Environment name for the Download simulation monitor. Use with --auth-env for a merged view.') + list_parser.add_argument('--auth-env', default=None, help='Environment name for the Auth simulation monitor. Use with --download-env for a merged view. Defaults to YTOPS_AUTH_ENV env var.') + list_parser.add_argument('--download-env', default=None, help='Environment name for the Download simulation monitor. Use with --auth-env for a merged view. Defaults to YTOPS_DOWNLOAD_ENV env var.') list_parser.add_argument('--separate-views', action='store_true', help='In dual-monitor mode, show two separate reports instead of a single merged view.') list_parser.add_argument('--rest-after-requests', type=int, help='(For display) Show countdown to rest based on this request limit.') list_parser.add_argument('--state', help='Filter by state') @@ -1177,20 +1222,24 @@ def add_profile_manager_parser(subparsers): list_parser.add_argument('--no-blink', action='store_true', help='Use ANSI escape codes for smoother screen updates in --live mode (experimental).') list_parser.add_argument('--interval-seconds', type=int, default=5, help='When in --live mode, how often to refresh in seconds. Default: 5.') list_parser.add_argument('--hide-active-state', action='store_true', help="Display 'ACTIVE' state as blank for cleaner UI.") + list_parser.add_argument('--hide-ungrouped', action='store_true', help="Hide profiles that do not belong to any configured profile group (e.g., old profiles after a config change). 
Shown by default.") # Get command get_parser = subparsers.add_parser('get', help='Get profile details', parents=[common_parser]) get_parser.add_argument('name', help='Profile name') # Set proxy state command - set_proxy_state_parser = subparsers.add_parser('set-proxy-state', help='Set the state of a proxy and propagate to its profiles.', parents=[common_parser]) - set_proxy_state_parser.add_argument('proxy_url', help='Proxy URL') - set_proxy_state_parser.add_argument('state', choices=['ACTIVE', 'RESTING'], help='New state for the proxy') - set_proxy_state_parser.add_argument('--duration-minutes', type=int, help='Duration for the RESTING state') + set_proxy_state_parser = subparsers.add_parser('set-proxy-state', help='Set the state of a proxy (or proxies) and propagate to its profiles.', parents=[common_parser]) + set_proxy_state_parser.add_argument('proxy_urls', help='Proxy URL, or comma-separated list of URLs') + set_proxy_state_parser.add_argument('state', choices=['ACTIVE', 'RESTING', 'BANNED'], help='New state for the proxy') + set_proxy_state_parser.add_argument('--duration-minutes', type=int, help='Duration for the RESTING state. Use -1 for indefinite rest.') + set_proxy_state_parser.add_argument('--reason', help='Reason for the state change. Propagated to profiles.') + set_proxy_state_parser.add_argument('--auth-env', default=None, help='Target the Auth simulation environment. Can be used with --download-env. Defaults to YTOPS_AUTH_ENV env var.') + set_proxy_state_parser.add_argument('--download-env', default=None, help='Target the Download simulation environment. Can be used with --auth-env. Defaults to YTOPS_DOWNLOAD_ENV env var.') # Update state command - update_state_parser = subparsers.add_parser('update-state', help='Update profile state', parents=[common_parser]) - update_state_parser.add_argument('name', help='Profile name') + update_state_parser = subparsers.add_parser('update-state', help='Update profile state for one or more profiles.', parents=[common_parser]) + update_state_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")') update_state_parser.add_argument('state', choices=ProfileState.values(), help='New state') update_state_parser.add_argument('--reason', help='Reason for state change (especially for BAN)') @@ -1202,21 +1251,21 @@ def add_profile_manager_parser(subparsers): update_field_parser.add_argument('value', help='New value') # Pause command (convenience) - pause_parser = subparsers.add_parser('pause', help=f'Pause a profile (sets state to {ProfileState.PAUSED.value}).', parents=[common_parser]) - pause_parser.add_argument('name', help='Profile name') + pause_parser = subparsers.add_parser('pause', help=f'Pause one or more profiles (sets state to {ProfileState.PAUSED.value}).', parents=[common_parser]) + pause_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")') # Activate command (convenience) - activate_parser = subparsers.add_parser('activate', help=f'Activate a profile (sets state to {ProfileState.ACTIVE.value}). Useful for resuming a PAUSED profile or fixing a stale LOCKED one.', parents=[common_parser]) - activate_parser.add_argument('name', help='Profile name') + activate_parser = subparsers.add_parser('activate', help=f'Activate one or more profiles (sets state to {ProfileState.ACTIVE.value}). 
Useful for resuming a PAUSED profile or fixing a stale LOCKED one.', parents=[common_parser]) + activate_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")') # Ban command (convenience) - ban_parser = subparsers.add_parser('ban', help=f'Ban a profile (sets state to {ProfileState.BANNED.value}).', parents=[common_parser]) - ban_parser.add_argument('name', help='Profile name') + ban_parser = subparsers.add_parser('ban', help=f'Ban one or more profiles (sets state to {ProfileState.BANNED.value}).', parents=[common_parser]) + ban_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")') ban_parser.add_argument('--reason', required=True, help='Reason for ban') # Unban command (convenience) - unban_parser = subparsers.add_parser('unban', help=f'Unban a profile (sets state to {ProfileState.ACTIVE.value} and resets session counters).', parents=[common_parser]) - unban_parser.add_argument('name', help='Profile name') + unban_parser = subparsers.add_parser('unban', help=f'Unban one or more profiles (sets state to {ProfileState.ACTIVE.value} and resets session counters).', parents=[common_parser]) + unban_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")') # Delete command delete_parser = subparsers.add_parser('delete', help='Delete a profile', parents=[common_parser]) @@ -1228,6 +1277,15 @@ def add_profile_manager_parser(subparsers): delete_all_parser = subparsers.add_parser('delete-all', help='(Destructive) Delete all profiles and data under the current key prefix.', parents=[common_parser]) delete_all_parser.add_argument('--confirm', action='store_true', help='Confirm this highly destructive action (required)') + # Cleanup ungrouped command + cleanup_parser = subparsers.add_parser('cleanup-ungrouped', help='(Safe) Sync profiles in Redis with the setup policy (create missing, delete extra).', parents=[common_parser]) + cleanup_parser.add_argument('--policy-file', required=True, help='Path to the profile setup policy YAML file (e.g., policies/6_profile_setup_policy.yaml).') + cleanup_parser.add_argument('--auth-env', default=None, help="Environment name for the Auth simulation to clean. Defaults to YTOPS_AUTH_ENV env var.") + cleanup_parser.add_argument('--download-env', default=None, help="Environment name for the Download simulation to clean. Defaults to YTOPS_DOWNLOAD_ENV env var.") + cleanup_parser.add_argument('--dry-run', action='store_true', help="Only show which profiles would be created or deleted, don't actually change them.") + cleanup_parser.add_argument('--no-create-missing', action='store_true', help="Only delete ungrouped profiles, do not create profiles missing from Redis.") + cleanup_parser.add_argument('--disallow-cleanup-active-downloads', action='store_true', help="In paired auth/download cleanup, PREVENTS deleting a profile if its download side is ACTIVE, even if the auth side is idle. 
Cleanup of active downloads is allowed by default.") + # Reset global counters command reset_global_parser = subparsers.add_parser('reset-global-counters', help='Reset global counters (e.g., failed_lock_attempts).', parents=[common_parser]) @@ -1675,6 +1733,19 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups return profiles = manager.list_profiles(args.state, args.proxy) + + if getattr(args, 'hide_ungrouped', False): + all_grouped_profile_names = set() + for group in profile_groups_config: + for p_name in group.get('profiles_in_group', []): + all_grouped_profile_names.add(p_name) + + original_count = len(profiles) + profiles = [p for p in profiles if p.get('name') in all_grouped_profile_names] + filtered_count = original_count - len(profiles) + if filtered_count > 0 and not args.live: + print(f"NOTE: {filtered_count} ungrouped profiles were hidden via --hide-ungrouped.", file=sys.stderr) + if not profiles: print("No profiles found matching the criteria.", file=file) return @@ -1832,11 +1903,14 @@ def _render_simulation_view(title, manager, args, file=sys.stdout): profile_groups_config = _build_profile_groups_config(manager, profiles) - profile_selection_strategy = manager.get_config('profile_selection_strategy') - if profile_selection_strategy: - print(f"Profile Selection Strategy: {profile_selection_strategy}", file=file) - - _render_profile_group_summary_table(manager, profiles, profile_groups_config, args, file=file) + # The group summary table is only relevant for the Auth simulation, which has + # selection strategies and rotation policies. + is_auth_sim = 'Auth' in title + if is_auth_sim: + profile_selection_strategy = manager.get_config('profile_selection_strategy') + if profile_selection_strategy: + print(f"Profile Selection Strategy: {profile_selection_strategy}", file=file) + _render_profile_group_summary_table(manager, profiles, profile_groups_config, args, file=file) failed_lock_attempts = manager.get_failed_lock_attempts() global_stats = manager.get_global_stats() @@ -1997,7 +2071,6 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout): _render_activation_history_table(auth_manager, file=file) print(f"\n--- Download Simulation Profile Details ({args.download_env}) ---", file=file) - _render_profile_group_summary_table(download_manager, dl_profiles, dl_groups_config, args, file=file) _render_profile_details_table(download_manager, args, "Download", dl_groups_config, file=file) if args.show_activation_history: _render_activation_history_table(download_manager, file=file) @@ -2010,6 +2083,8 @@ def _print_profile_list(manager, args, title="Profile Status"): return _render_simulation_view(title, manager, args, file=sys.stdout) + + def main_profile_manager(args): """Main dispatcher for 'profile' command.""" if load_dotenv: @@ -2026,6 +2101,13 @@ def main_profile_manager(args): print(f"ERROR: The specified --env-file was not found: {args.env_file}", file=sys.stderr) return 1 + # After loading .env, populate any args that were not provided on the CLI + # This is necessary because argparse `default=os.getenv(...)` runs before `load_dotenv`. 
+ if hasattr(args, 'auth_env') and args.auth_env is None: + args.auth_env = os.getenv('YTOPS_AUTH_ENV') + if hasattr(args, 'download_env') and args.download_env is None: + args.download_env = os.getenv('YTOPS_DOWNLOAD_ENV') + if args.redis_host is None: args.redis_host = os.getenv('REDIS_HOST', os.getenv('MASTER_HOST_IP', 'localhost')) if args.redis_port is None: @@ -2190,8 +2272,46 @@ def main_profile_manager(args): sys.stdout.flush() elif args.profile_command == 'set-proxy-state': - success = manager.set_proxy_state(args.proxy_url, args.state, args.duration_minutes) - return 0 if success else 1 + proxy_urls = [p.strip() for p in args.proxy_urls.split(',') if p.strip()] + if not proxy_urls: + print("Error: No proxy URLs provided.", file=sys.stderr) + return 1 + + envs_to_process = [] + if args.auth_env: + envs_to_process.append(args.auth_env) + if args.download_env: + envs_to_process.append(args.download_env) + + if envs_to_process: + # If --auth-env or --download-env are used, operate on them. + all_success = True + for env_name in set(envs_to_process): + # When operating on specific envs, derive prefix from env name, ignoring --legacy and --key-prefix. + # This aligns with the behavior of the 'list' command in dual-mode. + key_prefix_for_env = f"{env_name}_profile_mgmt_" + print(f"--- Setting proxy state for environment: {env_name} (prefix: {key_prefix_for_env}) ---", file=sys.stderr) + env_manager = ProfileManager( + redis_host=args.redis_host, + redis_port=args.redis_port, + redis_password=args.redis_password, + key_prefix=key_prefix_for_env, + redis_db=args.redis_db + ) + for proxy_url in proxy_urls: + success = env_manager.set_proxy_state(proxy_url, args.state, args.duration_minutes, args.reason) + if not success: + all_success = False + return 0 if all_success else 1 + else: + # Fallback to the single manager created with --env, --legacy, or --key-prefix. + # This maintains backward compatibility and handles single-environment cases. + all_success = True + for proxy_url in proxy_urls: + success = manager.set_proxy_state(proxy_url, args.state, args.duration_minutes, args.reason) + if not success: + all_success = False + return 0 if all_success else 1 elif args.profile_command == 'get': profile = manager.get_profile(args.name) @@ -2226,31 +2346,211 @@ def main_profile_manager(args): return 0 elif args.profile_command == 'update-state': - success = manager.update_profile_state(args.name, args.state, args.reason or '') - return 0 if success else 1 + names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()] + if not names_or_patterns: + print("Error: No profile names or patterns provided.", file=sys.stderr) + return 1 + + all_profiles = manager.list_profiles() + all_profile_names = {p['name'] for p in all_profiles} + + profiles_to_update = set() + for item in names_or_patterns: + if '*' in item or '?' 
in item: + matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)} + if not matched_profiles: + print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr) + profiles_to_update.update(matched_profiles) + else: + if item in all_profile_names: + profiles_to_update.add(item) + else: + print(f"Warning: Profile '{item}' not found.", file=sys.stderr) + + if not profiles_to_update: + print("No matching profiles found to update.", file=sys.stderr) + return 1 + + print(f"The following {len(profiles_to_update)} profiles will be updated to state '{args.state}':") + print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key))) + confirm = input("Are you sure you want to proceed? (y/N): ") + if confirm.lower() != 'y': + print("Aborted.") + return 1 + + all_success = True + for name in sorted(list(profiles_to_update), key=natural_sort_key): + success = manager.update_profile_state(name, args.state, args.reason or '') + if not success: + all_success = False + return 0 if all_success else 1 elif args.profile_command == 'update-field': success = manager.update_profile_field(args.name, args.field, args.value) return 0 if success else 1 elif args.profile_command == 'pause': - success = manager.update_profile_state(args.name, ProfileState.PAUSED.value, 'Manual pause') - return 0 if success else 1 + names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()] + if not names_or_patterns: + print("Error: No profile names or patterns provided.", file=sys.stderr) + return 1 + + all_profiles = manager.list_profiles() + all_profile_names = {p['name'] for p in all_profiles} + + profiles_to_update = set() + for item in names_or_patterns: + if '*' in item or '?' in item: + matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)} + if not matched_profiles: + print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr) + profiles_to_update.update(matched_profiles) + else: + if item in all_profile_names: + profiles_to_update.add(item) + else: + print(f"Warning: Profile '{item}' not found.", file=sys.stderr) + + if not profiles_to_update: + print("No matching profiles found to update.", file=sys.stderr) + return 1 + + print(f"The following {len(profiles_to_update)} profiles will be PAUSED:") + print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key))) + confirm = input("Are you sure you want to proceed? (y/N): ") + if confirm.lower() != 'y': + print("Aborted.") + return 1 + + all_success = True + for name in sorted(list(profiles_to_update), key=natural_sort_key): + success = manager.update_profile_state(name, ProfileState.PAUSED.value, 'Manual pause') + if not success: + all_success = False + return 0 if all_success else 1 elif args.profile_command == 'activate': - success = manager.update_profile_state(args.name, ProfileState.ACTIVE.value, 'Manual activation') - return 0 if success else 1 + names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()] + if not names_or_patterns: + print("Error: No profile names or patterns provided.", file=sys.stderr) + return 1 + + all_profiles = manager.list_profiles() + all_profile_names = {p['name'] for p in all_profiles} + + profiles_to_update = set() + for item in names_or_patterns: + if '*' in item or '?' 
in item: + matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)} + if not matched_profiles: + print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr) + profiles_to_update.update(matched_profiles) + else: + if item in all_profile_names: + profiles_to_update.add(item) + else: + print(f"Warning: Profile '{item}' not found.", file=sys.stderr) + + if not profiles_to_update: + print("No matching profiles found to update.", file=sys.stderr) + return 1 + + print(f"The following {len(profiles_to_update)} profiles will be ACTIVATED:") + print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key))) + confirm = input("Are you sure you want to proceed? (y/N): ") + if confirm.lower() != 'y': + print("Aborted.") + return 1 + + all_success = True + for name in sorted(list(profiles_to_update), key=natural_sort_key): + success = manager.update_profile_state(name, ProfileState.ACTIVE.value, 'Manual activation') + if not success: + all_success = False + return 0 if all_success else 1 elif args.profile_command == 'ban': - success = manager.update_profile_state(args.name, ProfileState.BANNED.value, args.reason) - return 0 if success else 1 + names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()] + if not names_or_patterns: + print("Error: No profile names or patterns provided.", file=sys.stderr) + return 1 + + all_profiles = manager.list_profiles() + all_profile_names = {p['name'] for p in all_profiles} + + profiles_to_update = set() + for item in names_or_patterns: + if '*' in item or '?' in item: + matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)} + if not matched_profiles: + print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr) + profiles_to_update.update(matched_profiles) + else: + if item in all_profile_names: + profiles_to_update.add(item) + else: + print(f"Warning: Profile '{item}' not found.", file=sys.stderr) + + if not profiles_to_update: + print("No matching profiles found to update.", file=sys.stderr) + return 1 + + print(f"The following {len(profiles_to_update)} profiles will be BANNED:") + print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key))) + confirm = input("Are you sure you want to proceed? (y/N): ") + if confirm.lower() != 'y': + print("Aborted.") + return 1 + + all_success = True + for name in sorted(list(profiles_to_update), key=natural_sort_key): + success = manager.update_profile_state(name, ProfileState.BANNED.value, args.reason) + if not success: + all_success = False + return 0 if all_success else 1 elif args.profile_command == 'unban': - # First activate, then reset session counters. The ban reason is cleared by update_profile_state. - success = manager.update_profile_state(args.name, ProfileState.ACTIVE.value, 'Manual unban') - if success: - manager.reset_profile_counters(args.name) - return 0 if success else 1 + names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()] + if not names_or_patterns: + print("Error: No profile names or patterns provided.", file=sys.stderr) + return 1 + + all_profiles = manager.list_profiles() + all_profile_names = {p['name'] for p in all_profiles} + + profiles_to_update = set() + for item in names_or_patterns: + if '*' in item or '?' 
in item: + matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)} + if not matched_profiles: + print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr) + profiles_to_update.update(matched_profiles) + else: + if item in all_profile_names: + profiles_to_update.add(item) + else: + print(f"Warning: Profile '{item}' not found.", file=sys.stderr) + + if not profiles_to_update: + print("No matching profiles found to update.", file=sys.stderr) + return 1 + + print(f"The following {len(profiles_to_update)} profiles will be UNBANNED:") + print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key))) + confirm = input("Are you sure you want to proceed? (y/N): ") + if confirm.lower() != 'y': + print("Aborted.") + return 1 + + all_success = True + for name in sorted(list(profiles_to_update), key=natural_sort_key): + # First activate, then reset session counters. The ban reason is cleared by update_profile_state. + success = manager.update_profile_state(name, ProfileState.ACTIVE.value, 'Manual unban') + if success: + manager.reset_profile_counters(name) + if not success: + all_success = False + return 0 if all_success else 1 elif args.profile_command == 'delete': if not args.confirm: @@ -2267,6 +2567,9 @@ def main_profile_manager(args): print(f"Deleted {deleted_count} key(s) with prefix '{manager.key_prefix}'.") return 0 + elif args.profile_command == 'cleanup-ungrouped': + return _main_cleanup_ungrouped(args) + elif args.profile_command == 'reset-global-counters': manager.reset_global_counters() return 0 @@ -2310,3 +2613,286 @@ def main_profile_manager(args): return 0 return 1 # Should not be reached + + +def _main_cleanup_ungrouped(args): + """Handler for the 'cleanup-ungrouped' command.""" + try: + with open(args.policy_file, 'r', encoding='utf-8') as f: + policy = yaml.safe_load(f) + except (IOError, yaml.YAMLError) as e: + print(f"Error: Could not read or parse policy file '{args.policy_file}': {e}", file=sys.stderr) + return 1 + + common_pools = policy.get('common_pools', []) + if not common_pools: + print(f"Warning: No 'common_pools' found in '{args.policy_file}'. Nothing to do.", file=sys.stderr) + return 0 + + desired_profiles = {} # name -> proxy + for pool in common_pools: + prefixes = pool.get('prefixes', []) + count = pool.get('count', 0) + proxy = pool.get('proxy') + if not proxy: + print(f"Error: Pool with prefixes {prefixes} is missing a 'proxy' definition.", file=sys.stderr) + return 1 + for prefix in prefixes: + for i in range(count): + name = f"{prefix}_{i}" + desired_profiles[name] = proxy + + desired_profile_names = set(desired_profiles.keys()) + print(f"Loaded setup policy. Found {len(desired_profile_names)} desired profiles across {len(common_pools)} pools.") + + total_deleted = 0 + total_skipped = 0 + total_created = 0 + total_updated = 0 + + # --- Paired Cleanup Mode --- + if args.auth_env and args.download_env: + print("\n--- Running in Paired Cleanup Mode ---") + if not args.disallow_cleanup_active_downloads: + print("Cleanup of active download profiles is ENABLED (default). Auth profiles will be checked for idleness.") + print("If an auth profile is idle, its corresponding download profile will be removed regardless of its state.") + else: + print("Cleanup of active download profiles is DISABLED. Both auth and download profiles must be idle to be removed.") + if args.dry_run: + print("--- DRY RUN MODE: No changes will be made. 
---") + + def _create_cleanup_manager(env_name): + key_prefix = f"{env_name}_profile_mgmt_" + if args.legacy: key_prefix = 'profile_mgmt_' + if args.key_prefix: key_prefix = args.key_prefix + return ProfileManager( + redis_host=args.redis_host, redis_port=args.redis_port, + redis_password=args.redis_password, key_prefix=key_prefix, + redis_db=args.redis_db + ) + + auth_manager = _create_cleanup_manager(args.auth_env) + download_manager = _create_cleanup_manager(args.download_env) + + auth_profiles_map = {p['name']: p for p in auth_manager.list_profiles()} + download_profiles_map = {p['name']: p for p in download_manager.list_profiles()} + + # --- Create Missing Profiles --- + auth_missing = desired_profile_names - set(auth_profiles_map.keys()) + download_missing = desired_profile_names - set(download_profiles_map.keys()) + + if auth_missing or download_missing: + if args.no_create_missing: + if auth_missing: print(f"Found {len(auth_missing)} missing profiles in '{args.auth_env}', creation disabled via --no-create-missing.") + if download_missing: print(f"Found {len(download_missing)} missing profiles in '{args.download_env}', creation disabled via --no-create-missing.") + else: + if auth_missing: + print(f"Found {len(auth_missing)} missing profiles in '{args.auth_env}'. Creating them...") + for name in sorted(list(auth_missing), key=natural_sort_key): + proxy = desired_profiles.get(name) + print(f" - {'[DRY RUN] Would create' if args.dry_run else 'Creating'} profile '{name}' with proxy '{proxy}' in '{args.auth_env}'") + if not args.dry_run: + if auth_manager.create_profile(name, proxy): total_created += 1 + if download_missing: + print(f"Found {len(download_missing)} missing profiles in '{args.download_env}'. Creating them...") + for name in sorted(list(download_missing), key=natural_sort_key): + proxy = desired_profiles.get(name) + print(f" - {'[DRY RUN] Would create' if args.dry_run else 'Creating'} profile '{name}' with proxy '{proxy}' in '{args.download_env}'") + if not args.dry_run: + if download_manager.create_profile(name, proxy): total_created += 1 + + # Refresh maps after creation + auth_profiles_map = {p['name']: p for p in auth_manager.list_profiles()} + download_profiles_map = {p['name']: p for p in download_manager.list_profiles()} + + # --- Update Proxies for Existing Profiles --- + auth_existing_policy_names = set(auth_profiles_map.keys()) & desired_profile_names + if auth_existing_policy_names: + print(f"\nChecking for proxy updates in '{args.auth_env}'...") + for name in sorted(list(auth_existing_policy_names), key=natural_sort_key): + current_proxy = auth_profiles_map[name].get('proxy') + desired_proxy = desired_profiles.get(name) + if current_proxy != desired_proxy and desired_proxy: + print(f" - {'[DRY RUN] Would update' if args.dry_run else 'Updating'} proxy for profile '{name}': from '{current_proxy}' to '{desired_proxy}'") + if not args.dry_run: + if auth_manager.update_profile_field(name, 'proxy', desired_proxy): total_updated += 1 + + download_existing_policy_names = set(download_profiles_map.keys()) & desired_profile_names + if download_existing_policy_names: + print(f"\nChecking for proxy updates in '{args.download_env}'...") + for name in sorted(list(download_existing_policy_names), key=natural_sort_key): + current_proxy = download_profiles_map[name].get('proxy') + desired_proxy = desired_profiles.get(name) + if current_proxy != desired_proxy and desired_proxy: + print(f" - {'[DRY RUN] Would update' if args.dry_run else 'Updating'} proxy for profile '{name}': 
from '{current_proxy}' to '{desired_proxy}'") + if not args.dry_run: + if download_manager.update_profile_field(name, 'proxy', desired_proxy): total_updated += 1 + + # --- Delete Ungrouped Profiles --- + auth_ungrouped = set(auth_profiles_map.keys()) - desired_profile_names + download_ungrouped = set(download_profiles_map.keys()) - desired_profile_names + all_ungrouped = sorted(list(auth_ungrouped | download_ungrouped), key=natural_sort_key) + + if not all_ungrouped: + print("\nNo ungrouped profiles found in either environment to clean up.") + else: + print(f"\nFound {len(all_ungrouped)} ungrouped profile(s) across both environments to consider for deletion.") + for name in all_ungrouped: + auth_profile = auth_profiles_map.get(name) + download_profile = download_profiles_map.get(name) + + can_delete_auth, auth_skip_reasons = False, [] + if auth_profile: + state = auth_profile.get('state', 'UNKNOWN') + pending_dls = auth_manager.get_pending_downloads(name) + rest_reason = auth_profile.get('rest_reason') + + is_safe = True + if state in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]: + is_safe = False; auth_skip_reasons.append(f"auth is '{state}'") + if pending_dls > 0: + is_safe = False; auth_skip_reasons.append(f"auth has {pending_dls} pending DLs") + if rest_reason == 'waiting_downloads': + is_safe = False; auth_skip_reasons.append("auth is 'waiting_downloads'") + if is_safe: can_delete_auth = True + + can_delete_download, download_skip_reasons = False, [] + if download_profile: + state = download_profile.get('state', 'UNKNOWN') + is_safe = state not in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value] + if not is_safe: download_skip_reasons.append(f"download is '{state}'") + if is_safe: can_delete_download = True + + if auth_profile and not can_delete_auth: + print(f" - SKIPPING '{name}' because its auth profile is busy ({', '.join(auth_skip_reasons)}).") + total_skipped += 1 + continue + + was_download_busy = download_profile and not can_delete_download + if auth_profile and can_delete_auth and not args.disallow_cleanup_active_downloads: + can_delete_download = True # The override rule + + if (auth_profile and can_delete_auth) or (download_profile and can_delete_download): + msgs = [] + if auth_profile and can_delete_auth: msgs.append(f"Auth (State: {auth_profile.get('state', 'N/A')})") + if download_profile and can_delete_download: + dl_msg = f"Download (State: {download_profile.get('state', 'N/A')})" + if was_download_busy: dl_msg += " <-- DELETING ACTIVE" + msgs.append(dl_msg) + + print(f" - Deleting '{name}': {'; '.join(msgs)}") + if not args.dry_run: + if auth_profile and can_delete_auth: auth_manager.delete_profile(name) + if download_profile and can_delete_download: download_manager.delete_profile(name) + total_deleted += 1 + else: + print(f" - SKIPPING '{name}' because its download profile is busy ({', '.join(download_skip_reasons)}) and no idle auth profile exists to override.") + total_skipped += 1 + # --- Single Environment Cleanup Mode --- + else: + if args.dry_run: + print("--- DRY RUN MODE: No changes will be made. 
---") + + envs_to_clean = [] + if args.auth_env: envs_to_clean.append(args.auth_env) + if args.download_env: envs_to_clean.append(args.download_env) + + if not envs_to_clean: + if args.env: envs_to_clean.append(args.env) + else: + print("Error: You must specify at least one environment to clean up (e.g., --auth-env sim_auth, --download-env sim_download, or --env dev).", file=sys.stderr) + return 1 + + for env_name in envs_to_clean: + print(f"\n--- Cleaning environment: {env_name} ---") + + key_prefix = f"{env_name}_profile_mgmt_" + if args.legacy: key_prefix = 'profile_mgmt_' + if args.key_prefix: key_prefix = args.key_prefix + + manager = ProfileManager( + redis_host=args.redis_host, redis_port=args.redis_port, + redis_password=args.redis_password, key_prefix=key_prefix, + redis_db=args.redis_db + ) + + current_profiles_list = manager.list_profiles() + current_profiles_map = {p['name']: p for p in current_profiles_list} + current_profile_names = set(current_profiles_map.keys()) + ungrouped_names = current_profile_names - desired_profile_names + missing_names = desired_profile_names - current_profile_names + existing_policy_names = current_profile_names & desired_profile_names + + # --- Update Proxies for Existing Profiles --- + profiles_updated_in_env = 0 + if existing_policy_names: + print("Checking for proxy updates for existing profiles...") + for name in sorted(list(existing_policy_names), key=natural_sort_key): + current_proxy = current_profiles_map[name].get('proxy') + desired_proxy = desired_profiles.get(name) + if current_proxy != desired_proxy and desired_proxy: + print(f" - {'[DRY RUN] Would update' if args.dry_run else 'Updating'} proxy for profile '{name}': from '{current_proxy}' to '{desired_proxy}'") + if not args.dry_run: + if manager.update_profile_field(name, 'proxy', desired_proxy): + profiles_updated_in_env += 1 + total_updated += profiles_updated_in_env + + profiles_created_in_env = 0 + if missing_names: + print(f"Found {len(missing_names)} profile(s) defined in the policy that do not exist in Redis.") + if args.no_create_missing: + print("Creation of missing profiles is disabled via --no-create-missing.") + else: + if not args.dry_run: + print("Creating missing profiles...") + + for name in sorted(list(missing_names), key=natural_sort_key): + proxy = desired_profiles.get(name) + print(f" - {'[DRY RUN] Would create' if args.dry_run else 'Creating'} profile '{name}' with proxy '{proxy}'") + if not args.dry_run: + if proxy: + if manager.create_profile(name, proxy): + profiles_created_in_env += 1 + else: + # This should not happen due to the check at the start + print(f" - SKIPPING '{name}' because its proxy could not be determined from the policy.", file=sys.stderr) + total_created += profiles_created_in_env + + if not ungrouped_names: + print("No ungrouped profiles found to clean up.") + if profiles_created_in_env == 0: + continue + + print(f"Found {len(ungrouped_names)} ungrouped profile(s) to consider for deletion.") + profiles_to_check = [p for p in current_profiles_list if p['name'] in ungrouped_names] + + for profile in sorted(profiles_to_check, key=lambda p: natural_sort_key(p['name'])): + name = profile['name'] + state = profile.get('state', 'UNKNOWN') + pending_dls = manager.get_pending_downloads(name) + + is_safe, reasons = True, [] + if state in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]: + is_safe = False; reasons.append(f"is in '{state}' state") + if pending_dls > 0: + is_safe = False; reasons.append(f"has {pending_dls} pending download(s)") + if 
profile.get('rest_reason') == 'waiting_downloads': + is_safe = False; reasons.append("is in 'waiting_downloads' state") + + if is_safe: + print(f" - Deleting '{name}' (State: {state}, Pending DLs: {pending_dls})") + if not args.dry_run: manager.delete_profile(name) + total_deleted += 1 + else: + print(f" - SKIPPING '{name}' because it {', '.join(reasons)}.") + total_skipped += 1 + + print("\n--- Cleanup Summary ---") + print(f"Total profiles created: {total_created}") + print(f"Total profiles updated: {total_updated}") + print(f"Total profiles deleted: {total_deleted}") + print(f"Total profiles skipped (still active or have pending work): {total_skipped}") + if total_skipped > 0: + print("Run the cleanup command again later to remove the skipped profiles once they are idle.") + + return 0 diff --git a/ytops_client-source/ytops_client/queue_manager_tool.py b/ytops_client-source/ytops_client/queue_manager_tool.py index 731ed26..67078c6 100644 --- a/ytops_client-source/ytops_client/queue_manager_tool.py +++ b/ytops_client-source/ytops_client/queue_manager_tool.py @@ -4,6 +4,7 @@ Redis Queue Management CLI Tool for yt-ops-client. """ import argparse +import hashlib import json import logging import os @@ -23,6 +24,15 @@ except ImportError: print("'tabulate' library not found. Please install it with: pip install tabulate", file=sys.stderr) tabulate = None +try: + import yaml +except ImportError: + print("PyYAML is not installed. Please install it with: pip install PyYAML", file=sys.stderr) + yaml = None + +from pathlib import Path +import fnmatch + # Configure logging logging.basicConfig( level=logging.INFO, @@ -31,9 +41,53 @@ logging.basicConfig( logger = logging.getLogger(__name__) +def _find_configured_queues(policies_dir="policies", env=None): + """Scans YAML files in a directory to find configured queue names.""" + if not yaml: + return set() + + expected_queues = set() + policies_path = Path(policies_dir) + if not policies_path.is_dir(): + logger.debug(f"Policies directory '{policies_dir}' not found, cannot find expected queues.") + return set() + + for policy_file in policies_path.glob("*.yaml"): + try: + with open(policy_file, 'r', encoding='utf-8') as f: + policy_data = yaml.safe_load(f) + + if not isinstance(policy_data, dict): + continue + + queue_policy = policy_data.get('queue_policy') + if not isinstance(queue_policy, dict): + continue + + use_prefix = queue_policy.get('use_env_prefix', True) + prefix = "" + if use_prefix and env: + prefix = f"{env}_" + + for key, value in queue_policy.items(): + if key.endswith('_queue') and isinstance(value, str): + expected_queues.add(f"{prefix}{value}") + except (IOError, yaml.YAMLError) as e: + logger.debug(f"Could not parse policy {policy_file} to find queues: {e}") + continue + return expected_queues + + class QueueManager: """Manages Redis lists (queues).""" + def _push_state_key(self, queue_name: str, file_path: str) -> str: + """Get Redis key for storing the last pushed index for a given queue and file.""" + # Use a hash of the absolute file path to create a consistent, safe key.
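+        # For illustration only (hypothetical queue name and file path): pushing
+        # "/data/urls.txt" into queue "auth_inbox" would store its resume index under a key like
+        #   ytops_client:queue_push_state:auth_inbox:<sha256 hex digest of "/data/urls.txt">
+        # so each (queue, file) pair keeps an independent resume position.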
+ abs_path = os.path.abspath(file_path) + path_hash = hashlib.sha256(abs_path.encode()).hexdigest() + return f"ytops_client:queue_push_state:{queue_name}:{path_hash}" + def __init__(self, redis_host='localhost', redis_port=6379, redis_password=None): """Initialize Redis connection.""" logger.info(f"Attempting to connect to Redis at {redis_host}:{redis_port}...") @@ -70,9 +124,28 @@ class QueueManager: """Returns the number of items in a queue.""" return self.redis.llen(queue_name) - def push_from_file(self, queue_name: str, file_path: str, wrap_key: Optional[str] = None) -> int: + def push_from_file(self, queue_name: str, file_path: str, wrap_key: Optional[str] = None, limit: Optional[int] = None, start_index: Optional[int] = None, auto_shift: bool = False) -> int: """Populates a queue from a file (text with one item per line, or JSON with an array of items).""" count = 0 + + # --- State management for file position --- + state_key = None + current_start_index = 0 # 0-based index + + if auto_shift: + state_key = self._push_state_key(queue_name, file_path) + last_index_str = self.redis.get(state_key) + if last_index_str: + current_start_index = int(last_index_str) + logger.info(f"Auto-shift enabled. Resuming from line {current_start_index + 1}.") + elif start_index is not None: + # CLI provides 1-based index, convert to 0-based. + current_start_index = max(0, start_index - 1) + logger.info(f"Starting from line {current_start_index + 1} as requested.") + # --- + + items_to_add = [] + total_items_in_file = 0 if file_path.lower().endswith('.json'): if wrap_key: @@ -85,25 +158,23 @@ class QueueManager: logger.error("JSON file must contain a list/array.") return 0 + total_items_in_file = len(data) + if current_start_index >= total_items_in_file: + logger.info(f"Start index ({current_start_index + 1}) is past the end of the file ({total_items_in_file} items). Nothing to push.") + return 0 + + items_to_process = data[current_start_index:] + if limit is not None and limit >= 0: + items_to_process = items_to_process[:limit] + # Items can be strings or objects. If objects, they should be converted to JSON strings. - items_to_add = [] - for item in data: + for item in items_to_process: if isinstance(item, str): items_to_add.append(item.strip()) else: items_to_add.append(json.dumps(item)) - items_to_add = [item for item in items_to_add if item] - - pipe = self.redis.pipeline() - for item in items_to_add: - pipe.rpush(queue_name, item) - count += 1 - if count > 0 and count % 1000 == 0: - pipe.execute() - logger.info(f"Pushed {count} items...") - pipe.execute() - + except (IOError, json.JSONDecodeError) as e: logger.error(f"Failed to read or parse JSON file '{file_path}': {e}") return 0 @@ -111,24 +182,51 @@ class QueueManager: logger.info("Reading items from text file (one per line).") try: with open(file_path, 'r', encoding='utf-8') as f: - pipe = self.redis.pipeline() - for line in f: - item = line.strip() - if item: - if wrap_key: - payload = json.dumps({wrap_key: item}) - else: - payload = item - pipe.rpush(queue_name, payload) - count += 1 - if count > 0 and count % 1000 == 0: - pipe.execute() - logger.info(f"Pushed {count} items...") - pipe.execute() + all_lines = f.readlines() + + total_items_in_file = len(all_lines) + if current_start_index >= total_items_in_file: + logger.info(f"Start index ({current_start_index + 1}) is past the end of the file ({total_items_in_file} lines). 
Nothing to push.") + return 0 + + lines_to_process = all_lines[current_start_index:] + if limit is not None and limit >= 0: + lines_to_process = lines_to_process[:limit] + + for line in lines_to_process: + item = line.strip() + if item: + if wrap_key: + payload = json.dumps({wrap_key: item}) + else: + payload = item + items_to_add.append(payload) + except IOError as e: logger.error(f"Failed to read file '{file_path}': {e}") return 0 + if items_to_add: + pipe = self.redis.pipeline() + for item in items_to_add: + pipe.rpush(queue_name, item) + count += 1 + if count > 0 and count % 1000 == 0: + pipe.execute() + logger.info(f"Pushed {count} of {len(items_to_add)} items...") + pipe.execute() + + if auto_shift and state_key: + new_index = current_start_index + count + # Don't save a new index if we've reached the end of the file. + # This allows re-running the command to start from the beginning again. + if new_index >= total_items_in_file: + self.redis.delete(state_key) + logger.info(f"Auto-shift: Reached end of file. Cleared saved position for '{os.path.basename(file_path)}'. Next run will start from the beginning.") + else: + self.redis.set(state_key, new_index) + logger.info(f"Auto-shift: Saved next start position for '{os.path.basename(file_path)}' as line {new_index + 1}.") + logger.info(f"Finished. Pushed a total of {count} items to '{queue_name}'.") return count @@ -230,7 +328,11 @@ def add_queue_manager_parser(subparsers): # Push command push_parser = subparsers.add_parser('push', help='Push items to a queue from a file, a generator, or a static payload.', parents=[common_parser]) push_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '_stress_inbox'.") - push_parser.add_argument('--count', type=int, default=1, help='Number of items to push (for --payload-json or --generate-payload-prefix).') + push_parser.add_argument('--count', type=int, default=None, help='Number of items to push. For --from-file, limits the number of lines pushed. For other sources, specifies how many items to generate/push (defaults to 1).') + + shift_group = push_parser.add_mutually_exclusive_group() + shift_group.add_argument('--start', type=int, help='For --from-file, start pushing from this line number (1-based).') + shift_group.add_argument('--auto-shift', action='store_true', help="For --from-file, automatically resume from where the last push left off. 
State is stored in Redis.") source_group = push_parser.add_mutually_exclusive_group(required=True) source_group.add_argument('--from-file', dest='file_path', help='Path to a file containing items to add (one per line, or a JSON array).') @@ -285,10 +387,26 @@ def main_queue_manager(args): print(f"INFO: No queue name specified, defaulting to '{default_queue_name}' based on --env='{args.env}'.", file=sys.stderr) if args.queue_command == 'list': - queues = manager.list_queues(args.pattern) + queues_from_redis = manager.list_queues(args.pattern) + + # Discover queues from policy files + expected_queues_from_policies = _find_configured_queues(env=args.env) + + # Merge Redis results with policy-defined queues + all_queues_map = {q['name']: q for q in queues_from_redis} + + for q_name in expected_queues_from_policies: + if q_name not in all_queues_map: + # Only add if it matches the pattern filter + if fnmatch.fnmatch(q_name, args.pattern): + all_queues_map[q_name] = {'name': q_name, 'size': 0} + + queues = sorted(list(all_queues_map.values()), key=lambda x: x['name']) + if not queues: print(f"No queues found matching pattern '{args.pattern}'.") return 0 + if tabulate: print(tabulate(queues, headers='keys', tablefmt='grid')) else: @@ -309,16 +427,23 @@ def main_queue_manager(args): if not os.path.exists(args.file_path): print(f"Error: File not found at '{args.file_path}'", file=sys.stderr) return 1 - if args.count > 1: - logger.warning("--count is ignored when using --from-file.") - manager.push_from_file(args.queue_name, args.file_path, args.wrap_file_line_in_json) + manager.push_from_file( + args.queue_name, + args.file_path, + args.wrap_file_line_in_json, + limit=args.count, + start_index=args.start, + auto_shift=args.auto_shift + ) elif args.payload_json: - manager.push_static(args.queue_name, args.payload_json, args.count) + count = args.count if args.count is not None else 1 + manager.push_static(args.queue_name, args.payload_json, count) elif args.generate_payload_prefix: - if args.count <= 0: + count = args.count if args.count is not None else 1 + if count <= 0: print("Error: --count must be 1 or greater for --generate-payload-prefix.", file=sys.stderr) return 1 - manager.push_generated(args.queue_name, args.generate_payload_prefix, args.count) + manager.push_generated(args.queue_name, args.generate_payload_prefix, count) return 0 elif args.queue_command == 'clear': diff --git a/ytops_client-source/ytops_client/stress_policy/arg_parser.py b/ytops_client-source/ytops_client/stress_policy/arg_parser.py index 6fa5310..d8ed974 100644 --- a/ytops_client-source/ytops_client/stress_policy/arg_parser.py +++ b/ytops_client-source/ytops_client/stress_policy/arg_parser.py @@ -167,6 +167,7 @@ Overridable Policy Parameters via --set: parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.') parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.') parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)") + parser.add_argument('--workers', type=int, help='Shortcut to override the total number of workers, capping any discovery logic.') parser.add_argument('--profile-prefix', '--user-prefix', dest='profile_prefix', help="Shortcut to override the profile prefix for profile locking mode. 
Affects both auth and download stages. Can be a comma-separated list.") parser.add_argument('--start-from-url-index', type=int, help='Start processing from this line number (1-based) in the urls_file. Overrides saved state.') parser.add_argument('--expire-time-shift-minutes', type=int, help="Consider URLs expiring in N minutes as expired. Overrides policy.") diff --git a/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py b/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py index a4fd847..f89b861 100644 --- a/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py +++ b/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py @@ -952,23 +952,41 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr failure_rate = dummy_settings.get('download_failure_rate', 0.0) skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) - # In dummy mode, prioritize the format from the task file, then from the policy. - format_selection = info_data.get('_ytops_download_format') - source_of_format = "task file" - if not format_selection: - format_selection = d_policy.get('formats', '') - source_of_format = "policy (download_policy.formats)" - - if not format_selection: - ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {}) - format_selection = ytdlp_config_overrides.get('format', '') - source_of_format = "policy (ytdlp_config_overrides.format)" + # In dummy mode, prioritize the format from the task metadata. + formats_to_test = None + source_of_format = "unknown" - if not format_selection: + # Prioritize format from task metadata, which supports per-format and per-url tasks. + metadata = info_data.get('_ytops_metadata', {}) + formats_requested = metadata.get('formats_requested') + if formats_requested is not None: + formats_to_test = formats_requested + source_of_format = "task file metadata (_ytops_metadata.formats_requested)" + + if formats_to_test is None: + # Fallback for older task formats or different workflows + format_selection_str = info_data.get('_ytops_download_format') + if format_selection_str: + source_of_format = "task file (_ytops_download_format)" + formats_to_test = [f.strip() for f in format_selection_str.split(',') if f.strip()] + + if formats_to_test is None: + format_selection_str = d_policy.get('formats', '') + if format_selection_str: + source_of_format = "policy (download_policy.formats)" + formats_to_test = [f.strip() for f in format_selection_str.split(',') if f.strip()] + + if formats_to_test is None: + ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {}) + format_selection_str = ytdlp_config_overrides.get('format', '') + if format_selection_str: + source_of_format = "policy (ytdlp_config_overrides.format)" + formats_to_test = [f.strip() for f in format_selection_str.split(',') if f.strip()] + + if formats_to_test is None: logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. 
Simulating a single download.") formats_to_test = ['dummy_format'] else: - formats_to_test = [f.strip() for f in format_selection.split(',') if f.strip()] logger.info(f"[Worker {worker_id}] DUMMY: Simulating downloads for formats (from {source_of_format}): {', '.join(formats_to_test)}") for format_id in formats_to_test: @@ -1022,14 +1040,47 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========") - # In dummy mode, we just rename the file to processed and continue to the finally block. + # --- Airflow Directory Logic (Dummy Mode) --- + success = downloads_processed_in_task > 0 + if success and d_policy.get('output_to_airflow_ready_dir'): + try: + video_id = info_data.get('id') + if not video_id: + logger.error(f"[{profile_name}] DUMMY: Could not find video ID in '{claimed_task_path_host.name}' for moving files.") + else: + # --- Prepare destination directory --- + now = datetime.now() + rounded_minute = (now.minute // 10) * 10 + timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}" + + base_path = d_policy.get('airflow_ready_dir_base_path', 'downloadfiles/videos/ready') + if not os.path.isabs(base_path): + base_path = os.path.join(sp_utils._PROJECT_ROOT, base_path) + final_dir_base = os.path.join(base_path, timestamp_str) + final_dir_path = os.path.join(final_dir_base, video_id) + os.makedirs(final_dir_path, exist_ok=True) + + # --- Copy info.json --- + new_info_json_name = f"info_{video_id}.json" + dest_info_json_path = os.path.join(final_dir_path, new_info_json_name) + if not os.path.exists(dest_info_json_path): + shutil.copy(str(claimed_task_path_host), dest_info_json_path) + logger.info(f"[{profile_name}] DUMMY: Copied info.json to {dest_info_json_path}") + except Exception as e: + logger.error(f"[{profile_name}] DUMMY: Failed during post-download processing for Airflow: {e}", exc_info=True) + + # In dummy mode, we handle file cleanup and continue to the finally block. try: - base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] - processed_path = Path(f"{base_path_str}.processed") - claimed_task_path_host.rename(processed_path) - logger.debug(f"DUMMY MODE: Renamed processed task file to '{processed_path.name}'.") + if d_policy.get('remove_source_info_json'): + claimed_task_path_host.unlink() + logger.debug(f"DUMMY MODE: Removed processed task file '{claimed_task_path_host.name}'.") + else: + base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] + processed_path = Path(f"{base_path_str}.processed") + claimed_task_path_host.rename(processed_path) + logger.debug(f"DUMMY MODE: Renamed processed task file to '{processed_path.name}'.") except (OSError, IndexError) as e: - logger.error(f"DUMMY MODE: Failed to rename processed task file '{claimed_task_path_host}': {e}") + logger.error(f"DUMMY MODE: Failed to clean up processed task file '{claimed_task_path_host}': {e}") continue # Skip to finally block @@ -1329,15 +1380,19 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr # 6. 
Clean up task file if not queue_policy: - # File-based mode: rename to .processed + # File-based mode: rename to .processed or remove try: - # The claimed_task_path_host has a .LOCKED suffix, remove it before adding .processed - base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] - processed_path = Path(f"{base_path_str}.processed") - claimed_task_path_host.rename(processed_path) - logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Renamed processed task file to '{processed_path.name}'.") + if success and d_policy.get('remove_source_info_json'): + claimed_task_path_host.unlink() + logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Removed processed task file.") + else: + # The claimed_task_path_host has a .LOCKED suffix, remove it before adding .processed + base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] + processed_path = Path(f"{base_path_str}.processed") + claimed_task_path_host.rename(processed_path) + logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Renamed processed task file to '{processed_path.name}'.") except (OSError, IndexError) as e: - logger.error(f"Failed to rename processed task file '{claimed_task_path_host}': {e}") + logger.error(f"Failed to clean up processed task file '{claimed_task_path_host}': {e}") elif d_policy.get('rename_source_info_json_on_success'): # Queue-based mode: respect rename policy source_path_to_rename = task.get('info_json_path') @@ -1372,11 +1427,25 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr # but the task is being finalized, we must assume all potential downloads for this task # are "processed" to prevent the auth profile from getting stuck. if downloads_processed_in_task == 0: - logger.warning(f"[Worker {worker_id}] No downloads were counted for this task. Using policy to determine decrement count to avoid stuck profile.") - ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {}) - formats_str = ytdlp_config_overrides.get('format', d_policy.get('formats', '')) - num_formats = formats_str.count(',') + 1 if formats_str else 1 - downloads_processed_in_task = num_formats + logger.warning(f"[Worker {worker_id}] No downloads were counted for this task. Using task metadata to determine decrement count to avoid stuck profile.") + + decrement_count = 1 # Default to 1 to be safe + metadata = info_data.get('_ytops_metadata', {}) + granularity = metadata.get('download_task_granularity') + formats_requested = metadata.get('formats_requested') # Can be None + + if granularity == 'per_format': + # Each task file represents one format group, so decrement by 1. + decrement_count = 1 + elif granularity == 'per_url': + # The task file represents all formats for a URL. + decrement_count = len(formats_requested) if formats_requested else 1 + else: + # No granularity info, this may be an older task file. + # Assume it's a single download task. 
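+                        # Illustration (hypothetical values): a task whose metadata carried
+                        # download_task_granularity='per_url' with formats_requested=['137', '140']
+                        # decrements by 2 in the branch above, while per_format tasks and tasks
+                        # without metadata (this branch) decrement by 1.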
+ decrement_count = 1 + + downloads_processed_in_task = decrement_count logger.warning(f"[Worker {worker_id}] Decrementing by fallback count: {downloads_processed_in_task}") if downloads_processed_in_task > 0: diff --git a/ytops_client-source/ytops_client/stress_policy/queue_workers.py b/ytops_client-source/ytops_client/stress_policy/queue_workers.py index 8fd40de..ceb892a 100644 --- a/ytops_client-source/ytops_client/stress_policy/queue_workers.py +++ b/ytops_client-source/ytops_client/stress_policy/queue_workers.py @@ -30,6 +30,180 @@ from .queue_provider import RedisQueueProvider logger = logging.getLogger(__name__) +def _process_single_auth_result(success, info_data, stderr, retcode, task, task_id, profile_name, proxy_url, worker_id, policy, state_manager, args, profile_manager_instance, info_json_path=None): + """ + Helper to process the result of a single authentication attempt, whether it was + part of a batch or a single run. + """ + settings = policy.get('settings', {}) + queue_policy = policy.get('queue_policy', {}) + gen_policy = policy.get('info_json_generation_policy', {}) + task_granularity = gen_policy.get('download_task_granularity', 'per_format') + save_dir = settings.get('save_info_json_dir') + url = task.get('url') + + if success and info_data: + try: + auth_env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') + is_dummy_run = args.dummy or args.dummy_batch + + # --- 1. Determine which formats will be requested --- + formats_to_download = queue_policy.get('formats_to_download') + requested_formats_list = [] + if formats_to_download: + if task_granularity == 'per_url': + format_selector = formats_to_download + if isinstance(formats_to_download, list): + format_selector = ",".join(formats_to_download) + requested_formats_list = [format_selector] + else: # per_format + formats_source = formats_to_download + if formats_source == 'from_download_policy': + formats_source = policy.get('download_policy', {}).get('formats') + + if formats_source == 'all': + requested_formats_list = [f['format_id'] for f in info_data.get('formats', [])] + elif isinstance(formats_source, list): + requested_formats_list = formats_source + elif isinstance(formats_source, str): + # A comma-separated string of format groups + requested_formats_list = [f.strip() for f in formats_source.split(',')] + elif formats_source: + # Handles integer format IDs or other non-string/list values + requested_formats_list = [str(formats_source)] + else: + # formats_source is None or empty + requested_formats_list = [] + + # --- 2. 
Create download tasks and save info.json(s) based on granularity --- + download_tasks_to_create = [] + created_file_paths = [] + + if task_granularity == 'per_format' and requested_formats_list: + for i, format_id in enumerate(requested_formats_list): + # Deepcopy to avoid modifying the original info_data in the loop + task_info_data = deepcopy(info_data) + + metadata = { + 'profile_name': profile_name, 'proxy_url': proxy_url, + 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), + 'task_id': task_id, 'url': url, 'auth_env': auth_env_name, + 'formats_requested': [format_id], # This task is for ONE format + 'download_task_granularity': task_granularity + } + if is_dummy_run: metadata['_dummy'] = True + task_info_data['_ytops_metadata'] = metadata + + final_path = None + if save_dir: + video_id = task_info_data.get('id') or task_id or 'unknown_video_id' + sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy' + # Sanitize format_id for use in filename + sanitized_format = re.sub(r'[^a-zA-Z0-9_-]', '_', format_id) + # Add index `i` to guarantee uniqueness even if sanitized format IDs clash + new_name = f"{video_id}-{profile_name}-{sanitized_proxy}-fmt{i}_{sanitized_format}.info.json" + final_path = os.path.join(save_dir, new_name) + with open(final_path, 'w', encoding='utf-8') as f: + json.dump(task_info_data, f, indent=2) + logger.info(f"[Worker {worker_id}] [{profile_name}] Saved format-specific info.json to '{final_path}'") + created_file_paths.append(final_path) + + download_tasks_to_create.append({ + 'info_json_path': final_path, 'format_id': format_id, + 'video_id': task_info_data.get('id'), 'url': url, + 'auth_profile_name': profile_name, 'proxy_url': proxy_url, + 'auth_env': auth_env_name, + 'original_task': task + }) + else: # per_url or default (including per_format with no formats) + metadata = { + 'profile_name': profile_name, 'proxy_url': proxy_url, + 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), + 'task_id': task_id, 'url': url, 'auth_env': auth_env_name, + 'formats_requested': requested_formats_list, + 'download_task_granularity': task_granularity + } + if is_dummy_run: metadata['_dummy'] = True + info_data['_ytops_metadata'] = metadata + + final_path = None + if save_dir: + video_id = info_data.get('id') or task_id or 'unknown_video_id' + sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy' + new_name = f"{video_id}-{profile_name}-{sanitized_proxy}.info.json" + final_path = os.path.join(save_dir, new_name) + with open(final_path, 'w', encoding='utf-8') as f: + json.dump(info_data, f, indent=2) + logger.info(f"[Worker {worker_id}] [{profile_name}] Saved info.json to '{final_path}'") + created_file_paths.append(final_path) + + # For per_url, requested_formats_list should contain a single format selector string. + # The loop will run once, creating one task. + if requested_formats_list: + for format_selector in requested_formats_list: + download_tasks_to_create.append({ + 'info_json_path': final_path, 'format_id': format_selector, + 'video_id': info_data.get('id'), 'url': url, + 'auth_profile_name': profile_name, 'proxy_url': proxy_url, + 'auth_env': auth_env_name, + 'original_task': task + }) + else: # Handle the case where requested_formats_list is empty + logger.debug(f"[Worker {worker_id}] [{profile_name}] No formats requested, no download tasks created.") + + + profile_manager_instance.record_activity(profile_name, 'success', is_dummy=(args.dummy or args.dummy_batch)) + + # --- 3. 
Dispatch download tasks --- + create_download_tasks = queue_policy.get('create_download_tasks', True) + if download_tasks_to_create: + num_tasks_to_process = 0 + if create_download_tasks: + added_count = state_manager.add_download_tasks_batch(download_tasks_to_create) + logger.info(f"[Worker {worker_id}] [{profile_name}] Added {added_count} download tasks to queue") + num_tasks_to_process = added_count + else: + num_tasks = len(download_tasks_to_create) + logger.info(f"[Worker {worker_id}] [{profile_name}] File-based workflow: created {num_tasks} tasks (no queue tasks created)") + num_tasks_to_process = num_tasks + + if num_tasks_to_process > 0: + profile_manager_instance.increment_pending_downloads(profile_name, count=num_tasks_to_process) + profile_manager_instance.record_predicted_downloads(profile_name, count=num_tasks_to_process, is_dummy=(args.dummy or args.dummy_batch)) + + report_payload = { + "url": url, "video_id": info_data.get('id'), "profile_name": profile_name, + "proxy_url": proxy_url, "info_json_path": created_file_paths[0] if len(created_file_paths) == 1 else created_file_paths, + "download_tasks_created": len(download_tasks_to_create) + } + if is_dummy_run: + report_payload['_dummy'] = True + state_manager.report_auth_success(task_id, report_payload) + except Exception as e: + logger.error(f"[Worker {worker_id}] [{profile_name}] Error processing successful auth result: {e}", exc_info=True) + profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch)) + state_manager.report_auth_failure(task_id, {"error": f"Error processing info.json: {str(e)}", "url": url}) + else: + is_bot_error = "Sign in to confirm you're not a bot" in stderr + is_timeout_error = "Read timed out" in stderr + is_proxy_error = "ProxyError" in stderr or "Unable to connect to proxy" in stderr + is_unavailable = "This video is unavailable" in stderr or "Video unavailable" in stderr + is_private = "This video is private" in stderr or "Private video" in stderr + is_deleted = "This video has been removed" in stderr + is_dummy_skipped = "Dummy skipped failure" in stderr + + if is_unavailable or is_private or is_deleted or is_dummy_skipped or is_proxy_error: + reason = "Video unavailable" if is_unavailable else "Private video" if is_private else "Video removed" if is_deleted else "Proxy error" if is_proxy_error else "Dummy skipped" + logger.warning(f"[Worker {worker_id}] [{profile_name}] Auth skipped for {url}: {reason}") + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch)) + state_manager.report_auth_skipped(task_id, {"url": url, "reason": reason, "stderr": stderr}) + else: + error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else "Dummy fatal failure" if "Dummy fatal failure" in stderr else f"Exit code {retcode}" + logger.error(f"[Worker {worker_id}] [{profile_name}] Authentication failed ({error_type}): {url}") + profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch)) + state_manager.report_auth_failure(task_id, {"url": url, "error_type": error_type, "stderr": stderr, "exit_code": retcode}) + + def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock): """Worker function for processing authentication tasks from a queue in batches.""" owner_id = f"queue-auth-worker-{worker_id}" @@ -37,6 +211,7 @@ def run_queue_auth_worker(worker_id, policy, 
state_manager, args, profile_manage exec_control = policy.get('execution_control', {}) gen_policy = policy.get('info_json_generation_policy', {}) queue_policy = policy.get('queue_policy', {}) + docker_policy = policy.get('direct_docker_cli_policy', {}) task_granularity = gen_policy.get('download_task_granularity', 'per_format') profile_prefix = gen_policy.get('profile_prefix') @@ -64,8 +239,8 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage try: # 1. Get a batch of tasks from the queue FIRST - tasks = state_manager.get_auth_tasks_batch(batch_size) - if not tasks: + raw_tasks = state_manager.get_auth_tasks_batch(batch_size) + if not raw_tasks: polling_interval = exec_control.get('worker_polling_interval_seconds', 1) base_log_msg = f"[Worker {worker_id}] No tasks available in queue, polling." if base_log_msg == last_no_task_log_msg: @@ -78,6 +253,21 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage time.sleep(polling_interval) continue + # Normalize tasks to handle both raw URL strings and JSON objects + tasks = [] + for raw_task in raw_tasks: + if isinstance(raw_task, str): + tasks.append({'url': raw_task}) + elif isinstance(raw_task, dict) and raw_task.get('url'): + tasks.append(raw_task) + else: + logger.warning(f"[Worker {worker_id}] Skipping invalid task of type {type(raw_task)}: {raw_task}") + + if not tasks: + logger.warning(f"[Worker {worker_id}] All tasks in batch were invalid. Waiting for next batch.") + time.sleep(1) + continue + if last_no_task_log_msg: print(file=sys.stderr) last_no_task_log_msg = "" @@ -110,282 +300,214 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' to process a batch of {len(tasks)} tasks.") - # 3. Process each task in the batch - for task in tasks: - if state_manager.shutdown_event.is_set(): - logger.info(f"[Worker {worker_id}] Shutdown requested, stopping batch processing for profile '{profile_name}'.") - break + # 3. Process the entire batch + if args.dummy or args.dummy_batch: + # Dummy mode is not optimized for batching, so we process tasks individually. + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY MODE: Processing {len(tasks)} tasks individually.") + for task in tasks: + if state_manager.shutdown_event.is_set(): + break - temp_task_dir = None - task_id = None - url = None - try: - temp_task_dir = tempfile.mkdtemp(prefix=f"queue-auth-{worker_id}-") - task_id = task.get('id') or task.get('task_id') - if not task_id: - task_id = f"task_{worker_id}_{task_counter}" - task_counter += 1 - task['task_id'] = task_id - + task_id = task.get('id') or task.get('task_id') or f"task_{worker_id}_{task_counter}" url = task.get('url') - if not url: - logger.error(f"[Worker {worker_id}] Task {task_id} has no URL. 
Skipping.") - state_manager.report_auth_skipped(task_id, {"error": "No URL in task", "task": task}) - continue - logger.info(f"[Worker {worker_id}] [{profile_name}] Processing task {task_id}: {url}") - state_manager.mark_auth_in_progress(task_id, owner_id) - - # --- Main processing logic for a single task --- + # Dummy simulation logic copied from single-task implementation + logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY MODE: Simulating auth for {url}") + dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) + min_seconds = dummy_settings.get('auth_min_seconds', 0.1) + max_seconds = dummy_settings.get('auth_max_seconds', 0.5) + failure_rate = args.dummy_auth_failure_rate or dummy_settings.get('auth_failure_rate', 0.0) + skipped_rate = args.dummy_auth_skipped_failure_rate or dummy_settings.get('auth_skipped_failure_rate', 0.0) + time.sleep(random.uniform(min_seconds, max_seconds)) + success, info_data, stderr, retcode = False, None, "", 0 - if args.dummy or args.dummy_batch: - logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY MODE: Simulating auth for {url}") - dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) - min_seconds = dummy_settings.get('auth_min_seconds', 0.1) - max_seconds = dummy_settings.get('auth_max_seconds', 0.5) - failure_rate = args.dummy_auth_failure_rate or dummy_settings.get('auth_failure_rate', 0.0) - skipped_rate = args.dummy_auth_skipped_failure_rate or dummy_settings.get('auth_skipped_failure_rate', 0.0) - time.sleep(random.uniform(min_seconds, max_seconds)) + rand_val = random.random() + if rand_val < skipped_rate: + stderr = "Dummy skipped failure" + elif rand_val < (skipped_rate + failure_rate): + stderr = "Dummy fatal failure" + else: + success = True + video_id = sp_utils.get_video_id(url) or task_id + info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]} - rand_val = random.random() - if rand_val < skipped_rate: - stderr = "Dummy skipped failure" - elif rand_val < (skipped_rate + failure_rate): - stderr = "Dummy fatal failure" - else: - success = True - # In dummy mode, robustly extract ID from URL to avoid 'unknown_video_id' + _process_single_auth_result( + success, info_data, stderr, retcode, task, task_id, profile_name, proxy_url, + worker_id, policy, state_manager, args, profile_manager_instance + ) + else: + # BATCH MODE: Process all tasks in a single Docker run + batch_temp_dir = None + try: + # Create a temporary directory for this batch's files + host_mount_path = docker_policy.get('docker_host_mount_path') + temp_parent_dir = host_mount_path if host_mount_path and os.path.isdir(host_mount_path) else None + batch_temp_dir = tempfile.mkdtemp(prefix=f"queue-auth-batch-{worker_id}-", dir=temp_parent_dir) + task_dir_name = os.path.basename(batch_temp_dir) + task_dir_container = os.path.join(docker_policy.get('docker_container_mount_path'), task_dir_name) + + # Map URLs to their original tasks for post-processing + url_to_task_map = {task.get('url'): task for task in tasks if task.get('url')} + + # Create a batch file with all URLs + batch_file_host_path = os.path.join(batch_temp_dir, 'batch_urls.txt') + with open(batch_file_host_path, 'w', encoding='utf-8') as f: + for url in url_to_task_map.keys(): + f.write(f"{url}\n") + + # Mark all tasks in the batch as in-progress + for task in tasks: + task_id = task.get('id') or task.get('task_id') + if not task_id: + # Generate a temporary, worker-unique ID 
if none exists. + task_id = f"task_{worker_id}_{int(time.time()*1000)}_{task_counter}" + task['task_id'] = task_id # Store it back for later stages + task_counter += 1 + logger.warning(f"[Worker {worker_id}] Task missing 'id' or 'task_id'. Generated a temporary one: {task_id}. Task content: {task}") + state_manager.mark_auth_in_progress(task_id, owner_id) + + # Docker batch processing logic + image_name = docker_policy.get('docker_image_name') + if not image_name: + raise ValueError("'direct_docker_cli_policy.docker_image_name' is not defined.") + + network_name = docker_policy.get('docker_network_name') + container_mount_path = docker_policy.get('docker_container_mount_path') + host_cache_path = docker_policy.get('docker_host_cache_path') + container_cache_path = docker_policy.get('docker_container_cache_path') + + volumes = {} + if host_mount_path and container_mount_path: + volumes[str(Path(host_mount_path).resolve())] = {'bind': container_mount_path, 'mode': 'rw'} + if host_cache_path and container_cache_path: + Path(host_cache_path).mkdir(parents=True, exist_ok=True) + volumes[str(Path(host_cache_path).resolve())] = {'bind': container_cache_path, 'mode': 'rw'} + + environment = {} + if host_cache_path and container_cache_path: + environment['XDG_CACHE_HOME'] = container_cache_path + + # --- Build config file content --- + base_config_content = "" + base_config_file = docker_policy.get('ytdlp_config_file') + if base_config_file: + config_path_to_read = Path(base_config_file) + if not config_path_to_read.exists(): + config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file + + if config_path_to_read.exists(): try: - # First, try the robust library function - video_id = sp_utils.get_video_id(url) - if not video_id and url: - # Fallback parsing for dummy URLs like "...?v=dummy_0099" - parsed_url = urlparse(url) - video_id = parse_qs(parsed_url.query).get('v', [None])[0] - - # Additional fallback for URLs like "youtube.com/watch?v=dummy_0028" - if not video_id and 'dummy_' in url: - dummy_match = re.search(r'dummy_\d+', url) - if dummy_match: - video_id = dummy_match.group(0) - except Exception: - video_id = None # Ensure video_id is None on parsing failure - - # If all extraction methods fail, use the task_id as a reliable fallback. 
- if not video_id: - logger.debug(f"Could not extract video ID from URL '{url}', using task ID '{task_id}' for dummy data.") - video_id = task_id - - info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]} - else: - client, req_params = state_manager.get_client_for_request(profile_name, gen_policy) - cmd = [ - sys.executable, '-m', 'ytops_client.cli', 'get-info', - '--client', client or '', '--profile', profile_name - ] - if proxy_url: - cmd.extend(['--proxy', proxy_url]) - if req_params: - cmd.extend(['--request-params', json.dumps(req_params)]) - extra_args = gen_policy.get('extra_args') - if extra_args: - cmd.extend(shlex.split(extra_args)) - - # The URL must be the last positional argument - cmd.append(url) - - logger.info(f"[Worker {worker_id}] Running command: {' '.join(shlex.quote(s) for s in cmd)}") - retcode, stdout, stderr = run_command( - cmd, running_processes, process_lock, stream_output=args.verbose, - stream_prefix=f"[Worker {worker_id} | get-info] " - ) - success = (retcode == 0) - if success: - info_json_path = next((line.strip() for line in stdout.strip().split('\n') if line.endswith('.json') and os.path.exists(line.strip())), None) - if info_json_path: - try: - with open(info_json_path, 'r', encoding='utf-8') as f: - info_data = json.load(f) - except (IOError, json.JSONDecodeError) as e: - logger.error(f"[Worker {worker_id}] Failed to read/parse info.json from get-info: {e}") - success = False - stderr += f"\nFailed to read/parse info.json: {e}" - else: - logger.error(f"[Worker {worker_id}] Command succeeded but no info.json path found in output.") - success = False - stderr += "\nNo info.json path in output" - - # --- Result processing for a single task --- - if success and info_data: - try: - auth_env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') - is_dummy_run = args.dummy or args.dummy_batch - - # --- 1. Determine which formats will be requested --- - formats_to_download = queue_policy.get('formats_to_download') - requested_formats_list = [] - if formats_to_download: - if task_granularity == 'per_url': - format_selector = formats_to_download - if isinstance(formats_to_download, list): - format_selector = ",".join(formats_to_download) - requested_formats_list = [format_selector] - else: # per_format - formats_source = formats_to_download - if formats_source == 'from_download_policy': - formats_source = policy.get('download_policy', {}).get('formats') - - if formats_source == 'all': - requested_formats_list = [f['format_id'] for f in info_data.get('formats', [])] - elif isinstance(formats_source, list): - requested_formats_list = formats_source - elif isinstance(formats_source, str): - # A comma-separated string of format groups - requested_formats_list = [f.strip() for f in formats_source.split(',')] - elif formats_source: - # Handles integer format IDs or other non-string/list values - requested_formats_list = [str(formats_source)] - else: - # formats_source is None or empty - requested_formats_list = [] - - # --- 2. 
Create download tasks and save info.json(s) based on granularity --- - download_tasks_to_create = [] - created_file_paths = [] - - if task_granularity == 'per_format' and requested_formats_list: - for i, format_id in enumerate(requested_formats_list): - # Deepcopy to avoid modifying the original info_data in the loop - task_info_data = deepcopy(info_data) - - metadata = { - 'profile_name': profile_name, 'proxy_url': proxy_url, - 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), - 'task_id': task_id, 'url': url, 'auth_env': auth_env_name, - 'formats_requested': [format_id], # This task is for ONE format - 'download_task_granularity': task_granularity - } - if is_dummy_run: metadata['_dummy'] = True - task_info_data['_ytops_metadata'] = metadata - - final_path = None - if save_dir: - video_id = task_info_data.get('id') or task_id or 'unknown_video_id' - sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy' - # Sanitize format_id for use in filename - sanitized_format = re.sub(r'[^a-zA-Z0-9_-]', '_', format_id) - # Add index `i` to guarantee uniqueness even if sanitized format IDs clash - new_name = f"{video_id}-{profile_name}-{sanitized_proxy}-fmt{i}_{sanitized_format}.info.json" - final_path = os.path.join(save_dir, new_name) - with open(final_path, 'w', encoding='utf-8') as f: - json.dump(task_info_data, f, indent=2) - logger.info(f"[Worker {worker_id}] [{profile_name}] Saved format-specific info.json to '{final_path}'") - created_file_paths.append(final_path) - - download_tasks_to_create.append({ - 'info_json_path': final_path, 'format_id': format_id, - 'video_id': task_info_data.get('id'), 'url': url, - 'auth_profile_name': profile_name, 'proxy_url': proxy_url, - 'auth_env': auth_env_name, - 'original_task': task - }) - else: # per_url or default (including per_format with no formats) - metadata = { - 'profile_name': profile_name, 'proxy_url': proxy_url, - 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), - 'task_id': task_id, 'url': url, 'auth_env': auth_env_name, - 'formats_requested': requested_formats_list, - 'download_task_granularity': task_granularity - } - if is_dummy_run: metadata['_dummy'] = True - info_data['_ytops_metadata'] = metadata - - final_path = None - if save_dir: - video_id = info_data.get('id') or task_id or 'unknown_video_id' - sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy' - new_name = f"{video_id}-{profile_name}-{sanitized_proxy}.info.json" - final_path = os.path.join(save_dir, new_name) - with open(final_path, 'w', encoding='utf-8') as f: - json.dump(info_data, f, indent=2) - logger.info(f"[Worker {worker_id}] [{profile_name}] Saved info.json to '{final_path}'") - created_file_paths.append(final_path) - - # For per_url, requested_formats_list should contain a single format selector string. - # The loop will run once, creating one task. - if requested_formats_list: - for format_selector in requested_formats_list: - download_tasks_to_create.append({ - 'info_json_path': final_path, 'format_id': format_selector, - 'video_id': info_data.get('id'), 'url': url, - 'auth_profile_name': profile_name, 'proxy_url': proxy_url, - 'auth_env': auth_env_name, - 'original_task': task - }) - else: # Handle the case where requested_formats_list is empty - logger.debug(f"[Worker {worker_id}] [{profile_name}] No formats requested, no download tasks created.") - - - profile_manager_instance.record_activity(profile_name, 'success', is_dummy=(args.dummy or args.dummy_batch)) - - # --- 3. 
Dispatch download tasks --- - create_download_tasks = queue_policy.get('create_download_tasks', True) - if download_tasks_to_create: - num_tasks_to_process = 0 - if create_download_tasks: - added_count = state_manager.add_download_tasks_batch(download_tasks_to_create) - logger.info(f"[Worker {worker_id}] [{profile_name}] Added {added_count} download tasks to queue") - num_tasks_to_process = added_count - else: - num_tasks = len(download_tasks_to_create) - logger.info(f"[Worker {worker_id}] [{profile_name}] File-based workflow: created {num_tasks} tasks (no queue tasks created)") - num_tasks_to_process = num_tasks - - if num_tasks_to_process > 0: - profile_manager_instance.increment_pending_downloads(profile_name, count=num_tasks_to_process) - profile_manager_instance.record_predicted_downloads(profile_name, count=num_tasks_to_process, is_dummy=(args.dummy or args.dummy_batch)) - - report_payload = { - "url": url, "video_id": info_data.get('id'), "profile_name": profile_name, - "proxy_url": proxy_url, "info_json_path": created_file_paths[0] if len(created_file_paths) == 1 else created_file_paths, - "download_tasks_created": len(download_tasks_to_create) - } - if is_dummy_run: - report_payload['_dummy'] = True - state_manager.report_auth_success(task_id, report_payload) - except Exception as e: - logger.error(f"[Worker {worker_id}] [{profile_name}] Error processing successful auth result: {e}", exc_info=True) - profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch)) - state_manager.report_auth_failure(task_id, {"error": f"Error processing info.json: {str(e)}", "url": url}) - else: - is_bot_error = "Sign in to confirm you're not a bot" in stderr - is_timeout_error = "Read timed out" in stderr - is_unavailable = "This video is unavailable" in stderr or "Video unavailable" in stderr - is_private = "This video is private" in stderr or "Private video" in stderr - is_deleted = "This video has been removed" in stderr - is_dummy_skipped = "Dummy skipped failure" in stderr - - if is_unavailable or is_private or is_deleted or is_dummy_skipped: - reason = "Video unavailable" if is_unavailable else "Private video" if is_private else "Video removed" if is_deleted else "Dummy skipped" - logger.warning(f"[Worker {worker_id}] [{profile_name}] Auth skipped for {url}: {reason}") - profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch)) - state_manager.report_auth_skipped(task_id, {"url": url, "reason": reason, "stderr": stderr}) + with open(config_path_to_read, 'r', encoding='utf-8') as f: + base_config_content = f.read() + except IOError as e: + logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}") else: - error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else "Dummy fatal failure" if "Dummy fatal failure" in stderr else f"Exit code {retcode}" - logger.error(f"[Worker {worker_id}] [{profile_name}] Authentication failed ({error_type}): {url}") - profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch)) - state_manager.report_auth_failure(task_id, {"url": url, "error_type": error_type, "stderr": stderr, "exit_code": retcode}) + logger.error(f"[Worker {worker_id}] Could not find ytdlp_config_file: '{base_config_file}'") + + config_overrides = docker_policy.get('ytdlp_config_overrides', {}).copy() + config_overrides['proxy'] = proxy_url + config_overrides['batch-file'] = 
os.path.join(task_dir_container, 'batch_urls.txt') + config_overrides['output'] = os.path.join(task_dir_container, '%(id)s') + + overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides) + raw_args_from_policy = docker_policy.get('ytdlp_raw_args', []) + raw_args_content = '\n'.join(raw_args_from_policy) + + config_content = f"{base_config_content.strip()}\n\n# --- Overrides from policy ---\n{overrides_content}" + if raw_args_content: + config_content += f"\n\n# --- Raw args from policy ---\n{raw_args_content}" + + # Write the combined config to a file in the temp batch dir + ytdlp_config_dir_host = os.path.join(batch_temp_dir, 'yt-dlp') + os.makedirs(ytdlp_config_dir_host, exist_ok=True) + temp_config_file_host = os.path.join(ytdlp_config_dir_host, 'config') + with open(temp_config_file_host, 'w', encoding='utf-8') as f: + f.write(config_content) + + # The command tells yt-dlp exactly where to find the config file we created. + cmd = ['yt-dlp', '--config-locations', os.path.join(task_dir_container, 'yt-dlp/config')] + + processed_urls = set() + activity_lock = threading.Lock() + + def log_parser_callback(line): + # Not thread-safe, but only one thread writes to this set + nonlocal processed_urls + + if '[info] Writing video metadata as JSON to:' in line: + try: + path_match = re.search(r"Writing video metadata as JSON to: '?([^']+)'?$", line) + if not path_match: + path_match = re.search(r"Writing video metadata as JSON to: (.*)$", line) + + if path_match: + container_file_path = path_match.group(1).strip() + if container_file_path.startswith(container_mount_path): + relative_path = os.path.relpath(container_file_path, container_mount_path) + host_file_path = os.path.join(host_mount_path, relative_path) + + # Wait for file to exist + for _ in range(5): + if os.path.exists(host_file_path): break + time.sleep(0.1) + + if os.path.exists(host_file_path): + with open(host_file_path, 'r', encoding='utf-8') as f: + info_data = json.load(f) + + original_url = info_data.get('original_url') or info_data.get('webpage_url') + if original_url in url_to_task_map: + with activity_lock: + if original_url not in processed_urls: + processed_urls.add(original_url) + task = url_to_task_map[original_url] + task_id = task.get('id') or task.get('task_id') + _process_single_auth_result( + True, info_data, "", 0, task, task_id, profile_name, proxy_url, + worker_id, policy, state_manager, args, profile_manager_instance, + info_json_path=host_file_path + ) + else: + logger.warning(f"[Worker {worker_id}] Could not map info.json '{Path(host_file_path).name}' with URL '{original_url}' to an original task. 
Skipping.") + except (IOError, json.JSONDecodeError, KeyError) as e: + logger.error(f"[Worker {worker_id}] Error during live post-processing from log line: {e}") + return False # Don't stop the container + + logger.info(f"[Worker {worker_id}] [{profile_name}] Running Docker container for batch of {len(tasks)} URLs.") + retcode, stdout, stderr, _ = run_docker_container( + image_name=image_name, command=cmd, volumes=volumes, + environment=environment, network_name=network_name, + stream_prefix=f"[Worker {worker_id} | Docker] ", + log_callback=log_parser_callback + ) + + # --- Post-processing for failed URLs --- + unprocessed_urls = set(url_to_task_map.keys()) - processed_urls + if unprocessed_urls: + logger.warning(f"[Worker {worker_id}] [{profile_name}] {len(unprocessed_urls)} URLs from the batch did not produce an info.json and will be marked as failed.") + for url in unprocessed_urls: + task = url_to_task_map[url] + task_id = task.get('id') or task.get('task_id') + _process_single_auth_result( + False, None, stderr, retcode, task, task_id, profile_name, proxy_url, + worker_id, policy, state_manager, args, profile_manager_instance + ) except Exception as e: - logger.error(f"[Worker {worker_id}] [{profile_name}] Unexpected error processing task {task_id}: {e}", exc_info=True) - if task_id: - state_manager.report_auth_failure(task_id, {"error": f"Unexpected error: {str(e)}", "url": url or "unknown"}) - profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch)) + logger.error(f"[Worker {worker_id}] [{profile_name}] Unexpected error during batch processing: {e}", exc_info=True) + for task in tasks: + task_id = task.get('id') or task.get('task_id') + _process_single_auth_result( + False, None, f"Unexpected batch error: {str(e)}", -1, task, task_id, profile_name, proxy_url, + worker_id, policy, state_manager, args, profile_manager_instance + ) finally: - if temp_task_dir and os.path.exists(temp_task_dir): - shutil.rmtree(temp_task_dir) - if task_id: - state_manager.remove_auth_in_progress(task_id) + if batch_temp_dir and os.path.exists(batch_temp_dir): + shutil.rmtree(batch_temp_dir) + for task in tasks: + task_id = task.get('id') or task.get('task_id') + if task_id: + state_manager.remove_auth_in_progress(task_id) except Exception as e: logger.error(f"[Worker {worker_id}] Unexpected error in outer worker loop: {e}", exc_info=True) @@ -437,24 +559,41 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma settings = policy.get('settings', {}) exec_control = policy.get('execution_control', {}) d_policy = policy.get('download_policy', {}) + direct_policy = policy.get('direct_docker_cli_policy', {}) queue_policy = policy.get('queue_policy', {}) profile_prefix = d_policy.get('profile_prefix') if not profile_prefix: - logger.error(f"[Worker {worker_id}] Queue download mode requires 'download_policy.profile_prefix'. Worker exiting.") + logger.error(f"[Worker {worker_id}] Queue download mode requires a profile prefix, but none was provided to the worker. Check 'execution_control' in policy. 
Worker exiting.") return [] - output_dir = d_policy.get('output_dir') - if output_dir: - os.makedirs(output_dir, exist_ok=True) - logger.info(f"[Worker {worker_id}] Will save downloads to '{output_dir}'") + # --- Docker specific config for download worker --- + image_name = direct_policy.get('docker_image_name') + host_mount_path = direct_policy.get('docker_host_mount_path') + container_mount_path = direct_policy.get('docker_container_mount_path') + host_download_path = direct_policy.get('docker_host_download_path') + container_download_path = direct_policy.get('docker_container_download_path') + network_name = direct_policy.get('docker_network_name') + + if not all([image_name, host_mount_path, container_mount_path, host_download_path, container_download_path]): + logger.error(f"[Worker {worker_id}] Queue docker download mode requires all docker_* keys in 'direct_docker_cli_policy'. Worker exiting.") + return [] + + try: + os.makedirs(host_mount_path, exist_ok=True) + os.makedirs(host_download_path, exist_ok=True) + except OSError as e: + logger.error(f"[Worker {worker_id}] Could not create required host directories: {e}. Worker exiting.") + return [] task_counter = 0 last_no_task_log_msg = "" while not state_manager.shutdown_event.is_set(): locked_profile = None - temp_task_dir = None + temp_task_dir = None # Used by dummy mode + temp_config_dir_host = None # Used by docker mode + was_banned_by_parser = False task = None auth_profile_name, auth_env = None, None @@ -594,105 +733,132 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma # race conditions between workers processing different formats for the # same video. The files can be cleaned up manually between runs. else: - cmd = [ - sys.executable, '-m', 'ytops_client.cli', 'download', 'py', - '--load-info-json', info_json_path, - '-f', format_id - ] - if proxy_url: - cmd.extend(['--proxy', proxy_url]) - if output_dir: - cmd.extend(['--output-dir', output_dir]) - extra_args = d_policy.get('extra_args') - if extra_args: - cmd.extend(shlex.split(extra_args)) + retcode = -1; stdout = ""; stderr = ""; stop_reason = None + live_success_count, live_failure_count, live_tolerated_count = 0, 0, 0 - logger.info(f"[Worker {worker_id}] Running command: {' '.join(shlex.quote(s) for s in cmd)}") - retcode, stdout, stderr = run_command( - cmd, running_processes, process_lock, stream_output=args.verbose, - stream_prefix=f"[Worker {worker_id} | download] " - ) - success = (retcode == 0) - if success: - for line in stdout.strip().split('\n'): - if os.path.exists(line.strip()): - downloaded_filepath = line.strip() - break + # --- Pre-flight checks --- + if d_policy.get('check_url_expiration', True): + try: + with open(info_json_path, 'r', encoding='utf-8') as f: info_data = json.load(f) + first_format = next((f for f in info_data.get('formats', []) if 'url' in f), None) + if first_format: + status, _ = sp_utils.check_url_expiry(first_format['url'], d_policy.get('expire_time_shift_minutes', 0)) + if status == 'expired': stderr = "Download URL is expired" + except (IOError, json.JSONDecodeError) as e: stderr = f"Could not read info.json: {e}" + + if not stderr: + relative_task_path = os.path.relpath(os.path.abspath(info_json_path), host_mount_path) + task_path_container = os.path.join(container_mount_path, relative_task_path) + temp_config_dir_host = tempfile.mkdtemp(prefix=f"docker-dl-config-{worker_id}-", dir=host_mount_path) + config_dir_container = os.path.join(container_mount_path, os.path.basename(temp_config_dir_host)) 
+ + base_config_content = "" + base_config_file = direct_policy.get('ytdlp_config_file') + if base_config_file: + config_path_to_read = Path(base_config_file) + if not config_path_to_read.exists(): config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file + if config_path_to_read.exists(): + try: + with open(config_path_to_read, 'r', encoding='utf-8') as f: base_config_content = f.read() + except IOError as e: logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}") + + config_overrides = direct_policy.get('ytdlp_config_overrides', {}).copy() + config_overrides.update({ + 'proxy': proxy_url, 'load-info-json': task_path_container, + 'output': os.path.join(container_download_path, '%(id)s.f%(format_id)s.%(ext)s'), + 'format': format_id, 'no-cache-dir': True + }) + overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides) + raw_args_content = '\n'.join(direct_policy.get('ytdlp_raw_args', [])) + config_content = f"{base_config_content.strip()}\n\n# --- Overrides ---\n{overrides_content}\n{raw_args_content}" + + ytdlp_config_dir_host = os.path.join(temp_config_dir_host, 'yt-dlp') + os.makedirs(ytdlp_config_dir_host, exist_ok=True) + with open(os.path.join(ytdlp_config_dir_host, 'config'), 'w', encoding='utf-8') as f: f.write(config_content) + + volumes = { + os.path.abspath(host_mount_path): {'bind': container_mount_path, 'mode': 'ro'}, + os.path.abspath(host_download_path): {'bind': container_download_path, 'mode': 'rw'} + } + command = ['yt-dlp', '--config-locations', os.path.join(config_dir_container, 'yt-dlp/config')] + + activity_lock = threading.Lock() + tolerated_error_patterns = direct_policy.get('tolerated_error_patterns', []) + fatal_error_patterns = direct_policy.get('fatal_error_patterns', []) + + def log_parser_callback(line): + nonlocal live_success_count, live_failure_count, live_tolerated_count, was_banned_by_parser + if '[download] 100% of' in line or 'has already been downloaded' in line: + with activity_lock: live_success_count += 1; profile_manager_instance.record_activity(profile_name, 'download', is_dummy=False) + return False + for pattern in fatal_error_patterns: + if re.search(pattern, line, re.IGNORECASE): + with activity_lock: + live_failure_count += 1; profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=False) + if direct_policy.get('ban_on_fatal_error_in_batch'): + profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during download') + was_banned_by_parser = True; return True + return False + if 'ERROR:' not in line: return False + for pattern in tolerated_error_patterns: + if re.search(pattern, line, re.IGNORECASE): + with activity_lock: live_tolerated_count += 1; profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=False) + return False + with activity_lock: live_failure_count += 1; profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=False) + return False + + retcode, stdout, stderr, stop_reason = run_docker_container( + image_name=image_name, command=command, volumes=volumes, stream_prefix=f"[Worker {worker_id} | docker-ytdlp] ", + network_name=network_name, log_callback=log_parser_callback, profile_manager=profile_manager_instance, + profile_name=profile_name, environment={} + ) + + success = live_success_count > 0 or (retcode == 0 and not stop_reason and live_failure_count == 0) if success: - # --- Airflow Integration --- + downloaded_filepath = None if 
d_policy.get('output_to_airflow_ready_dir'): base_path = d_policy.get('airflow_ready_dir_base_path') if not base_path: - logger.error(f"[Worker {worker_id}] 'output_to_airflow_ready_dir' is true but 'airflow_ready_dir_base_path' is not set. Skipping Airflow output.") + logger.error(f"[Worker {worker_id}] 'output_to_airflow_ready_dir' is true but 'airflow_ready_dir_base_path' is not set.") else: try: - # Create a unique, timestamped directory for the video ts = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S_%f') final_dir = os.path.join(base_path, ts + '_' + (video_id or 'unknown_video')) os.makedirs(final_dir, exist_ok=True) - - # Copy info.json to avoid a race condition where multiple download workers - # for different formats try to move the same source file. if info_json_path and os.path.exists(info_json_path): - final_info_json_path = os.path.join(final_dir, os.path.basename(info_json_path)) - shutil.copy(info_json_path, final_info_json_path) - logger.info(f"[Worker {worker_id}] Copied info.json to Airflow-ready dir: {final_info_json_path}") - # Update the path for reporting to point to the new copy - info_json_path = final_info_json_path + shutil.copy(info_json_path, os.path.join(final_dir, os.path.basename(info_json_path))) - # Move downloaded file (if not in dummy mode) - if downloaded_filepath and os.path.exists(downloaded_filepath): - final_media_path = os.path.join(final_dir, os.path.basename(downloaded_filepath)) - shutil.move(downloaded_filepath, final_media_path) - logger.info(f"[Worker {worker_id}] Moved media file to Airflow-ready dir: {final_media_path}") - # Update the path for reporting - downloaded_filepath = final_media_path + downloaded_files = [f for f in os.listdir(host_download_path) if not f.endswith(('.part', '.ytdl'))] + if not downloaded_files: logger.warning(f"No media files found in '{host_download_path}' for video '{video_id}'.") + + for filename in downloaded_files: + final_media_path = os.path.join(final_dir, filename) + shutil.move(os.path.join(host_download_path, filename), final_media_path) + if downloaded_filepath is None: downloaded_filepath = final_media_path + except Exception as e: logger.error(f"Failed to move files to Airflow-ready dir: {e}", exc_info=True) - except Exception as e: - logger.error(f"[Worker {worker_id}] Failed to move files to Airflow-ready directory: {e}", exc_info=True) - - profile_manager_instance.record_activity(profile_name, 'download', is_dummy=(args.dummy or args.dummy_batch)) state_manager.report_download_success(task_id, { - "video_id": video_id, "url": url, "format_id": format_id, - "profile_name": profile_name, "proxy_url": proxy_url, + "video_id": video_id, "url": url, "format_id": format_id, "profile_name": profile_name, "proxy_url": proxy_url, "downloaded_filepath": downloaded_filepath, "info_json_path": info_json_path }) logger.info(f"[Worker {worker_id}] Download successful: {video_id or url} format {format_id}") - # --- Rename source info.json to mark as processed --- if d_policy.get('rename_source_info_json_on_success'): - # Use the original path from the task, as the `info_json_path` variable - # may have been updated by the Airflow logic to point to a copy. 
- source_path_to_rename = task.get('info_json_path') - if source_path_to_rename and os.path.exists(source_path_to_rename): - try: - processed_path = source_path_to_rename + ".processed" - shutil.move(source_path_to_rename, processed_path) - logger.info(f"[Worker {worker_id}] Renamed source info.json to '{processed_path}'") - except Exception as e: - logger.warning(f"[Worker {worker_id}] Could not rename source info.json '{source_path_to_rename}': {e}") + source_path = task.get('info_json_path') + if source_path and os.path.exists(source_path): + try: shutil.move(source_path, source_path + ".processed") + except Exception as e: logger.warning(f"Could not rename source info.json '{source_path}': {e}") else: - is_bot_error = "Sign in to confirm you're not a bot" in stderr - is_timeout_error = "Read timed out" in stderr - is_unavailable = "This video is unavailable" in stderr or "Video unavailable" in stderr + is_unavailable = "unavailable" in stderr.lower() or "expired" in stderr.lower() is_format_error = "requested format not available" in stderr - if is_unavailable or is_format_error: - logger.warning(f"[Worker {worker_id}] Download skipped: {video_id or url} format {format_id}") - profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch)) - state_manager.report_download_skipped(task_id, { - "video_id": video_id, "url": url, "format_id": format_id, - "reason": "Video unavailable" if is_unavailable else "Format not available", "stderr": stderr - }) + reason = "Video/URL unavailable or expired" if is_unavailable else "Format not available" + state_manager.report_download_skipped(task_id, {"video_id": video_id, "url": url, "format_id": format_id, "reason": reason, "stderr": stderr}) else: - error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else f"Exit code {retcode}" - logger.error(f"[Worker {worker_id}] Download failed ({error_type}): {video_id or url} format {format_id}") - profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=(args.dummy or args.dummy_batch)) state_manager.report_download_failure(task_id, { - "video_id": video_id, "url": url, "format_id": format_id, - "error_type": error_type, "stderr": stderr, "exit_code": retcode, - "original_task": task + "video_id": video_id, "url": url, "format_id": format_id, "error_type": f"Exit code {retcode}", + "stderr": stderr, "exit_code": retcode, "original_task": task }) except Exception as e: @@ -722,31 +888,37 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in task metadata. Pending downloads counter will not be decremented.") if locked_profile: - cooldown = None - cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') - if cooldown_config: - try: - val = json.loads(cooldown_config) - if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: - cooldown = random.randint(val[0], val[1]) - elif isinstance(val, int): - cooldown = val - except (json.JSONDecodeError, TypeError): - if cooldown_config.isdigit(): - cooldown = int(cooldown_config) + if was_banned_by_parser: + logger.info(f"[Worker {worker_id}] Profile '{locked_profile['name']}' was banned by log parser. 
Skipping unlock.") + else: + cooldown = None + cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') + if cooldown_config: + try: + val = json.loads(cooldown_config) + if isinstance(val, list) and len(val) == 2 and val[0] < val[1]: + cooldown = random.randint(val[0], val[1]) + elif isinstance(val, int): + cooldown = val + except (json.JSONDecodeError, TypeError): + if isinstance(cooldown_config, str) and cooldown_config.isdigit(): + cooldown = int(cooldown_config) - if cooldown: - logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") + if cooldown: + logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") - profile_manager_instance.unlock_profile( - locked_profile['name'], - owner=owner_id, - rest_for_seconds=cooldown - ) + profile_manager_instance.unlock_profile( + locked_profile['name'], + owner=owner_id, + rest_for_seconds=cooldown + ) if temp_task_dir and os.path.exists(temp_task_dir): shutil.rmtree(temp_task_dir) + if temp_config_dir_host and os.path.exists(temp_config_dir_host): + shutil.rmtree(temp_config_dir_host) + if task and task_id: state_manager.remove_download_in_progress(task_id) diff --git a/ytops_client-source/ytops_client/stress_policy_tool.py b/ytops_client-source/ytops_client/stress_policy_tool.py index 4db8ed4..5d7362b 100644 --- a/ytops_client-source/ytops_client/stress_policy_tool.py +++ b/ytops_client-source/ytops_client/stress_policy_tool.py @@ -130,6 +130,47 @@ process_lock = threading.Lock() logger = logging.getLogger('stress_policy_tool') +def _discover_worker_pools(discovery_config, manager_for_discovery): + """ + Discovers worker pools by scanning profile prefixes in Redis. + Returns a list of worker pool configurations or None on error. + """ + discovery_pattern = discovery_config.get('profile_prefix_pattern') + workers_per_group = discovery_config.get('workers_per_profile_group', 1) + + if not discovery_pattern: + logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") + return None + + logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...") + try: + all_profiles = manager_for_discovery.list_profiles() + found_prefixes = set() + for profile in all_profiles: + profile_name = profile['name'] + if fnmatch.fnmatch(profile_name, discovery_pattern): + # Assuming standard name format like 'user31_001', extract 'user31' + prefix = profile_name.rsplit('_', 1)[0] + found_prefixes.add(prefix) + + if not found_prefixes: + logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.") + return [] + else: + worker_pools = [] + for prefix in sorted(list(found_prefixes)): + worker_pools.append({ + 'profile_prefix': prefix, + 'workers': workers_per_group + }) + logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}") + logger.info("Note: Profile group discovery runs once at startup. 
A restart is required to detect new profile groups.") + return worker_pools + except Exception as e: + logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True) + return None + + def main_stress_policy(args): """Main logic for the 'stress-policy' command.""" if args.list_policies: @@ -362,10 +403,21 @@ def main_stress_policy(args): logger.info(f"\nSignal {signum} received, shutting down gracefully...") shutdown_event.set() + # Propagate signal to child processes to allow them to shut down gracefully. + with process_lock: + if running_processes: + logger.info(f"Propagating signal to {len(running_processes)} running subprocess(es)...") + for p in running_processes: + try: + # Send the same signal to the entire process group. + os.killpg(os.getpgid(p.pid), signum) + except (ProcessLookupError, PermissionError): + pass # Process already finished or we lack permissions + # Save state immediately to prevent loss on interrupt. logger.info("Attempting to save state before shutdown...") state_manager.close() - logger.info("Shutdown requested. Allowing in-progress tasks to complete. No new tasks will be started. Press Ctrl+C again to force exit.") + logger.info("Shutdown requested. Signalling in-progress tasks to terminate gracefully. No new tasks will be started. Press Ctrl+C again to force exit.") else: logger.info("Second signal received, forcing exit.") # On second signal, forcefully terminate subprocesses. @@ -579,6 +631,7 @@ def main_stress_policy(args): logger.info(f"Starting/resuming from URL index {start_index + 1}.") # The worker's get_next_url_batch will respect this starting index. + logger.info(f"Task source file: {os.path.abspath(urls_file)}") sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list) if args.dry_run: return 0 @@ -586,13 +639,8 @@ def main_stress_policy(args): worker_pools = exec_control.get('worker_pools', []) discovery_config = exec_control.get('worker_pool_discovery') - if discovery_config: - if worker_pools: - logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.") - - discovery_pattern = discovery_config.get('profile_prefix_pattern') - workers_per_group = discovery_config.get('workers_per_profile_group', 1) - + if discovery_config and not worker_pools: + logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.") direct_policy = policy.get('direct_batch_cli_policy', {}) use_env = direct_policy.get('use_profile_env', 'auth') manager_for_discovery = profile_managers.get(use_env) @@ -601,35 +649,10 @@ def main_stress_policy(args): logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") return 1 - if not discovery_pattern: - logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") - return 1 - - logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...") - try: - all_profiles = manager_for_discovery.list_profiles() - found_prefixes = set() - for profile in all_profiles: - profile_name = profile['name'] - if fnmatch.fnmatch(profile_name, discovery_pattern): - # Assuming standard name format like 'user31_001', extract 'user31' - prefix = profile_name.rsplit('_', 1)[0] - found_prefixes.add(prefix) - - if not found_prefixes: - logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. 
No workers will be started.") - worker_pools = [] - else: - worker_pools = [] - for prefix in sorted(list(found_prefixes)): - worker_pools.append({ - 'profile_prefix': prefix, - 'workers': workers_per_group - }) - logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}") - except Exception as e: - logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True) - return 1 + discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery) + if discovered_pools is None: + return 1 # An error occurred during discovery + worker_pools = discovered_pools if not worker_pools and exec_control.get('workers'): # Fallback for legacy 'workers: N' config @@ -641,29 +664,65 @@ def main_stress_policy(args): return 1 if args.profile_prefix: - logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.") + cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()} + + original_pool_count = len(worker_pools) + filtered_pools = [] for pool in worker_pools: - pool['profile_prefix'] = args.profile_prefix + pool_prefixes_str = pool.get('profile_prefix', '') + if not pool_prefixes_str: + continue + pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()} + if pool_prefixes.intersection(cli_prefixes): + filtered_pools.append(pool) + + if len(filtered_pools) < original_pool_count: + logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.") + + worker_pools = filtered_pools + + if not worker_pools: + logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.") worker_specs = [] - worker_id_counter = 0 - for pool in worker_pools: - pool_workers = pool.get('workers', 1) - prefix_str = pool.get('profile_prefix', '') - prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] - if not prefixes: - logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") - continue + if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1: + logger.info("Single worker mode: aggregating all discovered profile groups for the worker.") + all_prefixes = [] + for pool in worker_pools: + prefix_str = pool.get('profile_prefix', '') + if prefix_str: + all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip()) + final_prefix_str = ','.join(sorted(list(set(all_prefixes)))) + logger.info(f"Single worker will manage profile groups: {final_prefix_str}") + + worker_policy = deepcopy(policy) + worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = final_prefix_str + worker_specs.append({ + 'func': run_direct_batch_worker, + 'args': (0, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) + }) + else: + worker_id_counter = 0 + for pool in worker_pools: + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + if not prefix_str: + logger.warning(f"Worker pool has no profile_prefix. 
Skipping: {pool}") + continue - for i in range(pool_workers): - assigned_prefix = prefixes[i % len(prefixes)] - worker_policy = deepcopy(policy) - worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = assigned_prefix - worker_specs.append({ - 'func': run_direct_batch_worker, - 'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) - }) - worker_id_counter += 1 + for i in range(pool_workers): + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break + worker_policy = deepcopy(policy) + worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str + worker_specs.append({ + 'func': run_direct_batch_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) + }) + worker_id_counter += 1 with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] @@ -742,6 +801,7 @@ def main_stress_policy(args): if start_index > 0: logger.info(f"Starting/resuming from URL index {start_index + 1}.") + logger.info(f"Task source file: {os.path.abspath(urls_file)}") sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list) if args.dry_run: return 0 @@ -749,50 +809,20 @@ def main_stress_policy(args): worker_pools = exec_control.get('worker_pools', []) discovery_config = exec_control.get('worker_pool_discovery') - if discovery_config: - if worker_pools: - logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.") - - discovery_pattern = discovery_config.get('profile_prefix_pattern') - workers_per_group = discovery_config.get('workers_per_profile_group', 1) - + if discovery_config and not worker_pools: + logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.") direct_policy = policy.get('direct_docker_cli_policy', {}) use_env = direct_policy.get('use_profile_env', 'auth' if mode == 'fetch_only' else 'download') manager_for_discovery = profile_managers.get(use_env) - + if not manager_for_discovery: logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") return 1 - if not discovery_pattern: - logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") - return 1 - - logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...") - try: - all_profiles = manager_for_discovery.list_profiles() - found_prefixes = set() - for profile in all_profiles: - profile_name = profile['name'] - if fnmatch.fnmatch(profile_name, discovery_pattern): - # Assuming standard name format like 'user31_001', extract 'user31' - prefix = profile_name.rsplit('_', 1)[0] - found_prefixes.add(prefix) - - if not found_prefixes: - logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. 
No workers will be started.") - worker_pools = [] - else: - worker_pools = [] - for prefix in sorted(list(found_prefixes)): - worker_pools.append({ - 'profile_prefix': prefix, - 'workers': workers_per_group - }) - logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}") - except Exception as e: - logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True) - return 1 + discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery) + if discovered_pools is None: + return 1 # An error occurred + worker_pools = discovered_pools if not worker_pools and exec_control.get('workers'): # Fallback for legacy 'workers: N' config @@ -804,30 +834,67 @@ def main_stress_policy(args): return 1 if args.profile_prefix: - logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.") + cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()} + + original_pool_count = len(worker_pools) + filtered_pools = [] for pool in worker_pools: - pool['profile_prefix'] = args.profile_prefix + pool_prefixes_str = pool.get('profile_prefix', '') + if not pool_prefixes_str: + continue + pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()} + if pool_prefixes.intersection(cli_prefixes): + filtered_pools.append(pool) + + if len(filtered_pools) < original_pool_count: + logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.") + + worker_pools = filtered_pools + + if not worker_pools: + logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.") worker_specs = [] - worker_id_counter = 0 - for pool in worker_pools: - pool_workers = pool.get('workers', 1) - prefix_str = pool.get('profile_prefix', '') - prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] - if not prefixes: - logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") - continue + if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1: + logger.info("Single worker mode: aggregating all discovered profile groups for the worker.") + all_prefixes = [] + for pool in worker_pools: + prefix_str = pool.get('profile_prefix', '') + if prefix_str: + all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip()) + final_prefix_str = ','.join(sorted(list(set(all_prefixes)))) + logger.info(f"Single worker will manage profile groups: {final_prefix_str}") - # Each worker in the pool gets the full list of prefixes from the pool configuration. - for i in range(pool_workers): - worker_policy = deepcopy(policy) - # The worker functions will now handle a comma-separated list of prefixes. 
- worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str - worker_specs.append({ - 'func': run_direct_docker_worker, - 'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) - }) - worker_id_counter += 1 + worker_policy = deepcopy(policy) + worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = final_prefix_str + worker_specs.append({ + 'func': run_direct_docker_worker, + 'args': (0, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) + }) + else: + worker_id_counter = 0 + for pool in worker_pools: + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + if not prefix_str: + logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") + continue + + # Each worker in the pool gets the full list of prefixes from the pool configuration. + for i in range(pool_workers): + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break + worker_policy = deepcopy(policy) + # The worker functions will now handle a comma-separated list of prefixes. + worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str + worker_specs.append({ + 'func': run_direct_docker_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) + }) + worker_id_counter += 1 with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] @@ -847,6 +914,7 @@ def main_stress_policy(args): logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}") return 1 + logger.info(f"Task source directory: {os.path.abspath(info_json_dir)}") sp_utils.display_effective_policy(policy, policy_name, args, sources=[]) if args.dry_run: return 0 @@ -854,50 +922,20 @@ def main_stress_policy(args): worker_pools = exec_control.get('worker_pools', []) discovery_config = exec_control.get('worker_pool_discovery') - if discovery_config: - if worker_pools: - logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.") - - discovery_pattern = discovery_config.get('profile_prefix_pattern') - workers_per_group = discovery_config.get('workers_per_profile_group', 1) - + if discovery_config and not worker_pools: + logger.info("Explicit 'worker_pools' not defined. 
Using 'worker_pool_discovery' as a fallback.") direct_policy = policy.get('direct_docker_cli_policy', {}) use_env = direct_policy.get('use_profile_env', 'auth' if mode == 'fetch_only' else 'download') manager_for_discovery = profile_managers.get(use_env) - + if not manager_for_discovery: logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") return 1 - if not discovery_pattern: - logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") - return 1 - - logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...") - try: - all_profiles = manager_for_discovery.list_profiles() - found_prefixes = set() - for profile in all_profiles: - profile_name = profile['name'] - if fnmatch.fnmatch(profile_name, discovery_pattern): - # Assuming standard name format like 'user31_001', extract 'user31' - prefix = profile_name.rsplit('_', 1)[0] - found_prefixes.add(prefix) - - if not found_prefixes: - logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.") - worker_pools = [] - else: - worker_pools = [] - for prefix in sorted(list(found_prefixes)): - worker_pools.append({ - 'profile_prefix': prefix, - 'workers': workers_per_group - }) - logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}") - except Exception as e: - logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True) - return 1 + discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery) + if discovered_pools is None: + return 1 # An error occurred + worker_pools = discovered_pools if not worker_pools and exec_control.get('workers'): # Fallback for legacy 'workers: N' config @@ -909,30 +947,67 @@ def main_stress_policy(args): return 1 if args.profile_prefix: - logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.") + cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()} + + original_pool_count = len(worker_pools) + filtered_pools = [] for pool in worker_pools: - pool['profile_prefix'] = args.profile_prefix + pool_prefixes_str = pool.get('profile_prefix', '') + if not pool_prefixes_str: + continue + pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()} + if pool_prefixes.intersection(cli_prefixes): + filtered_pools.append(pool) + + if len(filtered_pools) < original_pool_count: + logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.") + + worker_pools = filtered_pools + + if not worker_pools: + logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.") worker_specs = [] - worker_id_counter = 0 - for pool in worker_pools: - pool_workers = pool.get('workers', 1) - prefix_str = pool.get('profile_prefix', '') - prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] - if not prefixes: - logger.warning(f"Worker pool has no profile_prefix. 
Skipping: {pool}") - continue + if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1: + logger.info("Single worker mode: aggregating all discovered profile groups for the worker.") + all_prefixes = [] + for pool in worker_pools: + prefix_str = pool.get('profile_prefix', '') + if prefix_str: + all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip()) + final_prefix_str = ','.join(sorted(list(set(all_prefixes)))) + logger.info(f"Single worker will manage profile groups: {final_prefix_str}") - # Each worker in the pool gets the full list of prefixes from the pool configuration. - for i in range(pool_workers): - worker_policy = deepcopy(policy) - # The worker functions will now handle a comma-separated list of prefixes. - worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str - worker_specs.append({ - 'func': run_direct_docker_download_worker, - 'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, running_processes, process_lock) - }) - worker_id_counter += 1 + worker_policy = deepcopy(policy) + worker_policy.setdefault('download_policy', {})['profile_prefix'] = final_prefix_str + worker_specs.append({ + 'func': run_direct_docker_download_worker, + 'args': (0, worker_policy, state_manager, args, profile_manager_instance, running_processes, process_lock) + }) + else: + worker_id_counter = 0 + for pool in worker_pools: + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + if not prefix_str: + logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") + continue + + # Each worker in the pool gets the full list of prefixes from the pool configuration. + for i in range(pool_workers): + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break + worker_policy = deepcopy(policy) + # The worker functions will now handle a comma-separated list of prefixes. + worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str + worker_specs.append({ + 'func': run_direct_docker_download_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, running_processes, process_lock) + }) + worker_id_counter += 1 with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] @@ -968,6 +1043,7 @@ def main_stress_policy(args): logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}") return 1 + logger.info(f"Task source directory: {os.path.abspath(info_json_dir)}") sp_utils.display_effective_policy(policy, policy_name, args, sources=[]) if args.dry_run: return 0 @@ -975,48 +1051,18 @@ def main_stress_policy(args): worker_pools = exec_control.get('worker_pools', []) discovery_config = exec_control.get('worker_pool_discovery') - if discovery_config: - if worker_pools: - logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.") - - discovery_pattern = discovery_config.get('profile_prefix_pattern') - workers_per_group = discovery_config.get('workers_per_profile_group', 1) - + if discovery_config and not worker_pools: + logger.info("Explicit 'worker_pools' not defined. 
Using 'worker_pool_discovery' as a fallback.") manager_for_discovery = profile_managers.get('download') if not manager_for_discovery: logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") return 1 - if not discovery_pattern: - logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") - return 1 - - logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...") - try: - all_profiles = manager_for_discovery.list_profiles() - found_prefixes = set() - for profile in all_profiles: - profile_name = profile['name'] - if fnmatch.fnmatch(profile_name, discovery_pattern): - # Assuming standard name format like 'user31_001', extract 'user31' - prefix = profile_name.rsplit('_', 1)[0] - found_prefixes.add(prefix) - - if not found_prefixes: - logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.") - worker_pools = [] - else: - worker_pools = [] - for prefix in sorted(list(found_prefixes)): - worker_pools.append({ - 'profile_prefix': prefix, - 'workers': workers_per_group - }) - logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}") - except Exception as e: - logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True) - return 1 + discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery) + if discovered_pools is None: + return 1 # An error occurred + worker_pools = discovered_pools if not worker_pools and exec_control.get('workers'): # Fallback for legacy 'workers: N' config @@ -1028,30 +1074,67 @@ def main_stress_policy(args): return 1 if args.profile_prefix: - logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.") + cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()} + + original_pool_count = len(worker_pools) + filtered_pools = [] for pool in worker_pools: - pool['profile_prefix'] = args.profile_prefix + pool_prefixes_str = pool.get('profile_prefix', '') + if not pool_prefixes_str: + continue + pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()} + if pool_prefixes.intersection(cli_prefixes): + filtered_pools.append(pool) + + if len(filtered_pools) < original_pool_count: + logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.") + + worker_pools = filtered_pools + + if not worker_pools: + logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.") worker_specs = [] - worker_id_counter = 0 - for pool in worker_pools: - pool_workers = pool.get('workers', 1) - prefix_str = pool.get('profile_prefix', '') - prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] - if not prefixes: - logger.warning(f"Worker pool has no profile_prefix. 
Skipping: {pool}") - continue + if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1: + logger.info("Single worker mode: aggregating all discovered profile groups for the worker.") + all_prefixes = [] + for pool in worker_pools: + prefix_str = pool.get('profile_prefix', '') + if prefix_str: + all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip()) + final_prefix_str = ','.join(sorted(list(set(all_prefixes)))) + logger.info(f"Single worker will manage profile groups: {final_prefix_str}") - # Each worker in the pool gets the full list of prefixes from the pool configuration. - for i in range(pool_workers): - worker_policy = deepcopy(policy) - # The worker functions will now handle a comma-separated list of prefixes. - worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str - worker_specs.append({ - 'func': run_direct_download_worker, - 'args': (worker_id_counter, worker_policy, state_manager, args, download_manager, running_processes, process_lock) - }) - worker_id_counter += 1 + worker_policy = deepcopy(policy) + worker_policy.setdefault('download_policy', {})['profile_prefix'] = final_prefix_str + worker_specs.append({ + 'func': run_direct_download_worker, + 'args': (0, worker_policy, state_manager, args, download_manager, running_processes, process_lock) + }) + else: + worker_id_counter = 0 + for pool in worker_pools: + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + if not prefix_str: + logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") + continue + + # Each worker in the pool gets the full list of prefixes from the pool configuration. + for i in range(pool_workers): + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break + worker_policy = deepcopy(policy) + # The worker functions will now handle a comma-separated list of prefixes. + worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str + worker_specs.append({ + 'func': run_direct_download_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, download_manager, running_processes, process_lock) + }) + worker_id_counter += 1 with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] @@ -1129,48 +1212,18 @@ def main_stress_policy(args): worker_pools = exec_control.get('worker_pools', []) discovery_config = exec_control.get('worker_pool_discovery') - if discovery_config: - if worker_pools: - logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.") - - discovery_pattern = discovery_config.get('profile_prefix_pattern') - workers_per_group = discovery_config.get('workers_per_profile_group', 1) - + if discovery_config and not worker_pools: + logger.info("Explicit 'worker_pools' not defined. 
Using 'worker_pool_discovery' as a fallback.") manager_for_discovery = profile_managers.get('auth') if not manager_for_discovery: logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") return 1 - if not discovery_pattern: - logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") - return 1 - - logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...") - try: - all_profiles = manager_for_discovery.list_profiles() - found_prefixes = set() - for profile in all_profiles: - profile_name = profile['name'] - if fnmatch.fnmatch(profile_name, discovery_pattern): - # Assuming standard name format like 'user31_001', extract 'user31' - prefix = profile_name.rsplit('_', 1)[0] - found_prefixes.add(prefix) - - if not found_prefixes: - logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.") - worker_pools = [] - else: - worker_pools = [] - for prefix in sorted(list(found_prefixes)): - worker_pools.append({ - 'profile_prefix': prefix, - 'workers': workers_per_group - }) - logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}") - except Exception as e: - logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True) - return 1 + discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery) + if discovered_pools is None: + return 1 # An error occurred + worker_pools = discovered_pools if not worker_pools and exec_control.get('workers'): # Fallback for legacy 'workers: N' config @@ -1189,6 +1242,8 @@ def main_stress_policy(args): worker_specs = [] worker_id_counter = 0 for pool in worker_pools: + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break pool_workers = pool.get('workers', 1) prefix_str = pool.get('profile_prefix', '') prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] @@ -1198,6 +1253,8 @@ def main_stress_policy(args): # Each worker in the pool gets the full list of prefixes from the pool configuration. for i in range(pool_workers): + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break worker_policy = deepcopy(policy) # The worker functions will now handle a comma-separated list of prefixes. worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str @@ -1285,48 +1342,18 @@ def main_stress_policy(args): worker_pools = exec_control.get('worker_pools', []) discovery_config = exec_control.get('worker_pool_discovery') - if discovery_config: - if worker_pools: - logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.") - - discovery_pattern = discovery_config.get('profile_prefix_pattern') - workers_per_group = discovery_config.get('workers_per_profile_group', 1) - + if discovery_config and not worker_pools: + logger.info("Explicit 'worker_pools' not defined. 
Using 'worker_pool_discovery' as a fallback.") manager_for_discovery = profile_managers.get('download') if not manager_for_discovery: logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") return 1 - if not discovery_pattern: - logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") - return 1 - - logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...") - try: - all_profiles = manager_for_discovery.list_profiles() - found_prefixes = set() - for profile in all_profiles: - profile_name = profile['name'] - if fnmatch.fnmatch(profile_name, discovery_pattern): - # Assuming standard name format like 'user31_001', extract 'user31' - prefix = profile_name.rsplit('_', 1)[0] - found_prefixes.add(prefix) - - if not found_prefixes: - logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.") - worker_pools = [] - else: - worker_pools = [] - for prefix in sorted(list(found_prefixes)): - worker_pools.append({ - 'profile_prefix': prefix, - 'workers': workers_per_group - }) - logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}") - except Exception as e: - logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True) - return 1 + discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery) + if discovered_pools is None: + return 1 # An error occurred + worker_pools = discovered_pools if not worker_pools and exec_control.get('workers'): # Fallback for legacy 'workers: N' config @@ -1345,6 +1372,8 @@ def main_stress_policy(args): worker_specs = [] worker_id_counter = 0 for pool in worker_pools: + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break pool_workers = pool.get('workers', 1) prefix_str = pool.get('profile_prefix', '') prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] @@ -1354,6 +1383,8 @@ def main_stress_policy(args): # Each worker in the pool gets the full list of prefixes from the pool configuration. for i in range(pool_workers): + if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers: + break worker_policy = deepcopy(policy) # The worker functions will now handle a comma-separated list of prefixes. worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
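All of the orchestration modes above now funnel worker-pool discovery through the shared `_discover_worker_pools` helper: profile names such as `user31_001` are matched against `profile_prefix_pattern` with `fnmatch`, collapsed to their group prefix (`user31`), and each group becomes one pool with `workers_per_profile_group` workers. A self-contained sketch of that grouping, using a plain list of names in place of the Redis-backed profile manager:

```python
import fnmatch

def discover_pools(profile_names, pattern, workers_per_group=1):
    """Group matching profiles by prefix and emit one worker-pool dict per group."""
    prefixes = set()
    for name in profile_names:
        if fnmatch.fnmatch(name, pattern):
            # 'user31_001' -> 'user31', same rsplit convention as the helper in the diff
            prefixes.add(name.rsplit('_', 1)[0])
    return [{'profile_prefix': p, 'workers': workers_per_group} for p in sorted(prefixes)]

# discover_pools(['user31_001', 'user31_002', 'user32_001', 'admin_001'], 'user*', 2)
# -> [{'profile_prefix': 'user31', 'workers': 2}, {'profile_prefix': 'user32', 'workers': 2}]
```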
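The repeated pool-to-worker expansion blocks share the same shape: optionally filter pools to those whose comma-separated `profile_prefix` intersects the CLI `--profile-prefix` list, then hand each worker the pool's full prefix string while honouring an optional global `--workers` cap, with the single-worker case aggregating every discovered prefix onto one worker. Below is a condensed, standalone sketch of that selection logic; the policy key (`download_policy` here) stands in for whichever section the mode actually patches, and the function name is illustrative.

```python
from copy import deepcopy

def build_worker_policies(policy, worker_pools, cli_prefixes=None, max_workers=None):
    """Expand worker pools into one policy copy per worker, honouring prefix filters and a cap."""
    if cli_prefixes:
        worker_pools = [
            pool for pool in worker_pools
            if {p.strip() for p in pool.get('profile_prefix', '').split(',') if p.strip()} & set(cli_prefixes)
        ]

    if max_workers == 1 and len(worker_pools) > 1:
        # Single-worker mode: one worker manages every discovered profile group.
        all_prefixes = sorted({
            p.strip()
            for pool in worker_pools
            for p in pool.get('profile_prefix', '').split(',') if p.strip()
        })
        merged = deepcopy(policy)
        merged.setdefault('download_policy', {})['profile_prefix'] = ','.join(all_prefixes)
        return [merged]

    policies = []
    for pool in worker_pools:
        prefix_str = pool.get('profile_prefix', '')
        if not prefix_str:
            continue  # pools without a prefix are skipped, as in the tool
        for _ in range(pool.get('workers', 1)):
            if max_workers is not None and len(policies) >= max_workers:
                return policies
            worker_policy = deepcopy(policy)
            worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
            policies.append(worker_policy)
    return policies
```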