Use a single source for the profile list in cluster.yaml, fix single tmux worker naming, and update ytops to support workers without a profile list

aperez 2026-01-16 11:01:35 +03:00
parent bf12118b2b
commit 4fd9217c6d
24 changed files with 2794 additions and 1112 deletions
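The profile list now has a single source: the cluster config consumed by `tools/generate-profile-setup-policy.py` and the playbooks below. Based on the fields they read (`workers`, `profile_pools`, `prefixes`), a minimal excerpt might look like the following sketch; the worker names and prefixes are illustrative placeholders, not values from this commit.

```yaml
# Hypothetical cluster.green.yml excerpt (worker names and prefixes are examples only)
workers:
  worker-01:
    profile_pools:
      - prefixes: ["user1", "user2"]
  worker-02:
    profile_pools:
      - prefixes: ["user3"]
```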

View File

@ -108,12 +108,20 @@ ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.gree
### Profile Management
The `cleanup-profiles` action can be used to remove profiles from Redis. By default, it cleans up "ungrouped" profiles.
```bash
# Perform a dry run of cleaning up ungrouped profiles
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "dry_run=true"
# Clean up ungrouped profiles
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles"
# To clean up ALL profiles (destructive), set cleanup_mode=full
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "cleanup_mode=full"
# You can specify a custom setup policy file for cleanup operations
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "setup_policy=policies/my_custom_setup_policy.yaml"
```
## Monitoring and Inspection
@ -157,7 +165,19 @@ Then, from the jump host, you can sync code or policies to the cluster nodes:
ansible-playbook ansible/playbook-stress-sync-code.yml -i ansible/inventory.green.ini
# Sync only policies and CLI configs
ansible-playbook ansible/playbook-stress-sync-policies.yml -i ansible/inventory.green.ini
# To sync files from a custom source directory on the Ansible controller, use the 'source_base_dir' extra variable:
ansible-playbook ansible/playbook-stress-sync-policies.yml -i ansible/inventory.green.ini -e "source_base_dir=/path/to/my-custom-source"
```
### Docker Image Updates
To update the `yt-dlp` docker image used by download simulators, run the following playbook. This builds the image locally on each worker node.
```bash
# Build the yt-dlp docker image locally on each worker node
ansible-playbook ansible/playbook-update-yt-dlp-docker.yml -i ansible/inventory.green.ini
```
### Adding a New Worker
@ -225,7 +245,10 @@ ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.
### Restart Enforcer and Monitoring
```bash
# Restart monitoring and enforcer on master using default policies
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=restart-monitoring"
# Restart using a custom enforcer policy
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=restart-monitoring" -e "enforcer_policy=policies/my_other_enforcer.yaml"
```

View File

@ -1,4 +1,35 @@
---
# -------------------------------------------------------------------------------------------------
# PHASE 0: Fix Python Dependencies
# Ensures remote hosts have compatible Python libraries to prevent module failures.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 0: Upgrade Python SSL libraries"
hosts: all
gather_facts: no
tasks:
- name: Attempt to upgrade pyOpenSSL, cryptography and urllib3
ansible.builtin.shell:
cmd: "pip3 install --upgrade pyOpenSSL cryptography urllib3"
become: yes
register: pip_upgrade_result
changed_when: "'Successfully installed' in pip_upgrade_result.stdout"
failed_when: false
- name: Retry upgrade with --break-system-packages on specific error
ansible.builtin.shell:
cmd: "pip3 install --upgrade pyOpenSSL cryptography urllib3 --break-system-packages"
become: yes
register: pip_upgrade_result_retry
changed_when: "'Successfully installed' in pip_upgrade_result_retry.stdout"
when: pip_upgrade_result.rc != 0 and 'externally-managed-environment' in pip_upgrade_result.stderr
- name: Fail if package upgrade did not succeed
ansible.builtin.fail:
msg: "Failed to upgrade Python packages after retry. Last error: {{ pip_upgrade_result_retry.stderr | default(pip_upgrade_result.stderr) }}"
when: >
pip_upgrade_result.rc != 0 and
(pip_upgrade_result_retry is not defined or pip_upgrade_result_retry.rc != 0)
# This playbook provides a complete installation for fresh nodes.
# It can install either master or worker roles, or both on the same machine.
#
@ -55,57 +86,10 @@
# -------------------------------------------------------------------------------------------------
# PHASE 4: Build yt-dlp Docker Image
# Builds the yt-dlp container on each worker node using the dedicated playbook.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 4: Build yt-dlp Docker image on workers"
import_playbook: playbook-update-yt-dlp-docker.yml
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if (inventory_hostname in groups['master'] and not (install_worker | default(false) | bool)) else airflow_worker_dir }}"
- name: Ensure bin directory exists
ansible.builtin.file:
path: "{{ base_dir }}/bin"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Check if Dockerfile exists in bin directory
ansible.builtin.stat:
path: "{{ base_dir }}/bin/Dockerfile"
register: dockerfile_stat
- name: Build yt-dlp Docker image if Dockerfile exists
community.docker.docker_image:
name: yt-dlp-custom
tag: latest
source: build
build:
path: "{{ base_dir }}/bin"
pull: yes
state: present
force_source: yes
become: yes
when: dockerfile_stat.stat.exists
- name: Display message if Dockerfile not found
ansible.builtin.debug:
msg: "Dockerfile not found at {{ base_dir }}/bin/Dockerfile - skipping yt-dlp image build"
when: not dockerfile_stat.stat.exists
# -------------------------------------------------------------------------------------------------
# PHASE 5: Sync Code and Install Dependencies
@ -148,6 +132,12 @@
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Install redis-tools
ansible.builtin.apt:
name: redis-tools
state: present
become: yes
- name: Configure system performance and kernel settings
ansible.builtin.copy:
src: "configs/etc/sysctl.d/99-system-limits.conf"
@ -174,7 +164,7 @@
- name: Template Docker Compose file for master services
ansible.builtin.template:
src: docker-compose.stress-master.j2
dest: "{{ airflow_master_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
@ -184,9 +174,7 @@
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -a --filter "name=bgutil-provider" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f
docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
@ -200,6 +188,14 @@
remove_orphans: true
become: yes
- name: Wait for Redis service to be ready
ansible.builtin.wait_for:
host: localhost
port: "{{ redis_port }}"
delay: 5
timeout: 60
delegate_to: "{{ inventory_hostname }}"
- name: Wait for MinIO service to be ready
ansible.builtin.wait_for:
host: "{{ hostvars[inventory_hostname].ansible_host }}"
@ -240,7 +236,7 @@
register: mc_mb_result
failed_when: >
mc_mb_result.rc != 0 and
"already own it" not in mc_mb_result.stderr
changed_when: mc_mb_result.rc == 0
environment:
HOME: "/home/{{ ansible_user }}"
@ -264,7 +260,7 @@
tasks:
- name: Template Docker Compose file for worker services
ansible.builtin.template:
src: docker-compose.stress-master.j2
dest: "{{ airflow_worker_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
@ -274,9 +270,9 @@
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -a --filter "name=bgutil-provider" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f
docker ps -a --filter "name=redis-stress" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f
docker ps -a --filter "name=minio-stress" --format "{{ '{{.ID}}' }}" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes

View File

@ -0,0 +1,41 @@
---
- name: "STRESS-CLEANUP: Remove info.json task files from workers"
hosts: workers
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define the directory to be cleaned
ansible.builtin.set_fact:
target_dir: "{{ airflow_worker_dir }}/run/docker_mount/info_json_tasks/direct_docker_simulation"
- name: "Display directory being cleaned"
ansible.builtin.debug:
msg: "Cleaning directory: {{ target_dir }} on {{ inventory_hostname }}"
- name: Remove the info_json_tasks directory
ansible.builtin.file:
path: "{{ target_dir }}"
state: absent
become: yes
- name: Recreate the info_json_tasks directory
ansible.builtin.file:
path: "{{ target_dir }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: "Display cleanup completion"
ansible.builtin.debug:
msg: "Successfully cleaned and recreated {{ target_dir }} on {{ inventory_hostname }}"

View File

@ -1,10 +1,11 @@
---
- name: "STRESS-SETUP: Unified control for stress test processes"
hosts: "{{ 'master' if action in master_only_actions else 'all' }}"
gather_facts: no
vars:
# Default action is status check
action: "status"
graceful_shutdown_timeout_seconds: 30
setup_policy: "policies/6_profile_setup_policy.yaml"
enforcer_policy: "policies/8_unified_simulation_enforcer.yaml"
master_only_actions:
@ -80,9 +81,7 @@
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
timeout 10 ./bin/ytops-client profile list 2>&1
--auth-env sim_auth \
--download-env sim_download 2>&1
register: profile_list_output
changed_when: false
when:
@ -132,7 +131,12 @@
- name: "Display policy being used for profile cleanup" - name: "Display policy being used for profile cleanup"
ansible.builtin.debug: ansible.builtin.debug:
msg: "Using setup policy for cleanup: {{ setup_policy }}" msg: >-
{% if cleanup_mode | default('ungrouped') == 'full' %}
Performing a FULL cleanup of all profiles using policy: {{ setup_policy }}
{% else %}
Cleaning up UNGROUPED profiles using policy: {{ setup_policy }}{% if dry_run | default(false) %} (DRY RUN){% endif %}
{% endif %}
when:
- action == "cleanup-profiles"
- inventory_hostname in groups['master']
@ -146,9 +150,15 @@
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
{% if cleanup_mode | default('ungrouped') == 'full' %}
./bin/ytops-client setup-profiles \
--policy {{ setup_policy }} \
--cleanup-all
{% else %}
./bin/ytops-client profile cleanup-ungrouped \
--policy-file {{ setup_policy }} \
{% if dry_run | default(false) %}--dry-run{% endif %}
{% endif %}
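# Illustration (not part of the command): with the default setup_policy this branch renders to
#   ./bin/ytops-client profile cleanup-ungrouped --policy-file policies/6_profile_setup_policy.yaml
# (plus --dry-run when -e dry_run=true is passed), while -e cleanup_mode=full renders to
#   ./bin/ytops-client setup-profiles --policy policies/6_profile_setup_policy.yaml --cleanup-all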
register: cleanup_output
changed_when: false
when:
@ -172,8 +182,34 @@
- cleanup_output is defined
- name: Stop all stress test processes on all nodes (stop-all action)
vars:
process_pattern: "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)|[p]ython.*ytops"
block:
- name: "Get PIDs of running stress test processes"
ansible.builtin.shell:
cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}'"
register: pids_to_kill
changed_when: false
- name: "Gracefully terminate stress test processes"
ansible.builtin.shell:
cmd: "kill {{ pids_to_kill.stdout_lines | join(' ') }}"
when: pids_to_kill.stdout | length > 0
ignore_errors: true
changed_when: false
- name: "Wait for graceful shutdown"
ansible.builtin.pause:
seconds: "{{ graceful_shutdown_timeout_seconds }}"
when: pids_to_kill.stdout | length > 0
- name: "Force kill any lingering stress test processes"
ansible.builtin.shell:
cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true"
ignore_errors: true
changed_when: false
- name: "Kill all stress-related tmux sessions as a failsafe"
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
@ -186,27 +222,37 @@
fi
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client and related python processes
ansible.builtin.shell:
cmd: |
# Gracefully terminate processes by pattern
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 1 # Wait for graceful shutdown
# Force kill any remaining processes
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
when: action == "stop-all"
- name: Stop processes on targeted nodes only (stop-nodes action)
vars:
process_pattern: "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)|[p]ython.*ytops"
block:
- name: "Get PIDs of running stress test processes"
ansible.builtin.shell:
cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}'"
register: pids_to_kill
changed_when: false
- name: "Gracefully terminate stress test processes"
ansible.builtin.shell:
cmd: "kill {{ pids_to_kill.stdout_lines | join(' ') }}"
when: pids_to_kill.stdout | length > 0
ignore_errors: true
changed_when: false
- name: "Wait for graceful shutdown"
ansible.builtin.pause:
seconds: "{{ graceful_shutdown_timeout_seconds }}"
when: pids_to_kill.stdout | length > 0
- name: "Force kill any lingering stress test processes"
ansible.builtin.shell:
cmd: "ps aux | grep -E '{{ process_pattern }}' | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true"
ignore_errors: true
changed_when: false
- name: "Kill all stress-related tmux sessions as a failsafe"
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
@ -219,22 +265,6 @@
fi
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client and related python processes on this node
ansible.builtin.shell:
cmd: |
# Gracefully terminate processes by pattern
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 1 # Wait for graceful shutdown
# Force kill any remaining processes
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
when: action == "stop-nodes"
- name: Restart monitoring and enforcer (restart-monitoring action)
@ -263,8 +293,6 @@
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client profile list
--auth-env sim_auth
--download-env sim_download
--live
--no-blink
--show-reasons

View File

@ -3,7 +3,7 @@
hosts: workers
gather_facts: no
vars:
tmux_session_download: "{{ 'stress-download-worker-' + (worker_num | string) if (profile_prefix | default('') == 'worker' and worker_num is defined) else 'stress-download-' + (profile_prefix | default('default')) | replace(',', '-') }}"
download_policy: "policies/11_direct_docker_download_simulation.yaml"
vars_files:
- "group_vars/all/vault.yml"
@ -41,13 +41,14 @@
command_to_run: >
./bin/ytops-client stress-policy
--policy {{ download_policy }}
{# {% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %} #}
{% if download_min_seconds is defined %}--set 'settings.dummy_simulation_settings.download_min_seconds={{ download_min_seconds }}'{% endif %}
{% if download_max_seconds is defined %}--set 'settings.dummy_simulation_settings.download_max_seconds={{ download_max_seconds }}'{% endif %}
{# --set 'execution_control.worker_pools=[{"profile_prefix": "{% if profile_prefix == 'worker' %}{{ display_prefix }}{% else %}{{ profile_prefix }}{% endif %}", "workers": 1}]' #}
{% for setting in (extra_set_args | default('[]')) | from_yaml %}--set '{{ setting }}' {% endfor %}
{% if profile_prefix == 'worker' %}--workers 1{% endif %}
--profile-prefix {% if profile_prefix == 'worker' %}{{ display_prefix }}{% else %}{{ profile_prefix }}{% endif %}
process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ download_policy }}{% if profile_prefix == 'worker' %}.*--workers 1{% endif %}.*--profile-prefix {% if profile_prefix == 'worker' %}{{ display_prefix }}{% else %}{{ profile_prefix }}{% endif %}"
start_process: "{{ start_download | default(false) | bool }}"
stop_process: "{{ stop_download | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
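# Illustration of the two naming modes above (values are assumptions, not defaults):
#   profile_prefix=worker, worker_num=2, display_prefix="user1,user2"
#     -> tmux session "stress-download-worker-2", command gets "--workers 1 --profile-prefix user1,user2"
#   profile_prefix=user1
#     -> tmux session "stress-download-user1", command gets "--profile-prefix user1"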

View File

@ -15,16 +15,28 @@
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Wait for Redis to be available
ansible.builtin.wait_for:
host: localhost
port: "{{ redis_port }}"
timeout: 60
delay: 5
register: redis_port_check
ignore_errors: yes
- name: Check if Redis is running and responding to commands
ansible.builtin.shell:
cmd: "redis-cli -h localhost -p {{ redis_port }} {% if use_redis_password | default(true) | string | lower == 'true' %}-a {{ vault_redis_password }}{% endif %} ping 2>&1 | grep -q PONG"
register: redis_check
ignore_errors: yes
changed_when: false
retries: 3
delay: 5
until: redis_check.rc == 0
- name: Ensure Redis is accessible
ansible.builtin.fail:
msg: "Redis is not accessible on master node. Please ensure Redis service is running on localhost:{{ redis_port }}"
when: redis_check.rc != 0
- name: Stop any running ytops-client processes on master
@ -44,7 +56,7 @@
--policy {{ setup_policy }} \
--cleanup-all
environment:
REDIS_HOST: "localhost"
REDIS_PORT: "{{ redis_port }}"
REDIS_PASSWORD: "{{ vault_redis_password if use_redis_password | default(true) | string | lower == 'true' else '' }}"
register: init_result

View File

@ -37,18 +37,52 @@
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Attempt to install required Python packages from requirements.txt
ansible.builtin.pip:
requirements: "{{ base_dir }}/ytops_client/requirements.txt"
extra_args: "--ignore-installed"
become: yes
register: pip_reqs_result
failed_when: false
- name: Retry installing requirements with break-system-packages
ansible.builtin.pip:
requirements: "{{ base_dir }}/ytops_client/requirements.txt"
extra_args: "--ignore-installed"
become: yes
environment:
PIP_BREAK_SYSTEM_PACKAGES: "1"
register: pip_reqs_result_retry
when: pip_reqs_result.failed and 'externally-managed-environment' in pip_reqs_result.msg
- name: Fail if requirements installation did not succeed
ansible.builtin.fail:
msg: "Failed to install requirements after retry. Last error: {{ pip_reqs_result_retry.msg | default(pip_reqs_result.msg) }}"
when: >
pip_reqs_result.failed and
(pip_reqs_result_retry is not defined or pip_reqs_result_retry.failed)
- name: Attempt to explicitly install the thrift package
ansible.builtin.pip:
name: thrift
extra_args: "--ignore-installed"
become: yes
register: pip_thrift_result
failed_when: false
- name: Retry installing thrift with break-system-packages
ansible.builtin.pip:
name: thrift
extra_args: "--ignore-installed"
become: yes
environment:
PIP_BREAK_SYSTEM_PACKAGES: "1"
register: pip_thrift_result_retry
when: pip_thrift_result.failed and 'externally-managed-environment' in pip_thrift_result.msg
- name: Fail if thrift installation did not succeed
ansible.builtin.fail:
msg: "Failed to install thrift after retry. Last error: {{ pip_thrift_result_retry.msg | default(pip_thrift_result.msg) }}"
when: >
pip_thrift_result.failed and
(pip_thrift_result_retry is not defined or pip_thrift_result_retry.failed)

View File

@ -4,9 +4,15 @@
gather_facts: no
vars:
# Default action
action: "status" # Available actions: start, stop, status, start-auth, stop-auth, start-download, stop-download
graceful_shutdown_timeout_seconds: 30
tasks:
- name: "Ensure profile_prefixes is a flat list of all prefixes from profile_pools"
ansible.builtin.set_fact:
profile_prefixes: "{{ profile_pools | map(attribute='prefixes') | flatten }}"
when: profile_pools is defined
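# Illustration (assumed host vars, not part of the play): with
#   profile_pools:
#     - prefixes: ["user1", "user2"]
#     - prefixes: ["user3"]
# the set_fact above yields profile_prefixes == ["user1", "user2", "user3"].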
- name: "Start all configured generators and simulators" - name: "Start all configured generators and simulators"
when: action == "start" when: action == "start"
block: block:
@ -25,7 +31,7 @@
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ combined_prefixes }}"
-e "dummy_batch={{ dummy_batch | default(false) }}"
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
@ -42,7 +48,7 @@
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ item }}"
-e "dummy_batch={{ dummy_batch | default(false) }}"
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
@ -56,42 +62,14 @@
label: "profile: {{ item }}" label: "profile: {{ item }}"
when: auth_workers_per_profile | default(0) | int > 0 when: auth_workers_per_profile | default(0) | int > 0
- name: "Start download simulator(s)" - name: "WORKAROUND: Align download worker config with auth worker config to bypass inventory bug"
when: profile_prefixes is defined and profile_prefixes | length > 0 ansible.builtin.set_fact:
block: download_workers_total: "{{ auth_workers_total | default(0) }}"
- name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}" download_workers_per_profile: "{{ auth_workers_per_profile | default(0) }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ combined_prefixes }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
when: (download_workers_per_profile | default(0) | int == 0) and (download_workers_total | default(0) | int > 0)
- name: "Start parallel download simulators for each profile" - name: "Start download simulator(s)"
ansible.builtin.command: >- ansible.builtin.include_tasks: tasks/start-download-simulators.yml
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml when: profile_prefixes is defined and profile_prefixes | length > 0
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ item }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
loop: "{{ profile_prefixes }}"
loop_control:
loop_var: item
label: "profile: {{ item }}"
when: download_workers_per_profile | default(0) | int > 0
- name: "Start only auth generators on workers" - name: "Start only auth generators on workers"
when: action == "start-auth" when: action == "start-auth"
@ -111,7 +89,7 @@
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ combined_prefixes }}"
-e "dummy_batch={{ dummy_batch | default(false) }}"
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
@ -128,7 +106,7 @@
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ item }}"
-e "dummy_batch={{ dummy_batch | default(false) }}"
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
@ -145,57 +123,33 @@
- name: "Start only download simulators on workers" - name: "Start only download simulators on workers"
when: action == "start-download" when: action == "start-download"
block: block:
- name: "WORKAROUND: Align download worker config with auth worker config to bypass inventory bug"
ansible.builtin.set_fact:
download_workers_total: "{{ auth_workers_total | default(0) }}"
download_workers_per_profile: "{{ auth_workers_per_profile | default(0) }}"
- name: "Set combined profile prefixes string" - name: "Set combined profile prefixes string"
ansible.builtin.set_fact: ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}" combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0 when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Start download simulator(s)" - name: "Start download simulator(s)"
ansible.builtin.include_tasks: tasks/start-download-simulators.yml
when: profile_prefixes is defined and profile_prefixes | length > 0
block:
- name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ combined_prefixes }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
when: (download_workers_per_profile | default(0) | int == 0) and (download_workers_total | default(0) | int > 0)
- name: "Start parallel download simulators for each profile"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ item }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
loop: "{{ profile_prefixes }}"
loop_control:
loop_var: item
label: "profile: {{ item }}"
when: download_workers_per_profile | default(0) | int > 0
- name: "Stop only auth generators on workers (via playbook call)" - name: "Stop only auth generators on workers"
when: action == "stop-generator" when: action == "stop-auth"
block: block:
- name: "Set combined profile prefixes string" - name: "Set combined profile prefixes string"
ansible.builtin.set_fact: ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}" combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0 when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Stop single auth generator for profiles: {{ combined_prefixes | default('none') }}" - name: "Gracefully stop auth generator(s) via playbook call"
when: profile_prefixes is defined and profile_prefixes | length > 0
block:
- name: "Stop single auth generator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml
-i {{ inventory_file }}
@ -204,53 +158,55 @@
-e "profile_prefix={{ combined_prefixes }}" -e "profile_prefix={{ combined_prefixes }}"
delegate_to: localhost delegate_to: localhost
changed_when: true changed_when: true
when: profile_prefixes is defined and profile_prefixes | length > 0 when: (auth_workers_per_profile | default(0) | int == 0) and (auth_workers_total | default(0) | int > 0)
- name: "Stop only auth generators on workers" - name: "Stop parallel auth generators for each profile"
when: action == "stop-auth" ansible.builtin.command: >-
block: ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml
- name: Kill all auth generator tmux sessions on this worker -i {{ inventory_file }}
ansible.builtin.shell: --limit {{ inventory_hostname }}
cmd: | -e "stop_generator=true"
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-auth-"); do -e "profile_prefix={{ item }}"
tmux kill-session -t "$session" delegate_to: localhost
done || true changed_when: true
ignore_errors: yes loop: "{{ profile_prefixes }}"
changed_when: false loop_control:
loop_var: item
- name: Kill all ytops-client auth generator processes on this worker label: "profile: {{ item }}"
ansible.builtin.shell: when: auth_workers_per_profile | default(0) | int > 0
cmd: |
# Gracefully terminate
ps aux | grep "[y]tops-client.*stress-policy.*12_queue_auth_simulation" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 0.5
# Force kill
ps aux | grep "[y]tops-client.*stress-policy.*12_queue_auth_simulation" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
- name: "Stop only download simulators on workers" - name: "Stop only download simulators on workers"
when: action == "stop-download" when: action == "stop-download"
block: block:
- name: Kill all download simulator tmux sessions on this worker - name: "WORKAROUND: Align download worker config with auth worker config to bypass inventory bug"
ansible.builtin.set_fact:
download_workers_total: "{{ auth_workers_total | default(0) }}"
download_workers_per_profile: "{{ auth_workers_per_profile | default(0) }}"
- name: "Set combined profile prefixes string"
ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Stop single download simulator group"
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F '#{session_name}' 2>/dev/null | grep -E '^stress-download-worker-[0-9]+$'); do
tmux kill-session -t "$session"
done || true
when: download_workers_total | default(0) | int > 0
changed_when: true
ignore_errors: yes
changed_when: false
- name: "Stop parallel download simulators for each profile"
ansible.builtin.command: "tmux kill-session -t stress-download-{{ item }}"
loop: "{{ profile_prefixes }}"
loop_control:
loop_var: item
label: "profile: {{ item }}"
when: (download_workers_total | default(0) | int == 0) and (download_workers_per_profile | default(0) | int > 0)
changed_when: true
ignore_errors: yes
changed_when: false
- name: "Stop all worker generators and simulators" - name: "Stop all worker generators and simulators"
when: action == "stop" when: action == "stop"

View File

@ -3,7 +3,7 @@
hosts: all
gather_facts: no
vars:
ytops_source_dir: "{{ source_base_dir | default(playbook_dir + '/../ytops_client-source') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:

View File

@ -0,0 +1,30 @@
---
- name: "STRESS-SETUP: Build and push yt-dlp docker image"
hosts: workers
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: "Build and push yt-dlp image on worker"
ansible.builtin.shell:
cmd: |
cd {{ airflow_worker_dir }}
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
./bin/build-yt-dlp-image
register: build_output
changed_when: true
- name: "Display build output"
ansible.builtin.debug:
var: build_output.stdout_lines
when: build_output.stdout_lines is defined

View File

@ -0,0 +1,105 @@
---
- name: "Scale down excess download workers if necessary"
when: (download_workers_per_profile | default(0) | int == 0) and (download_workers_total | default(0) | int > 0)
block:
- name: "Find running download simulator tmux sessions for this group"
ansible.builtin.shell:
cmd: "tmux list-sessions -F '#{session_name}' 2>/dev/null | grep -E '^stress-download-worker-[0-9]+$' || true"
register: running_sessions
changed_when: false
ignore_errors: yes
- name: "Identify excess download simulator sessions to stop"
ansible.builtin.set_fact:
excess_sessions_to_stop: "{{ excess_sessions_to_stop | default([]) + [item] }}"
vars:
worker_num_str: "{{ item | regex_replace('^stress-download-worker-', '') }}"
when: worker_num_str is number and (worker_num_str | int > (download_workers_total | int))
loop: "{{ running_sessions.stdout_lines }}"
loop_control:
label: "Identifying excess session: {{ item }}"
- name: "Get PIDs for excess download workers"
ansible.builtin.shell:
cmd: |
PANE_PID=$(tmux list-panes -s "{{ item }}" -F '#{pane_pid}' | head -n 1)
if [ -n "$PANE_PID" ]; then
pgrep -P "$PANE_PID" || true
fi
register: excess_pids_raw
loop: "{{ excess_sessions_to_stop | default([]) }}"
changed_when: false
ignore_errors: true
- name: "Set fact for PIDs to kill"
ansible.builtin.set_fact:
pids_to_kill_gracefully: "{{ excess_pids_raw.results | map(attribute='stdout') | reject('==', '') | list }}"
- name: "Gracefully terminate excess download workers"
ansible.builtin.shell:
cmd: "kill {{ item }}"
loop: "{{ pids_to_kill_gracefully | default([]) }}"
when: pids_to_kill_gracefully is defined and pids_to_kill_gracefully | length > 0
ignore_errors: true
changed_when: false
- name: "Wait for graceful shutdown of excess workers"
ansible.builtin.pause:
seconds: "{{ graceful_shutdown_timeout_seconds }}"
when: pids_to_kill_gracefully is defined and pids_to_kill_gracefully | length > 0
- name: "Force kill any lingering excess workers"
ansible.builtin.shell:
cmd: "kill -9 {{ item }}"
loop: "{{ pids_to_kill_gracefully | default([]) }}"
when: pids_to_kill_gracefully is defined and pids_to_kill_gracefully | length > 0
ignore_errors: true
changed_when: false
- name: "Kill tmux sessions for excess workers"
ansible.builtin.shell:
cmd: "tmux kill-session -t {{ item }}"
loop: "{{ excess_sessions_to_stop | default([]) }}"
when: excess_sessions_to_stop is defined and excess_sessions_to_stop | length > 0
ignore_errors: true
changed_when: false
- name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix=worker"
-e "display_prefix={{ combined_prefixes }}"
-e "worker_num={{ worker_num }}"
{% if dummy_batch | default(false) %}-e "dummy_batch=true"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
when: download_workers_total | default(0) | int > 0
loop: "{{ range(1, (download_workers_total | default(1) | int) + 1) | list }}"
loop_control:
loop_var: worker_num
label: "worker {{ worker_num }}"
- name: "Start parallel download simulators for each profile"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ item }}"
{% if dummy_batch | default(false) %}-e "dummy_batch=true"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
loop: "{{ profile_prefixes }}"
loop_control:
loop_var: item
label: "profile: {{ item }}"
when: (download_workers_total | default(0) | int == 0) and (download_workers_per_profile | default(0) | int > 0)
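# Illustration of the scale-down path above (assumed values, not part of the file): with
# download_workers_total=2 and sessions stress-download-worker-1 .. stress-download-worker-4 running,
# worker-3 and worker-4 have numeric suffixes above the target count, so they are the ones the
# identification step is meant to select; their child processes are killed (gracefully, then -9 after
# graceful_shutdown_timeout_seconds), their tmux sessions removed, and workers 1..2 are then started
# via playbook-stress-download-simulation.yml with profile_prefix=worker.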

View File

@ -15,3 +15,7 @@ AWS_REGION={{ vault_s3_delivery_aws_region }}
ACCOUNT_ACTIVE_DURATION_MIN=7
ACCOUNT_COOLDOWN_DURATION_MIN=30
STRESS_POLICY_INBOX_QUEUE=dev_stress_inbox
# --- Stress Test Environment Names ---
YTOPS_AUTH_ENV={{ stress_auth_env | default('sim_auth') }}
YTOPS_DOWNLOAD_ENV={{ stress_download_env | default('sim_download') }}
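# Illustration: with no overrides these two template lines render to
#   YTOPS_AUTH_ENV=sim_auth
#   YTOPS_DOWNLOAD_ENV=sim_download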

View File

@ -121,20 +121,214 @@ def generate_policy(cluster_config, output_path):
print(f"Successfully generated profile setup policy at: {output_path}") print(f"Successfully generated profile setup policy at: {output_path}")
def generate_enforcer_policy(cluster_config, output_path):
"""Generate the enforcer policy file."""
all_workers = cluster_config.get('workers', {})
enforcement_pools = []
for worker_name, worker_config in sorted(all_workers.items()):
all_prefixes = []
for pool in worker_config.get('profile_pools', []):
all_prefixes.extend(pool.get('prefixes', []))
if not all_prefixes:
continue
pool_entry = OrderedDict([
('name', f"server_{worker_name}"),
('profile_group_patterns', sorted(list(set(all_prefixes)))),
('max_active_profiles', 1)
])
enforcement_pools.append(pool_entry)
with open(output_path, 'w') as f:
f.write("# Policy for the unified simulation enforcer.\n")
f.write("# This file is used by `bin/ytops-client policy-enforcer --live` to manage\n")
f.write("# both the authentication and download simulation environments from a single process.\n\n")
f.write("# !!! THIS FILE IS AUTO-GENERATED by tools/generate-profile-setup-policy.py !!!\n")
f.write("# !!! DO NOT EDIT. Your changes will be overwritten. !!!\n")
f.write("# !!! Edit cluster.green.yml and re-run the generator instead. !!!\n\n")
f.write("simulation_parameters:\n")
f.write(" # --- Common Redis settings for all tools ---\n")
f.write(" # The enforcer will connect to two different Redis environments (key prefixes)\n")
f.write(" # based on these settings, applying the corresponding policies to each.\n")
f.write(' env_file: ".env"\n')
f.write(' auth_env: "sim_auth"\n')
f.write(' download_env: "sim_download"\n')
f.write(" \n")
f.write(" # How often the enforcer should wake up and apply all policies.\n")
f.write(" interval_seconds: 2\n\n")
f.write("# --- Common & Pool-specific Settings ---\n")
f.write("# Common settings are applied to all profile groups discovered via the pools below.\n")
f.write("# A pool can optionally override these settings by defining its own 'group_settings' block.\n")
f.write("common_group_settings:\n")
f.write(" auth:\n")
f.write(" max_active_profiles: 1\n")
f.write(" rotate_after_requests: 5\n")
f.write(" rest_duration_minutes_on_rotation: 0.20\n")
f.write(" wait_download_finish_per_group: true\n")
f.write(" max_wait_for_downloads_minutes: 240\n")
f.write(" download:\n")
f.write(" max_active_profiles: 1\n")
f.write(" rotate_after_requests: 0\n")
f.write(" rest_duration_minutes_on_rotation: 0.2\n\n")
f.write("# Defines pools of profile groups with their own concurrency limits.\n")
f.write("enforcement_pools:\n")
for pool in enforcement_pools:
f.write(f' - name: "{pool["name"]}"\n')
patterns_str = ", ".join([f'"{p}"' for p in pool['profile_group_patterns']])
f.write(f' profile_group_patterns: [{patterns_str}]\n')
f.write(f' max_active_profiles: {pool["max_active_profiles"]}\n')
rest_of_file = """
# --- Policies for the Authentication Simulation ---
auth_policy_enforcer_config:
# Ban if 2 failures occur within a 1-minute window.
#ban_on_failures: 2
#ban_on_failures_window_minutes: 1
# The standard rest policy is disabled, as rotation is handled by the profile group.
# New rate limit policy to enforce requests-per-hour limits.
# For guest sessions, the limit is ~300 videos/hour.
rate_limit_requests: 0
rate_limit_window_minutes: 60
rate_limit_rest_duration_minutes: 5
rest_after_requests: 0
rest_duration_minutes: 10
# NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
# sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
# For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
# The settings below should be configured to respect these limits.
# New setting for load balancing across profile groups.
# "longest_idle": Activates the profile that has been idle the longest across all groups (based on last_used time).
# This is a global FIFO strategy that effectively cycles through profiles regardless of their group.
# "least_loaded": Prioritizes activating a profile from the group with the fewest pending downloads.
# If multiple groups have zero pending downloads, it acts as a FIFO queue, activating
# the one that finished its last download batch the earliest. This is useful when you want
# to ensure a group finishes its entire workload before another group starts.
profile_selection_strategy: "longest_idle"
# The 'global_max_active_profiles' setting is now superseded by the per-pool limits
# defined in the 'enforcement_pools' section.
# The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
# The enforcer logic should be updated to read from there.
proxy_work_minutes: 0
proxy_rest_duration_minutes: 0
# Global maximum time a proxy can be active before being rested, regardless of
# other rules. Acts as a safety net. Set to 0 to disable.
max_global_proxy_active_minutes: 0
rest_duration_on_max_active: 10
# Proxy-level ban on failure burst is disabled.
proxy_ban_on_failures: 0
proxy_ban_window_minutes: 2
# Clean up locks held for more than 16 minutes (960s) to prevent stuck workers.
# This should be longer than the docker container timeout (15m).
unlock_stale_locks_after_seconds: 960
# A short post-task cooldown for auth simulation profiles. When a batch is finished,
# the profile is put into COOLDOWN briefly. This prevents a worker from immediately
# re-locking the same profile, giving the policy enforcer a window to perform rotation.
unlock_cooldown_seconds: 0
# --- Cross-simulation synchronization ---
# This section is simplified because the link between auth and download profiles
# is now defined in the `profile_group_definitions`.
cross_simulation_sync:
# Which states to synchronize from auth to download.
sync_states:
- "BANNED"
# If true, a BANNED state on an auth profile will force the download profile to also be BANNED.
enforce_auth_lead: true
# CRITICAL: Ensures the correct download profile GROUP is active.
sync_active_profile: true
# When an auth profile is in the 'waiting_downloads' state, ensure the matching download profile is active.
sync_waiting_downloads: true
# --- Policies for the Download Simulation ---
download_policy_enforcer_config:
# Ban if 1 failure occurs within a 1-minute window.
ban_on_failures: 1
ban_on_failures_window_minutes: 1
# Standard rest policy is disabled in favor of group rotation.
# New rate limit policy to enforce requests-per-hour limits.
# For guest sessions, the limit is ~300 videos/hour. We set it slightly lower to be safe.
rate_limit_requests: 280
rate_limit_window_minutes: 60
rate_limit_rest_duration_minutes: 5
rest_after_requests: 0
rest_duration_minutes: 20
# NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
# sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
# For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
# The settings below should be configured to respect these limits.
# The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
# The enforcer logic should be updated to read from there.
# Time-based proxy rules are disabled.
proxy_work_minutes: 0
proxy_rest_duration_minutes: 10
# Global maximum time a proxy can be active before being rested, regardless of
# other rules. Acts as a safety net. Set to 0 to disable.
max_global_proxy_active_minutes: 0
rest_duration_on_max_active: 10
# Proxy-level ban on failure burst is disabled.
proxy_ban_on_failures: 3
proxy_ban_window_minutes: 1
# Clean up download locks held for more than 16 minutes (960s) to allow for long downloads.
# This should be longer than the docker container timeout (15m).
unlock_stale_locks_after_seconds: 960
# After a profile is used for a download, unlock it but put it in COOLDOWN
# state for 2-3s. This is enforced by the worker, which reads this config from Redis.
unlock_cooldown_seconds: [2, 3]
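# Illustrative: with [2, 3] the worker applies a cooldown of roughly 2-3 seconds
# (presumably a value chosen within that range) after each unlock, so the profile it
# just released is not immediately re-locked by the next task on the same worker.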
"""
    with open(output_path, 'a') as f:
        f.write(rest_of_file)
    print(f"Successfully generated enforcer policy at: {output_path}")

def main():
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print("Usage: ./tools/generate-profile-setup-policy.py <cluster-config-file> <output-profile-policy-file> [<output-enforcer-policy-file>]")
        sys.exit(1)
    config_path = sys.argv[1]
    profile_output_path = sys.argv[2]
    if not os.path.exists(config_path):
        print(f"Error: Cluster configuration file not found at '{config_path}'", file=sys.stderr)
        sys.exit(1)
    cluster_config = load_cluster_config(config_path)
    generate_policy(cluster_config, profile_output_path)
    if len(sys.argv) == 4:
        enforcer_output_path = sys.argv[3]
        generate_enforcer_policy(cluster_config, enforcer_output_path)

if __name__ == "__main__":
    main()
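The two profile selection strategies described in the generated enforcer policy above can be pictured with a small, self-contained sketch. The record fields (`last_used`, `group`) and the `pending_per_group` mapping are illustrative stand-ins, not the enforcer's actual data structures.

```python
import time

# Illustrative profile records; the field names are assumptions for this sketch only.
profiles = [
    {"name": "user31_1", "group": "user31", "last_used": time.time() - 600},
    {"name": "user32_1", "group": "user32", "last_used": time.time() - 60},
]
# Pending downloads per group, as the enforcer might see them.
pending_per_group = {"user31": 5, "user32": 0}

def longest_idle(candidates):
    # Global FIFO: the profile that has been idle the longest comes first.
    return sorted(candidates, key=lambda p: p["last_used"])

def least_loaded(candidates):
    # Prefer profiles whose group has the fewest pending downloads;
    # break ties by idle time so drained groups still rotate fairly.
    return sorted(candidates, key=lambda p: (pending_per_group.get(p["group"], 0), p["last_used"]))

print([p["name"] for p in longest_idle(profiles)])  # ['user31_1', 'user32_1']
print([p["name"] for p in least_loaded(profiles)])  # ['user32_1', 'user31_1']
```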

View File

@ -206,6 +206,18 @@ queue_policy:
  # Example: formats_to_download: ["140-dashy", "299-dashy"]
  formats_to_download: "from_download_policy"
  # Whether to report completion back to a queue. Always reported for auth.
  report_completion: true
  # Queue to report completion to
  completion_queue: "queue2_auth_completed"
  # Queue to report failures to
  failure_queue: "queue2_auth_fail"
  # Queue to report skipped tasks to
  skipped_queue: "queue2_auth_skipped"

simulation_parameters:
  auth_env: "sim_auth"
  download_env: "sim_download"
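A minimal sketch of how a worker might report a finished or failed task to the queues configured above, assuming the queues are plain Redis lists and that a small JSON payload is acceptable; neither assumption is guaranteed by this policy file alone.

```python
import json
import time

import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

def report_result(task_id: str, ok: bool, detail: str = "") -> None:
    # Push a small JSON payload onto the queue named in the policy above.
    queue = "queue2_auth_completed" if ok else "queue2_auth_fail"
    payload = {"task_id": task_id, "ok": ok, "detail": detail, "ts": time.time()}
    r.lpush(queue, json.dumps(payload))

report_result("task-0001", ok=True)  # the task id is a placeholder
```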

View File

@ -13,13 +13,10 @@ simulation_parameters:
# How often the enforcer should wake up and apply all policies.
interval_seconds: 2

# --- Common & Pool-specific Settings ---
# Common settings are applied to all profile groups discovered via the pools below.
# A pool can optionally override these settings by defining its own 'group_settings' block.
common_group_settings:
  auth:
    max_active_profiles: 1
    rotate_after_requests: 5
@ -27,9 +24,19 @@ profile_group_templates:
    wait_download_finish_per_group: true
    max_wait_for_downloads_minutes: 240
  download:
    max_active_profiles: 1
    rotate_after_requests: 0
    rest_duration_minutes_on_rotation: 0.2

# Defines pools of profile groups with their own concurrency limits.
enforcement_pools:
  - name: "server_dl003_pool"
    profile_group_patterns: ["user31", "user32"]
    max_active_profiles: 1
  - name: "server_dl006_pool"
    profile_group_patterns: ["user61", "user62"]
    max_active_profiles: 1

# --- Policies for the Authentication Simulation ---
auth_policy_enforcer_config:
@ -62,9 +69,8 @@ auth_policy_enforcer_config:
  # to ensure a group finishes its entire workload before another group starts.
  profile_selection_strategy: "longest_idle"

  # The 'global_max_active_profiles' setting is now superseded by the per-pool limits
  # defined in the 'enforcement_pools' section.

  # The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
  # The enforcer logic should be updated to read from there.
@ -106,6 +112,7 @@ cross_simulation_sync:
# --- Policies for the Download Simulation ---
download_policy_enforcer_config:
  # Ban if 1 failure occurs within a 1-minute window.
  ban_on_failures: 1
  ban_on_failures_window_minutes: 1
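How a discovered profile prefix ends up in one of these pools can be sketched with `fnmatch`, mirroring the glob-style `profile_group_patterns` used above; the prefix list in the example is illustrative.

```python
import fnmatch

enforcement_pools = [
    {"name": "server_dl003_pool", "profile_group_patterns": ["user31", "user32"], "max_active_profiles": 1},
    {"name": "server_dl006_pool", "profile_group_patterns": ["user61", "user62"], "max_active_profiles": 1},
]

def pool_for_prefix(prefix):
    # Return the name of the first pool whose patterns match the discovered prefix.
    for pool in enforcement_pools:
        if any(fnmatch.fnmatch(prefix, pat) for pat in pool.get("profile_group_patterns", [])):
            return pool["name"]
    return None

for prefix in ["user31", "user61", "user99"]:
    print(prefix, "->", pool_for_prefix(prefix))
# user31 -> server_dl003_pool, user61 -> server_dl006_pool, user99 -> None
```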

View File

@ -68,6 +68,9 @@ class PolicyEnforcer:
all_profiles_list = self.manager.list_profiles() all_profiles_list = self.manager.list_profiles()
all_profiles_map = {p['name']: p for p in all_profiles_list} all_profiles_map = {p['name']: p for p in all_profiles_list}
# Sync profile states from their assigned proxy's state (e.g., if proxy is BANNED, ban profile).
self.enforce_proxy_state_on_profiles(all_profiles_list, all_profiles_map)
# Apply profile group policies (rotation, max_active). This will modify the local `all_profiles_map`. # Apply profile group policies (rotation, max_active). This will modify the local `all_profiles_map`.
self.enforce_profile_group_policies(getattr(args, 'profile_groups', []), all_profiles_map, args) self.enforce_profile_group_policies(getattr(args, 'profile_groups', []), all_profiles_map, args)
@ -197,12 +200,26 @@ class PolicyEnforcer:
live_active_counts[group_name] = count live_active_counts[group_name] = count
logger.debug(f"Initial live active counts: {live_active_counts}") logger.debug(f"Initial live active counts: {live_active_counts}")
# --- New Enforcement Pool and Global Max Active Logic ---
enforcement_pools = getattr(args, 'enforcement_pools', [])
live_pool_active_counts = {}
if enforcement_pools:
for i, pool in enumerate(enforcement_pools):
pool_name = pool.get('name', f'pool_{i}')
live_pool_active_counts[pool_name] = 0
for group_name, count in live_active_counts.items():
group_policy = next((g for g in profile_groups if g.get('name') == group_name), {})
pool_name = group_policy.get('pool_name')
if pool_name:
live_pool_active_counts[pool_name] = live_pool_active_counts.get(pool_name, 0) + count
logger.debug(f"Initial live pool active counts: {live_pool_active_counts}")
global_max_active = getattr(args, 'global_max_active_profiles', 0)
live_global_active_count = sum(live_active_counts.values())
if global_max_active > 0:
    logger.debug(f"Enforcing global max active profiles limit of {global_max_active}. Current global active: {live_global_active_count}")
# --- End New Logic ---
# --- End group logic setup --- # --- End group logic setup ---
@ -278,7 +295,7 @@ class PolicyEnforcer:
# --- End new logic --- # --- End new logic ---
# --- New Sorting Logic based on Profile Selection Strategy --- # --- New Sorting Logic based on Profile Selection Strategy ---
strategy = getattr(args, 'profile_selection_strategy', None)
if strategy == 'least_loaded' and profile_groups: if strategy == 'least_loaded' and profile_groups:
logger.debug("Applying 'least_loaded' profile selection strategy.") logger.debug("Applying 'least_loaded' profile selection strategy.")
# Separate profiles that are ready from those that are not # Separate profiles that are ready from those that are not
@ -345,10 +362,8 @@ class PolicyEnforcer:
profiles_to_check = sorted_ready_profiles + not_ready_profiles profiles_to_check = sorted_ready_profiles + not_ready_profiles
logger.debug(f"Activation candidates for 'least_loaded' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}") logger.debug(f"Activation candidates for 'least_loaded' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}")
elif strategy == 'longest_idle':
    logger.debug("Applying 'longest_idle' profile selection strategy.")
# Separate profiles that are ready to be activated from those still resting. # Separate profiles that are ready to be activated from those still resting.
# A profile waiting for downloads is NOT considered ready for activation. # A profile waiting for downloads is NOT considered ready for activation.
ready_profiles = [ ready_profiles = [
@ -369,6 +384,11 @@ class PolicyEnforcer:
# The final list to check will process all ready profiles first, then wait for the not-ready ones. # The final list to check will process all ready profiles first, then wait for the not-ready ones.
profiles_to_check = ready_profiles + not_ready_profiles profiles_to_check = ready_profiles + not_ready_profiles
logger.debug(f"Activation candidates for 'longest_idle' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}") logger.debug(f"Activation candidates for 'longest_idle' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}")
else: # Default sort (no strategy)
if strategy: # Log a warning if an unknown strategy was provided
logger.warning(f"Unknown profile_selection_strategy '{strategy}'. Using default FIFO sort by rest time.")
# Default to a simple FIFO sort based on when their rest period ends.
profiles_to_check.sort(key=lambda p: (p.get('rest_until', 0), natural_sort_key(p.get('name', ''))))
# --- End New Sorting Logic --- # --- End New Sorting Logic ---
# --- New logic: Identify groups with waiting profiles --- # --- New logic: Identify groups with waiting profiles ---
@ -490,13 +510,30 @@ class PolicyEnforcer:
profile_name = profile['name'] profile_name = profile['name']
group_name = profile_to_group_map.get(profile_name) group_name = profile_to_group_map.get(profile_name)
# --- New Pool and Global Max Active Check ---
is_new_activation = profile['state'] == ProfileState.RESTING.value
if is_new_activation:
    # Check pool limits first
if enforcement_pools and group_name:
group_policy = next((g for g in profile_groups if g.get('name') == group_name), {})
pool_name = group_policy.get('pool_name')
pool_config = None
for i, p in enumerate(enforcement_pools):
if p.get('name', f'pool_{i}') == pool_name:
pool_config = p
break
if pool_config:
pool_max_active = pool_config.get('max_active_profiles', 0)
current_pool_active = live_pool_active_counts.get(pool_name, 0)
if pool_max_active > 0 and current_pool_active >= pool_max_active:
logger.debug(f"Profile '{profile_name}' rest ended, but pool '{pool_name}' max active limit ({pool_max_active}) has been reached. Deferring activation.")
continue
# Then check global limit if it's still configured (for backward compatibility)
if global_max_active > 0 and live_global_active_count >= global_max_active:
logger.debug(f"Profile '{profile_name}' rest ended, but global max active limit ({global_max_active}) has been reached. Deferring activation.") logger.debug(f"Profile '{profile_name}' rest ended, but global max active limit ({global_max_active}) has been reached. Deferring activation.")
continue continue
# --- End New Global Check --- # --- End New Check ---
# --- Group-aware unrest check --- # --- Group-aware unrest check ---
if group_name: if group_name:
@ -577,11 +614,13 @@ class PolicyEnforcer:
continue # Skip activation for this profile continue # Skip activation for this profile
# --- End group check --- # --- End group check ---
# Before activating, ensure the profile's proxy is not resting or banned.
proxy_url = profile.get('proxy') proxy_url = profile.get('proxy')
if proxy_url: if proxy_url:
proxy_state_data = proxy_states.get(proxy_url, {}) proxy_state_data = proxy_states.get(proxy_url, {})
proxy_state = proxy_state_data.get('state')
if proxy_state == ProfileState.RESTING.value:
logger.debug(f"Profile '{profile['name']}' rest period ended, but its proxy '{proxy_url}' is still resting. Deferring activation.") logger.debug(f"Profile '{profile['name']}' rest period ended, but its proxy '{proxy_url}' is still resting. Deferring activation.")
# Update reason for clarity in the UI when a profile is blocked by its proxy. # Update reason for clarity in the UI when a profile is blocked by its proxy.
@ -596,10 +635,32 @@ class PolicyEnforcer:
continue # Do not activate this profile yet. continue # Do not activate this profile yet.
elif proxy_state == ProfileState.BANNED.value and profile['state'] != ProfileState.BANNED.value:
# This profile is about to be activated, but its proxy is banned. Ban it.
reason = f"Proxy '{proxy_url}' is BANNED"
logger.warning(f"Banning profile '{profile['name']}' because its proxy is banned: {reason}")
self.actions_taken_this_cycle += 1
if not self.dry_run:
sm = self.manager.get_state_machine(profile_name)
if sm:
sm.ban(reason=reason)
# Update local map
all_profiles_map[profile_name]['state'] = ProfileState.BANNED.value
all_profiles_map[profile_name]['reason'] = reason
continue # Skip activation; it is now banned.
# Update group and pool counters BEFORE making any changes, so subsequent checks in this cycle use the updated count
if group_name and profile['state'] == ProfileState.RESTING.value: if group_name and profile['state'] == ProfileState.RESTING.value:
# For RESTING profiles, they're becoming active, so increment the count # For RESTING profiles, they're becoming active, so increment the count
live_active_counts[group_name] = live_active_counts.get(group_name, 0) + 1 live_active_counts[group_name] = live_active_counts.get(group_name, 0) + 1
# Also increment the pool counter
if enforcement_pools:
group_policy = next((g for g in profile_groups if g.get('name') == group_name), {})
pool_name = group_policy.get('pool_name')
if pool_name:
live_pool_active_counts[pool_name] = live_pool_active_counts.get(pool_name, 0) + 1
# Also increment the global counter # Also increment the global counter
if global_max_active > 0: if global_max_active > 0:
live_global_active_count += 1 live_global_active_count += 1
@ -903,20 +964,68 @@ class PolicyEnforcer:
if num_active_or_locked == 0: if num_active_or_locked == 0:
logger.debug(f"Group '{group_name}' has no active profiles. `enforce_unrest_policy` will attempt to activate one.") logger.debug(f"Group '{group_name}' has no active profiles. `enforce_unrest_policy` will attempt to activate one.")
# --- 4. Pool and Global Self-Healing ---
enforcement_pools = getattr(args, 'enforcement_pools', [])
if enforcement_pools:
for i, pool in enumerate(enforcement_pools):
pool_name = pool.get('name', f'pool_{i}')
pool_max_active = pool.get('max_active_profiles', 0)
if not pool_max_active or pool_max_active <= 0:
continue
# Get all profile names belonging to this pool
pool_profile_names = set()
for group in profile_groups:
if group.get('pool_name') == pool_name:
prefix = group.get('prefix')
if prefix:
for p_name in all_profiles_map:
if p_name.startswith(prefix):
pool_profile_names.add(p_name)
# Get current active count for this pool from our local map
current_pool_active = [
p for name, p in all_profiles_map.items()
if name in pool_profile_names and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value, ProfileState.COOLDOWN.value]
]
num_pool_active = len(current_pool_active)
if num_pool_active > pool_max_active:
logger.warning(f"Pool Healing ('{pool_name}'): Found {num_pool_active} active profiles, but pool max is {pool_max_active}. Resting excess.")
profiles_that_can_be_rested = [p for p in current_pool_active if p['state'] == ProfileState.ACTIVE.value]
profiles_that_can_be_rested.sort(key=lambda p: natural_sort_key(p.get('name', '')), reverse=True)
profiles_that_can_be_rested.sort(key=lambda p: (
p.get('success_count', 0) + p.get('failure_count', 0) +
p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)
), reverse=True)
num_to_rest = num_pool_active - pool_max_active
for profile in profiles_that_can_be_rested[:num_to_rest]:
logger.warning(f"Pool Healing ('{pool_name}'): Resting profile '{profile['name']}'.")
self.actions_taken_this_cycle += 1
if not self.dry_run:
sm = self.manager.get_state_machine(profile['name'])
if sm:
sm.rest(reason=f"Pool '{pool_name}' max_active healing", duration_minutes=0.02)
all_profiles_map[profile['name']]['state'] = ProfileState.RESTING.value
all_profiles_map[profile['name']]['rest_reason'] = f"Pool '{pool_name}' max_active healing"
all_profiles_map[profile['name']]['rest_until'] = time.time() + (0.02 * 60)
# For backward compatibility, also enforce global_max_active_profiles if it is set.
global_max_active = getattr(args, 'global_max_active_profiles', 0)
if global_max_active > 0:
    # Get all profiles managed by any group
    all_grouped_profiles = set()
    for group in profile_groups:
        if 'prefix' in group:
            prefix = group['prefix']
            for p_name in all_profiles_map:
                if p_name.startswith(prefix):
                    all_grouped_profiles.add(p_name)
        elif 'profiles' in group:
            all_grouped_profiles.update(group['profiles'])

    # Get current active count across all groups from our local map
    current_global_active = [
@ -928,29 +1037,23 @@ class PolicyEnforcer:
if num_global_active > global_max_active:
    logger.warning(f"Global Healing: Found {num_global_active} active profiles across all groups, but global max is {global_max_active}. Resting excess.")
    profiles_that_can_be_rested = [p for p in current_global_active if p['state'] == ProfileState.ACTIVE.value]
    profiles_that_can_be_rested.sort(key=lambda p: natural_sort_key(p.get('name', '')), reverse=True)
    profiles_that_can_be_rested.sort(key=lambda p: (
        p.get('success_count', 0) + p.get('failure_count', 0) +
        p.get('tolerated_error_count', 0) +
        p.get('download_count', 0) + p.get('download_error_count', 0)
    ), reverse=True)
    num_to_rest = num_global_active - global_max_active
    for profile in profiles_that_can_be_rested[:num_to_rest]:
        logger.warning(f"Global Healing: Resting profile '{profile['name']}'.")
        self.actions_taken_this_cycle += 1
        if not self.dry_run:
            sm = self.manager.get_state_machine(profile['name'])
            if sm:
                sm.rest(reason="Global max_active healing", duration_minutes=0.02)
        all_profiles_map[profile['name']]['state'] = ProfileState.RESTING.value
        all_profiles_map[profile['name']]['rest_reason'] = "Global max_active healing"
        all_profiles_map[profile['name']]['rest_until'] = time.time() + (0.02 * 60)
@ -1062,6 +1165,10 @@ class PolicyEnforcer:
for proxy_url, state_data in proxy_states.items(): for proxy_url, state_data in proxy_states.items():
state = state_data.get('state', ProfileState.ACTIVE.value) state = state_data.get('state', ProfileState.ACTIVE.value)
if state == ProfileState.BANNED.value:
logger.debug(f"Proxy '{proxy_url}' is BANNED. Skipping work/rest cycle enforcement.")
continue
# Un-rest logic # Un-rest logic
if state == ProfileState.RESTING.value: if state == ProfileState.RESTING.value:
rest_until = state_data.get('rest_until', 0) rest_until = state_data.get('rest_until', 0)
@ -1257,6 +1364,60 @@ class PolicyEnforcer:
return True # Indicates action was taken return True # Indicates action was taken
return False return False
def enforce_proxy_state_on_profiles(self, all_profiles_list, all_profiles_map):
"""
Enforces the state of a proxy onto all profiles that use it.
- If a proxy is BANNED, any non-banned profile using it will be banned.
- If a proxy is RESTING, any ACTIVE profile using it will be rested.
This is a safeguard that runs after all proxy state changes and before profile
state logic.
"""
unique_proxies = sorted(list(set(p['proxy'] for p in all_profiles_list if p.get('proxy'))))
if not unique_proxies:
return
proxy_states = self.manager.get_proxy_states(unique_proxies)
for profile in all_profiles_list:
proxy_url = profile.get('proxy')
if not proxy_url:
continue
proxy_state_data = proxy_states.get(proxy_url)
if not proxy_state_data:
continue
proxy_state = proxy_state_data.get('state')
profile_name = profile['name']
if proxy_state == ProfileState.BANNED.value and profile['state'] != ProfileState.BANNED.value:
reason = f"Proxy '{proxy_url}' is BANNED"
logger.warning(f"Banning profile '{profile_name}' because its proxy is banned: {reason}")
self.actions_taken_this_cycle += 1
if not self.dry_run:
sm = self.manager.get_state_machine(profile_name)
if sm:
sm.ban(reason=reason)
# Update local map for consistency in this cycle
all_profiles_map[profile_name]['state'] = ProfileState.BANNED.value
all_profiles_map[profile_name]['reason'] = reason
elif proxy_state == ProfileState.RESTING.value and profile['state'] == ProfileState.ACTIVE.value:
logger.info(f"Resting profile '{profile_name}' because its proxy '{proxy_url}' is resting.")
self.actions_taken_this_cycle += 1
if not self.dry_run:
# Rest it for as long as the proxy is resting.
proxy_rest_until = proxy_state_data.get('rest_until', 0)
duration_minutes = max(0, (proxy_rest_until - time.time()) / 60)
sm = self.manager.get_state_machine(profile_name)
if sm:
sm.rest(reason=self.PROXY_REST_REASON, duration_minutes=duration_minutes)
# Update local map for consistency in this cycle
proxy_rest_until = proxy_state_data.get('rest_until', 0)
all_profiles_map[profile_name]['state'] = ProfileState.RESTING.value
all_profiles_map[profile_name]['rest_reason'] = self.PROXY_REST_REASON
all_profiles_map[profile_name]['rest_until'] = proxy_rest_until
def add_policy_enforcer_parser(subparsers): def add_policy_enforcer_parser(subparsers):
"""Adds the parser for the 'policy-enforcer' command.""" """Adds the parser for the 'policy-enforcer' command."""
parser = subparsers.add_parser( parser = subparsers.add_parser(
@ -1421,6 +1582,10 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F
logger.debug("Syncing active profiles from Auth to Download simulation...") logger.debug("Syncing active profiles from Auth to Download simulation...")
# Get all download proxy states once for efficiency
all_dl_proxies = sorted(list(set(p['proxy'] for p in all_download_profiles.values() if p.get('proxy'))))
all_dl_proxy_states = download_manager.get_proxy_states(all_dl_proxies)
# Get profiles that should be active in the download simulation # Get profiles that should be active in the download simulation
target_active_download_profiles = set() target_active_download_profiles = set()
@ -1465,6 +1630,13 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F
logger.warning(f"Auth profile '{target_profile_name}' needs an active download profile, but no corresponding download profile found.") logger.warning(f"Auth profile '{target_profile_name}' needs an active download profile, but no corresponding download profile found.")
continue continue
# Check proxy state before activating
proxy_url = download_profile.get('proxy')
proxy_state = all_dl_proxy_states.get(proxy_url, {}).get('state')
if proxy_state in [ProfileState.BANNED.value, ProfileState.RESTING.value]:
logger.debug(f"Sync: Deferring activation of download profile '{target_profile_name}' because its proxy '{proxy_url}' is {proxy_state}.")
continue
if download_profile['state'] not in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]: if download_profile['state'] not in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
is_from_cooldown = download_profile['state'] == ProfileState.COOLDOWN.value is_from_cooldown = download_profile['state'] == ProfileState.COOLDOWN.value
log_msg_suffix = " (from COOLDOWN)" if is_from_cooldown else "" log_msg_suffix = " (from COOLDOWN)" if is_from_cooldown else ""
@ -1476,24 +1648,12 @@ def sync_cross_simulation(auth_manager, download_manager, sync_config, dry_run=F
sm.activate(profile=download_profile) sm.activate(profile=download_profile)
# --- Group-Aware Deactivation ---
# Deactivate any download profiles that are active but are not the target profile for their group.
# This ensures that if auth wants 'user1_1' to be active, the currently active 'user1_0' is rested first.
target_download_groups = set()
for target_profile_name in target_active_download_profiles:
group_info = dl_profile_to_group.get(target_profile_name)
if group_info:
target_download_groups.add(group_info['name'])
logger.debug(f"Target download groups for this sync cycle: {target_download_groups}")
# Deactivate any download profiles that are active but are not in a target group
for dl_profile_name, dl_profile in all_download_profiles.items(): for dl_profile_name, dl_profile in all_download_profiles.items():
if dl_profile['state'] == ProfileState.ACTIVE.value: if dl_profile['state'] == ProfileState.ACTIVE.value:
group_info = dl_profile_to_group.get(dl_profile_name) if dl_profile_name not in target_active_download_profiles:
# If the profile is in a group, and that group is NOT a target group, rest it. logger.info(f"Syncing active state: Resting download profile '{dl_profile_name}' as it is no longer the target active profile for its group.")
if group_info and group_info['name'] not in target_download_groups:
logger.info(f"Syncing active state: Resting download profile '{dl_profile_name}' as its group '{group_info['name']}' is no longer active.")
if not dry_run: if not dry_run:
sm = download_manager.get_state_machine(dl_profile_name) sm = download_manager.get_state_machine(dl_profile_name)
if sm: if sm:
@ -1539,9 +1699,9 @@ def main_policy_enforcer(args):
    'unlock_stale_locks_after_seconds': 120,
    'unlock_cooldown_seconds': 0,
    'max_global_proxy_active_minutes': 0, 'rest_duration_on_max_active': 10,
    'profile_selection_strategy': None,
    'global_max_active_profiles': 0,
    'interval_seconds': 60, 'proxy_groups': [], 'profile_groups': [], 'enforcement_pools': []
}

sim_params = policy.get('simulation_parameters', {})
@ -1586,10 +1746,90 @@ def main_policy_enforcer(args):
logger.info(f"Setting up enforcer for {sim_type} simulation...") logger.info(f"Setting up enforcer for {sim_type} simulation...")
# --- Dynamic Profile Group Discovery --- # --- Hybrid Profile Group Discovery (Static + Dynamic) ---
profile_group_templates = policy.get('profile_group_templates') common_group_settings = policy.get('common_group_settings', {})
# Check if templates exist and if the config block doesn't already have groups (CLI overrides take precedence) enforcement_pools = policy.get('enforcement_pools')
if profile_group_templates and 'profile_groups' not in policy_config: profile_group_templates = policy.get('profile_group_templates') # For backward compatibility
# Start with any statically defined groups from the policy.
final_profile_groups = policy_config.get('profile_groups', [])
# If enforcement_pools are defined, discover dynamic groups and merge them.
if enforcement_pools:
logger.info(f"Found 'enforcement_pools'. Discovering dynamic profile groups to merge with static ones for {sim_type}...")
# Determine key_prefix to connect to the right Redis env
policy_env = sim_params.get(env_policy_key)
default_policy_env = sim_params.get('env')
effective_env = env_cli_arg or args.env or policy_env or default_policy_env or 'dev'
if args.key_prefix: temp_key_prefix = args.key_prefix
elif args.legacy: temp_key_prefix = 'profile_mgmt_'
else: temp_key_prefix = f"{effective_env}_profile_mgmt_"
try:
temp_manager = ProfileManager(redis_host, redis_port, redis_password, temp_key_prefix, redis_db)
all_profiles = temp_manager.list_profiles()
found_prefixes = set(p['name'].rsplit('_', 1)[0] for p in all_profiles)
if not found_prefixes:
logger.warning(f"Dynamic discovery found no profile prefixes for env '{effective_env}'.")
else:
logger.info(f"Discovered {len(found_prefixes)} unique profile prefixes: {sorted(list(found_prefixes))}")
dynamically_generated_groups = []
# Match discovered prefixes against patterns in each enforcement pool
for i, pool in enumerate(enforcement_pools):
pool_name = pool.get('name', f'pool_{i}')
pool_patterns = pool.get('profile_group_patterns', [])
# If a pool has no patterns, it's just for defining concurrency for static groups. Skip discovery.
if not pool_patterns:
continue
# Merge common settings with any pool-specific overrides
group_settings_template = deepcopy(common_group_settings)
pool_specific_settings = pool.get('group_settings', {})
# A simple way to deep merge the two levels (auth/download)
auth_settings = group_settings_template.get('auth', {})
auth_settings.update(pool_specific_settings.get('auth', {}))
group_settings_template['auth'] = auth_settings
download_settings = group_settings_template.get('download', {})
download_settings.update(pool_specific_settings.get('download', {}))
group_settings_template['download'] = download_settings
for prefix in sorted(list(found_prefixes)):
for pattern in pool_patterns:
if fnmatch.fnmatch(prefix, pattern):
sim_settings = group_settings_template.get(sim_type.lower())
if not sim_settings:
logger.debug(f"Pool '{pool_name}' has no settings for '{sim_type}'. Skipping for prefix '{prefix}'.")
continue
new_group = deepcopy(sim_settings)
new_group['prefix'] = prefix
new_group['name'] = prefix
new_group['pool_name'] = pool_name
dynamically_generated_groups.append(new_group)
logger.debug(f"Assigned prefix '{prefix}' to pool '{pool_name}' for {sim_type} simulation.")
break
if dynamically_generated_groups:
logger.info(f"Merging {len(final_profile_groups)} static group(s) with {len(dynamically_generated_groups)} discovered dynamic group(s).")
final_profile_groups.extend(dynamically_generated_groups)
except Exception as e:
logger.error(f"Failed during dynamic profile group discovery: {e}", exc_info=args.verbose)
# Update the policy_config with the final merged list and the pool definitions
policy_config['profile_groups'] = final_profile_groups
# CRITICAL: Deepcopy enforcement_pools to prevent modification in one simulation
# from affecting the other, since the policy object is shared.
policy_config['enforcement_pools'] = deepcopy(enforcement_pools)
# For backward compatibility with the old template format
elif profile_group_templates and 'profile_groups' not in policy_config:
logger.info(f"Found 'profile_group_templates'. Discovering profile groups dynamically for {sim_type}...") logger.info(f"Found 'profile_group_templates'. Discovering profile groups dynamically for {sim_type}...")
# Determine key_prefix to connect to the right Redis env (logic duplicated from below) # Determine key_prefix to connect to the right Redis env (logic duplicated from below)
@ -1640,6 +1880,21 @@ def main_policy_enforcer(args):
except Exception as e: except Exception as e:
logger.error(f"Failed during dynamic profile group discovery: {e}", exc_info=args.verbose) logger.error(f"Failed during dynamic profile group discovery: {e}", exc_info=args.verbose)
# In the download simulation, the active profiles are dictated entirely by the
# cross-simulation sync logic. We must disable the download enforcer's own
# concurrency limits (max_active_profiles) to prevent it from "healing"
# profiles that the sync logic has correctly activated.
if sim_type == 'Download':
logger.info("Disabling max_active_profiles limits for Download simulation. Active profiles will be managed by cross-sim sync.")
if 'profile_groups' in policy_config:
for group in policy_config['profile_groups']:
group['max_active_profiles'] = 0
if 'enforcement_pools' in policy_config:
for pool in policy_config['enforcement_pools']:
pool['max_active_profiles'] = 0
# Also disable the global limit for the download simulation.
policy_config['global_max_active_profiles'] = 0
config = Config(args, policy_config, code_defaults) config = Config(args, policy_config, code_defaults)
# Determine the effective environment name with correct precedence: # Determine the effective environment name with correct precedence:

View File

@ -18,8 +18,10 @@ import time
from datetime import datetime from datetime import datetime
from typing import Dict, List, Optional, Any from typing import Dict, List, Optional, Any
import collections import collections
import fnmatch
import redis import redis
import yaml
from .profile_statemachine import ProfileState, ProfileStateMachine from .profile_statemachine import ProfileState, ProfileStateMachine
@ -199,8 +201,8 @@ class ProfileManager:
# When decrementing, ensure the counter exists to avoid creating negative counters from stray calls. # When decrementing, ensure the counter exists to avoid creating negative counters from stray calls.
if count < 0 and not self.redis.exists(key): if count < 0 and not self.redis.exists(key):
logger.warning(f"Attempted to decrement pending downloads for '{profile_name}' by {abs(count)}, but no counter exists. No action taken.") logger.warning(f"Attempted to decrement pending downloads for '{profile_name}' by {abs(count)}, but no counter exists. This can happen in a race condition. Assuming task is complete and counter is zero.")
return None return 0
new_value = self.redis.incrby(key, count) new_value = self.redis.incrby(key, count)
@ -225,8 +227,8 @@ class ProfileManager:
# Only decrement if the key exists. This prevents stray calls from creating negative counters. # Only decrement if the key exists. This prevents stray calls from creating negative counters.
if not self.redis.exists(key): if not self.redis.exists(key):
logger.warning(f"Attempted to decrement pending downloads for '{profile_name}', but no counter exists. No action taken.") logger.warning(f"Attempted to decrement pending downloads for '{profile_name}', but no counter exists. This can happen in a race condition. Assuming task is complete and counter is zero.")
return None return 0
new_value = self.redis.decr(key) new_value = self.redis.decr(key)
@ -444,7 +446,12 @@ class ProfileManager:
logger.error(f"Invalid state: {new_state}") logger.error(f"Invalid state: {new_state}")
return False return False
profile = self.get_profile(name)
if not profile:
# get_profile already logs an error if the profile is not found.
return False
sm = self.get_state_machine(name, profile=profile)
if not sm: if not sm:
return False # get_state_machine logs the error return False # get_state_machine logs the error
@ -453,14 +460,16 @@ class ProfileManager:
return True return True
try: try:
# Pass the profile object to the transition methods for context,
# which is consistent with the policy enforcer's usage.
if new_state == ProfileState.ACTIVE.value:
    sm.activate(profile=profile)
elif new_state == ProfileState.BANNED.value:
    sm.ban(reason=reason, profile=profile)
elif new_state == ProfileState.RESTING.value:
    sm.rest(reason=reason, profile=profile)
elif new_state == ProfileState.PAUSED.value:
    sm.pause(reason=reason, profile=profile)
# LOCKED and COOLDOWN are not handled here as they are special transitions # LOCKED and COOLDOWN are not handled here as they are special transitions
# from lock_profile and unlock_profile, and should not be set directly. # from lock_profile and unlock_profile, and should not be set directly.
elif new_state in [ProfileState.LOCKED.value, ProfileState.COOLDOWN.value]: elif new_state in [ProfileState.LOCKED.value, ProfileState.COOLDOWN.value]:
@ -720,30 +729,51 @@ class ProfileManager:
logger.info(f"Deleted {deleted_count} global counter key(s).") logger.info(f"Deleted {deleted_count} global counter key(s).")
return deleted_count return deleted_count
def set_proxy_state(self, proxy_url: str, state: str, rest_duration_minutes: Optional[int] = None, reason: Optional[str] = None) -> bool:
    """Set the state of a proxy and propagates it to associated profiles."""
    if state not in [ProfileState.ACTIVE.value, ProfileState.RESTING.value, ProfileState.BANNED.value]:
        logger.error(f"Invalid proxy state: {state}. Only ACTIVE, RESTING, and BANNED are supported for proxies.")
        return False

    proxy_key = self._proxy_state_key(proxy_url)
    now = time.time()
    updates = {'state': state}
    if reason:
        updates['reason'] = reason
    else:
        # Clear reason if not provided
        updates['reason'] = ''

    rest_until = 0
    if state == ProfileState.RESTING.value:
        if rest_duration_minutes is None:
            logger.error("rest_duration_minutes is required when setting proxy state to RESTING.")
            return False
        if rest_duration_minutes == -1:
            # Use a very large number for "indefinite" to avoid special cases later.
            # 10 years should be sufficient.
            rest_until = now + (10 * 365 * 24 * 60 * 60)
        elif rest_duration_minutes > 0:
            rest_until = now + rest_duration_minutes * 60
        else:
            logger.error("rest_duration_minutes must be positive, or -1 for indefinite.")
            return False
        updates['rest_until'] = str(rest_until)
        updates['work_start_timestamp'] = '0'  # Clear work start time
    elif state in [ProfileState.ACTIVE.value, ProfileState.BANNED.value]:
        updates['rest_until'] = '0'
        if state == ProfileState.ACTIVE.value:
            updates['work_start_timestamp'] = str(now)
        else:  # BANNED
            updates['work_start_timestamp'] = '0'

    self.redis.hset(proxy_key, mapping=updates)
    log_msg = f"Set proxy '{proxy_url}' state to {state}."
    if reason:
        log_msg += f" Reason: {reason}"
    logger.info(log_msg)

    # Now, update associated profiles
    profiles_on_proxy = self.list_profiles(proxy_filter=proxy_url)
@ -751,16 +781,31 @@
        return True

    if state == ProfileState.RESTING.value:
        propagate_reason = reason or "Proxy resting"
        logger.info(f"Propagating RESTING state to profiles on proxy '{proxy_url}'. Reason: {propagate_reason}")
        for profile in profiles_on_proxy:
            if profile['state'] == ProfileState.ACTIVE.value:
                self.update_profile_state(profile['name'], ProfileState.RESTING.value, propagate_reason)
                self.update_profile_field(profile['name'], 'rest_until', str(rest_until))
    elif state == ProfileState.BANNED.value:
        propagate_reason = reason or "Proxy banned"
        logger.info(f"Propagating BANNED state to profiles on proxy '{proxy_url}'. Reason: {propagate_reason}")
        for profile in profiles_on_proxy:
            if profile['state'] != ProfileState.BANNED.value:
                self.update_profile_state(profile['name'], ProfileState.BANNED.value, propagate_reason)
    elif state == ProfileState.ACTIVE.value:
        propagate_reason = reason or "Proxy activated"
        logger.info(f"Propagating ACTIVE state to profiles on proxy '{proxy_url}'. Reason: {propagate_reason}")
        for profile in profiles_on_proxy:
            # Check for proxy-related reasons in both rest_reason and ban_reason
            proxy_related_reason = False
            if profile.get('rest_reason', '').startswith("Proxy "):
                proxy_related_reason = True
            if profile.get('ban_reason', '').startswith("Proxy "):
                proxy_related_reason = True
            if (profile['state'] in [ProfileState.RESTING.value, ProfileState.BANNED.value]) and proxy_related_reason:
                self.update_profile_state(profile['name'], ProfileState.ACTIVE.value, propagate_reason)
    return True
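# Illustrative usage (a sketch, not part of this module; connection arguments are shown
# positionally as in the enforcer's own ProfileManager(...) call, and the host, key
# prefix, and proxy URL below are placeholders):
#   mgr = ProfileManager('localhost', 6379, None, 'dev_profile_mgmt_', 0)
#   mgr.set_proxy_state('socks5://gw1.example:1080', 'RESTING',
#                       rest_duration_minutes=-1, reason='maintenance window')
#   mgr.set_proxy_state('socks5://gw1.example:1080', 'ACTIVE', reason='maintenance over')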
@ -1162,8 +1207,8 @@ def add_profile_manager_parser(subparsers):
# List command
list_parser = subparsers.add_parser('list', help='List profiles', parents=[common_parser])
list_parser.add_argument('--auth-env', default=None, help='Environment name for the Auth simulation monitor. Use with --download-env for a merged view. Defaults to YTOPS_AUTH_ENV env var.')
list_parser.add_argument('--download-env', default=None, help='Environment name for the Download simulation monitor. Use with --auth-env for a merged view. Defaults to YTOPS_DOWNLOAD_ENV env var.')
list_parser.add_argument('--separate-views', action='store_true', help='In dual-monitor mode, show two separate reports instead of a single merged view.') list_parser.add_argument('--separate-views', action='store_true', help='In dual-monitor mode, show two separate reports instead of a single merged view.')
list_parser.add_argument('--rest-after-requests', type=int, help='(For display) Show countdown to rest based on this request limit.') list_parser.add_argument('--rest-after-requests', type=int, help='(For display) Show countdown to rest based on this request limit.')
list_parser.add_argument('--state', help='Filter by state') list_parser.add_argument('--state', help='Filter by state')
@ -1177,20 +1222,24 @@ def add_profile_manager_parser(subparsers):
list_parser.add_argument('--no-blink', action='store_true', help='Use ANSI escape codes for smoother screen updates in --live mode (experimental).') list_parser.add_argument('--no-blink', action='store_true', help='Use ANSI escape codes for smoother screen updates in --live mode (experimental).')
list_parser.add_argument('--interval-seconds', type=int, default=5, help='When in --live mode, how often to refresh in seconds. Default: 5.') list_parser.add_argument('--interval-seconds', type=int, default=5, help='When in --live mode, how often to refresh in seconds. Default: 5.')
list_parser.add_argument('--hide-active-state', action='store_true', help="Display 'ACTIVE' state as blank for cleaner UI.") list_parser.add_argument('--hide-active-state', action='store_true', help="Display 'ACTIVE' state as blank for cleaner UI.")
list_parser.add_argument('--hide-ungrouped', action='store_true', help="Hide profiles that do not belong to any configured profile group (e.g., old profiles after a config change). Shown by default.")
# Get command
get_parser = subparsers.add_parser('get', help='Get profile details', parents=[common_parser])
get_parser.add_argument('name', help='Profile name')

# Set proxy state command
set_proxy_state_parser = subparsers.add_parser('set-proxy-state', help='Set the state of a proxy (or proxies) and propagate to its profiles.', parents=[common_parser])
set_proxy_state_parser.add_argument('proxy_urls', help='Proxy URL, or comma-separated list of URLs')
set_proxy_state_parser.add_argument('state', choices=['ACTIVE', 'RESTING', 'BANNED'], help='New state for the proxy')
set_proxy_state_parser.add_argument('--duration-minutes', type=int, help='Duration for the RESTING state. Use -1 for indefinite rest.')
set_proxy_state_parser.add_argument('--reason', help='Reason for the state change. Propagated to profiles.')
set_proxy_state_parser.add_argument('--auth-env', default=None, help='Target the Auth simulation environment. Can be used with --download-env. Defaults to YTOPS_AUTH_ENV env var.')
set_proxy_state_parser.add_argument('--download-env', default=None, help='Target the Download simulation environment. Can be used with --auth-env. Defaults to YTOPS_DOWNLOAD_ENV env var.')
# Update state command
update_state_parser = subparsers.add_parser('update-state', help='Update profile state for one or more profiles.', parents=[common_parser])
update_state_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')
update_state_parser.add_argument('state', choices=ProfileState.values(),
                                 help='New state')
update_state_parser.add_argument('--reason', help='Reason for state change (especially for BAN)')
@ -1202,21 +1251,21 @@ def add_profile_manager_parser(subparsers):
update_field_parser.add_argument('value', help='New value') update_field_parser.add_argument('value', help='New value')
# Pause command (convenience)
pause_parser = subparsers.add_parser('pause', help=f'Pause one or more profiles (sets state to {ProfileState.PAUSED.value}).', parents=[common_parser])
pause_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')

# Activate command (convenience)
activate_parser = subparsers.add_parser('activate', help=f'Activate one or more profiles (sets state to {ProfileState.ACTIVE.value}). Useful for resuming a PAUSED profile or fixing a stale LOCKED one.', parents=[common_parser])
activate_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')

# Ban command (convenience)
ban_parser = subparsers.add_parser('ban', help=f'Ban one or more profiles (sets state to {ProfileState.BANNED.value}).', parents=[common_parser])
ban_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')
ban_parser.add_argument('--reason', required=True, help='Reason for ban')

# Unban command (convenience)
unban_parser = subparsers.add_parser('unban', help=f'Unban one or more profiles (sets state to {ProfileState.ACTIVE.value} and resets session counters).', parents=[common_parser])
unban_parser.add_argument('names', help='Profile name, comma-separated list of names, or a pattern with wildcards (e.g., "user31_*")')
# Delete command # Delete command
delete_parser = subparsers.add_parser('delete', help='Delete a profile', parents=[common_parser]) delete_parser = subparsers.add_parser('delete', help='Delete a profile', parents=[common_parser])
@ -1228,6 +1277,15 @@ def add_profile_manager_parser(subparsers):
delete_all_parser = subparsers.add_parser('delete-all', help='(Destructive) Delete all profiles and data under the current key prefix.', parents=[common_parser]) delete_all_parser = subparsers.add_parser('delete-all', help='(Destructive) Delete all profiles and data under the current key prefix.', parents=[common_parser])
delete_all_parser.add_argument('--confirm', action='store_true', help='Confirm this highly destructive action (required)') delete_all_parser.add_argument('--confirm', action='store_true', help='Confirm this highly destructive action (required)')
# Cleanup ungrouped command
cleanup_parser = subparsers.add_parser('cleanup-ungrouped', help='(Safe) Sync profiles in Redis with the setup policy (create missing, delete extra).', parents=[common_parser])
cleanup_parser.add_argument('--policy-file', required=True, help='Path to the profile setup policy YAML file (e.g., policies/6_profile_setup_policy.yaml).')
cleanup_parser.add_argument('--auth-env', default=None, help="Environment name for the Auth simulation to clean. Defaults to YTOPS_AUTH_ENV env var.")
cleanup_parser.add_argument('--download-env', default=None, help="Environment name for the Download simulation to clean. Defaults to YTOPS_DOWNLOAD_ENV env var.")
cleanup_parser.add_argument('--dry-run', action='store_true', help="Only show which profiles would be created or deleted, don't actually change them.")
cleanup_parser.add_argument('--no-create-missing', action='store_true', help="Only delete ungrouped profiles, do not create profiles missing from Redis.")
cleanup_parser.add_argument('--disallow-cleanup-active-downloads', action='store_true', help="In paired auth/download cleanup, PREVENTS deleting a profile if its download side is ACTIVE, even if the auth side is idle. Cleanup of active downloads is allowed by default.")
# Reset global counters command
reset_global_parser = subparsers.add_parser('reset-global-counters', help='Reset global counters (e.g., failed_lock_attempts).', parents=[common_parser])
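# Illustrative: how a wildcard 'names' argument could be expanded against existing
# profiles (expand_names below is a sketch, not a helper defined in this module):
#   import fnmatch
#   def expand_names(names_arg, all_names):
#       out = []
#       for token in (t.strip() for t in names_arg.split(',') if t.strip()):
#           if any(ch in token for ch in '*?['):
#               out.extend(n for n in all_names if fnmatch.fnmatch(n, token))
#           else:
#               out.append(token)
#       return sorted(set(out))
#   expand_names('user31_*', ['user31_0', 'user31_1', 'user32_0'])  # -> ['user31_0', 'user31_1']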
@ -1675,6 +1733,19 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups
return return
profiles = manager.list_profiles(args.state, args.proxy) profiles = manager.list_profiles(args.state, args.proxy)
if getattr(args, 'hide_ungrouped', False):
all_grouped_profile_names = set()
for group in profile_groups_config:
for p_name in group.get('profiles_in_group', []):
all_grouped_profile_names.add(p_name)
original_count = len(profiles)
profiles = [p for p in profiles if p.get('name') in all_grouped_profile_names]
filtered_count = original_count - len(profiles)
if filtered_count > 0 and not args.live:
print(f"NOTE: {filtered_count} ungrouped profiles were hidden via --hide-ungrouped.", file=sys.stderr)
if not profiles: if not profiles:
print("No profiles found matching the criteria.", file=file) print("No profiles found matching the criteria.", file=file)
return return
@ -1832,10 +1903,13 @@ def _render_simulation_view(title, manager, args, file=sys.stdout):
profile_groups_config = _build_profile_groups_config(manager, profiles) profile_groups_config = _build_profile_groups_config(manager, profiles)
# The group summary table is only relevant for the Auth simulation, which has
# selection strategies and rotation policies.
is_auth_sim = 'Auth' in title
if is_auth_sim:
profile_selection_strategy = manager.get_config('profile_selection_strategy') profile_selection_strategy = manager.get_config('profile_selection_strategy')
if profile_selection_strategy: if profile_selection_strategy:
print(f"Profile Selection Strategy: {profile_selection_strategy}", file=file) print(f"Profile Selection Strategy: {profile_selection_strategy}", file=file)
_render_profile_group_summary_table(manager, profiles, profile_groups_config, args, file=file) _render_profile_group_summary_table(manager, profiles, profile_groups_config, args, file=file)
failed_lock_attempts = manager.get_failed_lock_attempts() failed_lock_attempts = manager.get_failed_lock_attempts()
@ -1997,7 +2071,6 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout):
_render_activation_history_table(auth_manager, file=file) _render_activation_history_table(auth_manager, file=file)
print(f"\n--- Download Simulation Profile Details ({args.download_env}) ---", file=file) print(f"\n--- Download Simulation Profile Details ({args.download_env}) ---", file=file)
_render_profile_group_summary_table(download_manager, dl_profiles, dl_groups_config, args, file=file)
_render_profile_details_table(download_manager, args, "Download", dl_groups_config, file=file) _render_profile_details_table(download_manager, args, "Download", dl_groups_config, file=file)
if args.show_activation_history: if args.show_activation_history:
_render_activation_history_table(download_manager, file=file) _render_activation_history_table(download_manager, file=file)
@ -2010,6 +2083,8 @@ def _print_profile_list(manager, args, title="Profile Status"):
return _render_simulation_view(title, manager, args, file=sys.stdout) return _render_simulation_view(title, manager, args, file=sys.stdout)
def main_profile_manager(args): def main_profile_manager(args):
"""Main dispatcher for 'profile' command.""" """Main dispatcher for 'profile' command."""
if load_dotenv: if load_dotenv:
@ -2026,6 +2101,13 @@ def main_profile_manager(args):
print(f"ERROR: The specified --env-file was not found: {args.env_file}", file=sys.stderr) print(f"ERROR: The specified --env-file was not found: {args.env_file}", file=sys.stderr)
return 1 return 1
# After loading .env, populate any args that were not provided on the CLI
# This is necessary because argparse `default=os.getenv(...)` runs before `load_dotenv`.
if hasattr(args, 'auth_env') and args.auth_env is None:
args.auth_env = os.getenv('YTOPS_AUTH_ENV')
if hasattr(args, 'download_env') and args.download_env is None:
args.download_env = os.getenv('YTOPS_DOWNLOAD_ENV')
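As a reference point, a minimal `.env` that this fallback would pick up might contain lines like the following; the values are placeholders.

```bash
# Illustrative .env entries read after load_dotenv()
YTOPS_AUTH_ENV=sim_auth
YTOPS_DOWNLOAD_ENV=sim_download
REDIS_HOST=10.0.0.5
```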
if args.redis_host is None: if args.redis_host is None:
args.redis_host = os.getenv('REDIS_HOST', os.getenv('MASTER_HOST_IP', 'localhost')) args.redis_host = os.getenv('REDIS_HOST', os.getenv('MASTER_HOST_IP', 'localhost'))
if args.redis_port is None: if args.redis_port is None:
@ -2190,8 +2272,46 @@ def main_profile_manager(args):
sys.stdout.flush() sys.stdout.flush()
elif args.profile_command == 'set-proxy-state': elif args.profile_command == 'set-proxy-state':
proxy_urls = [p.strip() for p in args.proxy_urls.split(',') if p.strip()]
if not proxy_urls:
print("Error: No proxy URLs provided.", file=sys.stderr)
return 1
envs_to_process = []
if args.auth_env:
envs_to_process.append(args.auth_env)
if args.download_env:
envs_to_process.append(args.download_env)
if envs_to_process:
# If --auth-env or --download-env are used, operate on them.
all_success = True
for env_name in set(envs_to_process):
# When operating on specific envs, derive prefix from env name, ignoring --legacy and --key-prefix.
# This aligns with the behavior of the 'list' command in dual-mode.
key_prefix_for_env = f"{env_name}_profile_mgmt_"
print(f"--- Setting proxy state for environment: {env_name} (prefix: {key_prefix_for_env}) ---", file=sys.stderr)
env_manager = ProfileManager(
redis_host=args.redis_host,
redis_port=args.redis_port,
redis_password=args.redis_password,
key_prefix=key_prefix_for_env,
redis_db=args.redis_db
)
for proxy_url in proxy_urls:
success = env_manager.set_proxy_state(proxy_url, args.state, args.duration_minutes, args.reason)
if not success:
all_success = False
return 0 if all_success else 1
else:
# Fallback to the single manager created with --env, --legacy, or --key-prefix.
# This maintains backward compatibility and handles single-environment cases.
all_success = True
for proxy_url in proxy_urls:
success = manager.set_proxy_state(proxy_url, args.state, args.duration_minutes, args.reason)
if not success:
all_success = False
return 0 if all_success else 1
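A sketch of how this branch might be driven from the CLI; the flag spellings and the state value are inferred from the argparse attribute names and are not confirmed.

```bash
# Hypothetical: rest two proxies for 30 minutes in both simulation environments
python -m ytops_client.cli profile set-proxy-state \
  --auth-env sim_auth --download-env sim_download \
  --proxy-urls "socks5://proxy1:1080,socks5://proxy2:1080" \
  --state RESTING --duration-minutes 30 --reason "manual rotation"
```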
elif args.profile_command == 'get': elif args.profile_command == 'get':
profile = manager.get_profile(args.name) profile = manager.get_profile(args.name)
@ -2226,31 +2346,211 @@ def main_profile_manager(args):
return 0 return 0
elif args.profile_command == 'update-state': elif args.profile_command == 'update-state':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be updated to state '{args.state}':")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
success = manager.update_profile_state(name, args.state, args.reason or '')
if not success:
all_success = False
return 0 if all_success else 1
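The `pause`, `activate`, `ban`, and `unban` branches below reuse the same name/pattern expansion and confirmation prompt, so one illustrative call covers them all. The flag spellings and the state value are assumptions based on the argument names.

```bash
# Hypothetical bulk state change; glob patterns are expanded against the
# profiles currently in Redis, and the matched set is confirmed with y/N
python -m ytops_client.cli profile update-state \
  --names "userA_0,userB_*" --state PAUSED --reason "maintenance window"
```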
elif args.profile_command == 'update-field': elif args.profile_command == 'update-field':
success = manager.update_profile_field(args.name, args.field, args.value) success = manager.update_profile_field(args.name, args.field, args.value)
return 0 if success else 1 return 0 if success else 1
elif args.profile_command == 'pause': elif args.profile_command == 'pause':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be PAUSED:")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
success = manager.update_profile_state(name, ProfileState.PAUSED.value, 'Manual pause')
if not success:
all_success = False
return 0 if all_success else 1
elif args.profile_command == 'activate': elif args.profile_command == 'activate':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be ACTIVATED:")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
success = manager.update_profile_state(name, ProfileState.ACTIVE.value, 'Manual activation')
if not success:
all_success = False
return 0 if all_success else 1
elif args.profile_command == 'ban': elif args.profile_command == 'ban':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be BANNED:")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
success = manager.update_profile_state(name, ProfileState.BANNED.value, args.reason)
if not success:
all_success = False
return 0 if all_success else 1
elif args.profile_command == 'unban': elif args.profile_command == 'unban':
names_or_patterns = [n.strip() for n in args.names.split(',') if n.strip()]
if not names_or_patterns:
print("Error: No profile names or patterns provided.", file=sys.stderr)
return 1
all_profiles = manager.list_profiles()
all_profile_names = {p['name'] for p in all_profiles}
profiles_to_update = set()
for item in names_or_patterns:
if '*' in item or '?' in item:
matched_profiles = {name for name in all_profile_names if fnmatch.fnmatch(name, item)}
if not matched_profiles:
print(f"Warning: Pattern '{item}' did not match any profiles.", file=sys.stderr)
profiles_to_update.update(matched_profiles)
else:
if item in all_profile_names:
profiles_to_update.add(item)
else:
print(f"Warning: Profile '{item}' not found.", file=sys.stderr)
if not profiles_to_update:
print("No matching profiles found to update.", file=sys.stderr)
return 1
print(f"The following {len(profiles_to_update)} profiles will be UNBANNED:")
print(", ".join(sorted(list(profiles_to_update), key=natural_sort_key)))
confirm = input("Are you sure you want to proceed? (y/N): ")
if confirm.lower() != 'y':
print("Aborted.")
return 1
all_success = True
for name in sorted(list(profiles_to_update), key=natural_sort_key):
# First activate, then reset session counters. The ban reason is cleared by update_profile_state. # First activate, then reset session counters. The ban reason is cleared by update_profile_state.
success = manager.update_profile_state(name, ProfileState.ACTIVE.value, 'Manual unban')
if success:
manager.reset_profile_counters(name)
if not success:
all_success = False
return 0 if all_success else 1
elif args.profile_command == 'delete': elif args.profile_command == 'delete':
if not args.confirm: if not args.confirm:
@ -2267,6 +2567,9 @@ def main_profile_manager(args):
print(f"Deleted {deleted_count} key(s) with prefix '{manager.key_prefix}'.") print(f"Deleted {deleted_count} key(s) with prefix '{manager.key_prefix}'.")
return 0 return 0
elif args.profile_command == 'cleanup-ungrouped':
return _main_cleanup_ungrouped(args)
elif args.profile_command == 'reset-global-counters': elif args.profile_command == 'reset-global-counters':
manager.reset_global_counters() manager.reset_global_counters()
return 0 return 0
@ -2310,3 +2613,286 @@ def main_profile_manager(args):
return 0 return 0
return 1 # Should not be reached return 1 # Should not be reached
def _main_cleanup_ungrouped(args):
"""Handler for the 'cleanup-ungrouped' command."""
try:
with open(args.policy_file, 'r', encoding='utf-8') as f:
policy = yaml.safe_load(f)
except (IOError, yaml.YAMLError) as e:
print(f"Error: Could not read or parse policy file '{args.policy_file}': {e}", file=sys.stderr)
return 1
common_pools = policy.get('common_pools', [])
if not common_pools:
print(f"Warning: No 'common_pools' found in '{args.policy_file}'. Nothing to do.", file=sys.stderr)
return 0
desired_profiles = {} # name -> proxy
for pool in common_pools:
prefixes = pool.get('prefixes', [])
count = pool.get('count', 0)
proxy = pool.get('proxy')
if not proxy:
print(f"Error: Pool with prefixes {prefixes} is missing a 'proxy' definition.", file=sys.stderr)
return 1
for prefix in prefixes:
for i in range(count):
name = f"{prefix}_{i}"
desired_profiles[name] = proxy
desired_profile_names = set(desired_profiles.keys())
print(f"Loaded setup policy. Found {len(desired_profile_names)} desired profiles across {len(common_pools)} pools.")
total_deleted = 0
total_skipped = 0
total_created = 0
total_updated = 0
# --- Paired Cleanup Mode ---
if args.auth_env and args.download_env:
print("\n--- Running in Paired Cleanup Mode ---")
if not args.disallow_cleanup_active_downloads:
print("Cleanup of active download profiles is ENABLED (default). Auth profiles will be checked for idleness.")
print("If an auth profile is idle, its corresponding download profile will be removed regardless of its state.")
else:
print("Cleanup of active download profiles is DISABLED. Both auth and download profiles must be idle to be removed.")
if args.dry_run:
print("--- DRY RUN MODE: No changes will be made. ---")
def _create_cleanup_manager(env_name):
key_prefix = f"{env_name}_profile_mgmt_"
if args.legacy: key_prefix = 'profile_mgmt_'
if args.key_prefix: key_prefix = args.key_prefix
return ProfileManager(
redis_host=args.redis_host, redis_port=args.redis_port,
redis_password=args.redis_password, key_prefix=key_prefix,
redis_db=args.redis_db
)
auth_manager = _create_cleanup_manager(args.auth_env)
download_manager = _create_cleanup_manager(args.download_env)
auth_profiles_map = {p['name']: p for p in auth_manager.list_profiles()}
download_profiles_map = {p['name']: p for p in download_manager.list_profiles()}
# --- Create Missing Profiles ---
auth_missing = desired_profile_names - set(auth_profiles_map.keys())
download_missing = desired_profile_names - set(download_profiles_map.keys())
if auth_missing or download_missing:
if args.no_create_missing:
if auth_missing: print(f"Found {len(auth_missing)} missing profiles in '{args.auth_env}', creation disabled via --no-create-missing.")
if download_missing: print(f"Found {len(download_missing)} missing profiles in '{args.download_env}', creation disabled via --no-create-missing.")
else:
if auth_missing:
print(f"Found {len(auth_missing)} missing profiles in '{args.auth_env}'. Creating them...")
for name in sorted(list(auth_missing), key=natural_sort_key):
proxy = desired_profiles.get(name)
print(f" - {'[DRY RUN] Would create' if args.dry_run else 'Creating'} profile '{name}' with proxy '{proxy}' in '{args.auth_env}'")
if not args.dry_run:
if auth_manager.create_profile(name, proxy): total_created += 1
if download_missing:
print(f"Found {len(download_missing)} missing profiles in '{args.download_env}'. Creating them...")
for name in sorted(list(download_missing), key=natural_sort_key):
proxy = desired_profiles.get(name)
print(f" - {'[DRY RUN] Would create' if args.dry_run else 'Creating'} profile '{name}' with proxy '{proxy}' in '{args.download_env}'")
if not args.dry_run:
if download_manager.create_profile(name, proxy): total_created += 1
# Refresh maps after creation
auth_profiles_map = {p['name']: p for p in auth_manager.list_profiles()}
download_profiles_map = {p['name']: p for p in download_manager.list_profiles()}
# --- Update Proxies for Existing Profiles ---
auth_existing_policy_names = set(auth_profiles_map.keys()) & desired_profile_names
if auth_existing_policy_names:
print(f"\nChecking for proxy updates in '{args.auth_env}'...")
for name in sorted(list(auth_existing_policy_names), key=natural_sort_key):
current_proxy = auth_profiles_map[name].get('proxy')
desired_proxy = desired_profiles.get(name)
if current_proxy != desired_proxy and desired_proxy:
print(f" - {'[DRY RUN] Would update' if args.dry_run else 'Updating'} proxy for profile '{name}': from '{current_proxy}' to '{desired_proxy}'")
if not args.dry_run:
if auth_manager.update_profile_field(name, 'proxy', desired_proxy): total_updated += 1
download_existing_policy_names = set(download_profiles_map.keys()) & desired_profile_names
if download_existing_policy_names:
print(f"\nChecking for proxy updates in '{args.download_env}'...")
for name in sorted(list(download_existing_policy_names), key=natural_sort_key):
current_proxy = download_profiles_map[name].get('proxy')
desired_proxy = desired_profiles.get(name)
if current_proxy != desired_proxy and desired_proxy:
print(f" - {'[DRY RUN] Would update' if args.dry_run else 'Updating'} proxy for profile '{name}': from '{current_proxy}' to '{desired_proxy}'")
if not args.dry_run:
if download_manager.update_profile_field(name, 'proxy', desired_proxy): total_updated += 1
# --- Delete Ungrouped Profiles ---
auth_ungrouped = set(auth_profiles_map.keys()) - desired_profile_names
download_ungrouped = set(download_profiles_map.keys()) - desired_profile_names
all_ungrouped = sorted(list(auth_ungrouped | download_ungrouped), key=natural_sort_key)
if not all_ungrouped:
print("\nNo ungrouped profiles found in either environment to clean up.")
else:
print(f"\nFound {len(all_ungrouped)} ungrouped profile(s) across both environments to consider for deletion.")
for name in all_ungrouped:
auth_profile = auth_profiles_map.get(name)
download_profile = download_profiles_map.get(name)
can_delete_auth, auth_skip_reasons = False, []
if auth_profile:
state = auth_profile.get('state', 'UNKNOWN')
pending_dls = auth_manager.get_pending_downloads(name)
rest_reason = auth_profile.get('rest_reason')
is_safe = True
if state in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
is_safe = False; auth_skip_reasons.append(f"auth is '{state}'")
if pending_dls > 0:
is_safe = False; auth_skip_reasons.append(f"auth has {pending_dls} pending DLs")
if rest_reason == 'waiting_downloads':
is_safe = False; auth_skip_reasons.append("auth is 'waiting_downloads'")
if is_safe: can_delete_auth = True
can_delete_download, download_skip_reasons = False, []
if download_profile:
state = download_profile.get('state', 'UNKNOWN')
is_safe = state not in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]
if not is_safe: download_skip_reasons.append(f"download is '{state}'")
if is_safe: can_delete_download = True
if auth_profile and not can_delete_auth:
print(f" - SKIPPING '{name}' because its auth profile is busy ({', '.join(auth_skip_reasons)}).")
total_skipped += 1
continue
was_download_busy = download_profile and not can_delete_download
if auth_profile and can_delete_auth and not args.disallow_cleanup_active_downloads:
can_delete_download = True # The override rule
if (auth_profile and can_delete_auth) or (download_profile and can_delete_download):
msgs = []
if auth_profile and can_delete_auth: msgs.append(f"Auth (State: {auth_profile.get('state', 'N/A')})")
if download_profile and can_delete_download:
dl_msg = f"Download (State: {download_profile.get('state', 'N/A')})"
if was_download_busy: dl_msg += " <-- DELETING ACTIVE"
msgs.append(dl_msg)
print(f" - Deleting '{name}': {'; '.join(msgs)}")
if not args.dry_run:
if auth_profile and can_delete_auth: auth_manager.delete_profile(name)
if download_profile and can_delete_download: download_manager.delete_profile(name)
total_deleted += 1
else:
print(f" - SKIPPING '{name}' because its download profile is busy ({', '.join(download_skip_reasons)}) and no idle auth profile exists to override.")
total_skipped += 1
# --- Single Environment Cleanup Mode ---
else:
if args.dry_run:
print("--- DRY RUN MODE: No changes will be made. ---")
envs_to_clean = []
if args.auth_env: envs_to_clean.append(args.auth_env)
if args.download_env: envs_to_clean.append(args.download_env)
if not envs_to_clean:
if args.env: envs_to_clean.append(args.env)
else:
print("Error: You must specify at least one environment to clean up (e.g., --auth-env sim_auth, --download-env sim_download, or --env dev).", file=sys.stderr)
return 1
for env_name in envs_to_clean:
print(f"\n--- Cleaning environment: {env_name} ---")
key_prefix = f"{env_name}_profile_mgmt_"
if args.legacy: key_prefix = 'profile_mgmt_'
if args.key_prefix: key_prefix = args.key_prefix
manager = ProfileManager(
redis_host=args.redis_host, redis_port=args.redis_port,
redis_password=args.redis_password, key_prefix=key_prefix,
redis_db=args.redis_db
)
current_profiles_list = manager.list_profiles()
current_profiles_map = {p['name']: p for p in current_profiles_list}
current_profile_names = set(current_profiles_map.keys())
ungrouped_names = current_profile_names - desired_profile_names
missing_names = desired_profile_names - current_profile_names
existing_policy_names = current_profile_names & desired_profile_names
# --- Update Proxies for Existing Profiles ---
profiles_updated_in_env = 0
if existing_policy_names:
print("Checking for proxy updates for existing profiles...")
for name in sorted(list(existing_policy_names), key=natural_sort_key):
current_proxy = current_profiles_map[name].get('proxy')
desired_proxy = desired_profiles.get(name)
if current_proxy != desired_proxy and desired_proxy:
print(f" - {'[DRY RUN] Would update' if args.dry_run else 'Updating'} proxy for profile '{name}': from '{current_proxy}' to '{desired_proxy}'")
if not args.dry_run:
if manager.update_profile_field(name, 'proxy', desired_proxy):
profiles_updated_in_env += 1
total_updated += profiles_updated_in_env
profiles_created_in_env = 0
if missing_names:
print(f"Found {len(missing_names)} profile(s) defined in the policy that do not exist in Redis.")
if args.no_create_missing:
print("Creation of missing profiles is disabled via --no-create-missing.")
else:
if not args.dry_run:
print("Creating missing profiles...")
for name in sorted(list(missing_names), key=natural_sort_key):
proxy = desired_profiles.get(name)
print(f" - {'[DRY RUN] Would create' if args.dry_run else 'Creating'} profile '{name}' with proxy '{proxy}'")
if not args.dry_run:
if proxy:
if manager.create_profile(name, proxy):
profiles_created_in_env += 1
else:
# This should not happen due to the check at the start
print(f" - SKIPPING '{name}' because its proxy could not be determined from the policy.", file=sys.stderr)
total_created += profiles_created_in_env
if not ungrouped_names:
print("No ungrouped profiles found to clean up.")
if profiles_created_in_env == 0:
continue
print(f"Found {len(ungrouped_names)} ungrouped profile(s) to consider for deletion.")
profiles_to_check = [p for p in current_profiles_list if p['name'] in ungrouped_names]
for profile in sorted(profiles_to_check, key=lambda p: natural_sort_key(p['name'])):
name = profile['name']
state = profile.get('state', 'UNKNOWN')
pending_dls = manager.get_pending_downloads(name)
is_safe, reasons = True, []
if state in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
is_safe = False; reasons.append(f"is in '{state}' state")
if pending_dls > 0:
is_safe = False; reasons.append(f"has {pending_dls} pending download(s)")
if profile.get('rest_reason') == 'waiting_downloads':
is_safe = False; reasons.append("is in 'waiting_downloads' state")
if is_safe:
print(f" - Deleting '{name}' (State: {state}, Pending DLs: {pending_dls})")
if not args.dry_run: manager.delete_profile(name)
total_deleted += 1
else:
print(f" - SKIPPING '{name}' because it {', '.join(reasons)}.")
total_skipped += 1
print("\n--- Cleanup Summary ---")
print(f"Total profiles created: {total_created}")
print(f"Total profiles updated: {total_updated}")
print(f"Total profiles deleted: {total_deleted}")
print(f"Total profiles skipped (still active or have pending work): {total_skipped}")
if total_skipped > 0:
print("Run the cleanup command again later to remove the skipped profiles once they are idle.")
return 0

View File

@ -4,6 +4,7 @@ Redis Queue Management CLI Tool for yt-ops-client.
""" """
import argparse import argparse
import hashlib
import json import json
import logging import logging
import os import os
@ -23,6 +24,15 @@ except ImportError:
print("'tabulate' library not found. Please install it with: pip install tabulate", file=sys.stderr) print("'tabulate' library not found. Please install it with: pip install tabulate", file=sys.stderr)
tabulate = None tabulate = None
try:
import yaml
except ImportError:
print("PyYAML is not installed. Please install it with: pip install PyYAML", file=sys.stderr)
yaml = None
from pathlib import Path
import fnmatch
# Configure logging # Configure logging
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
@ -31,9 +41,53 @@ logging.basicConfig(
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _find_configured_queues(policies_dir="policies", env=None):
"""Scans YAML files in a directory to find configured queue names."""
if not yaml:
return set()
expected_queues = set()
policies_path = Path(policies_dir)
if not policies_path.is_dir():
logger.debug(f"Policies directory '{policies_dir}' not found, cannot find expected queues.")
return set()
for policy_file in policies_path.glob("*.yaml"):
try:
with open(policy_file, 'r', encoding='utf-8') as f:
policy_data = yaml.safe_load(f)
if not isinstance(policy_data, dict):
continue
queue_policy = policy_data.get('queue_policy')
if not isinstance(queue_policy, dict):
continue
use_prefix = queue_policy.get('use_env_prefix', True)
prefix = ""
if use_prefix and env:
prefix = f"{env}_"
for key, value in queue_policy.items():
if key.endswith('_queue') and isinstance(value, str):
expected_queues.add(f"{prefix}{value}")
except (IOError, yaml.YAMLError) as e:
logger.debug(f"Could not parse policy {policy_file} to find queues: {e}")
continue
return expected_queues
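The effect of this helper on the `list` output can be pictured with a call like the one below; the command path is assumed, and `dev` is a placeholder environment name.

```bash
# Hypothetical: queues referenced by any policies/*.yaml 'queue_policy' block are
# listed alongside what actually exists in Redis, showing size 0 if not yet created
python -m ytops_client.cli queue list --env dev
```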
class QueueManager: class QueueManager:
"""Manages Redis lists (queues).""" """Manages Redis lists (queues)."""
def _push_state_key(self, queue_name: str, file_path: str) -> str:
"""Get Redis key for storing the last pushed index for a given queue and file."""
# Use a hash of the absolute file path to create a consistent, safe key.
abs_path = os.path.abspath(file_path)
path_hash = hashlib.sha256(abs_path.encode()).hexdigest()
return f"ytops_client:queue_push_state:{queue_name}:{path_hash}"
def __init__(self, redis_host='localhost', redis_port=6379, redis_password=None): def __init__(self, redis_host='localhost', redis_port=6379, redis_password=None):
"""Initialize Redis connection.""" """Initialize Redis connection."""
logger.info(f"Attempting to connect to Redis at {redis_host}:{redis_port}...") logger.info(f"Attempting to connect to Redis at {redis_host}:{redis_port}...")
@ -70,10 +124,29 @@ class QueueManager:
"""Returns the number of items in a queue.""" """Returns the number of items in a queue."""
return self.redis.llen(queue_name) return self.redis.llen(queue_name)
def push_from_file(self, queue_name: str, file_path: str, wrap_key: Optional[str] = None, limit: Optional[int] = None, start_index: Optional[int] = None, auto_shift: bool = False) -> int:
"""Populates a queue from a file (text with one item per line, or JSON with an array of items)."""
count = 0
# --- State management for file position ---
state_key = None
current_start_index = 0 # 0-based index
if auto_shift:
state_key = self._push_state_key(queue_name, file_path)
last_index_str = self.redis.get(state_key)
if last_index_str:
current_start_index = int(last_index_str)
logger.info(f"Auto-shift enabled. Resuming from line {current_start_index + 1}.")
elif start_index is not None:
# CLI provides 1-based index, convert to 0-based.
current_start_index = max(0, start_index - 1)
logger.info(f"Starting from line {current_start_index + 1} as requested.")
# ---
items_to_add = []
total_items_in_file = 0
if file_path.lower().endswith('.json'): if file_path.lower().endswith('.json'):
if wrap_key: if wrap_key:
logger.warning("--wrap-file-line-in-json is ignored for JSON files, as they are expected to contain complete items.") logger.warning("--wrap-file-line-in-json is ignored for JSON files, as they are expected to contain complete items.")
@ -85,25 +158,23 @@ class QueueManager:
logger.error("JSON file must contain a list/array.") logger.error("JSON file must contain a list/array.")
return 0 return 0
total_items_in_file = len(data)
if current_start_index >= total_items_in_file:
logger.info(f"Start index ({current_start_index + 1}) is past the end of the file ({total_items_in_file} items). Nothing to push.")
return 0
items_to_process = data[current_start_index:]
if limit is not None and limit >= 0:
items_to_process = items_to_process[:limit]
# Items can be strings or objects. If objects, they should be converted to JSON strings. # Items can be strings or objects. If objects, they should be converted to JSON strings.
items_to_add = [] for item in items_to_process:
for item in data:
if isinstance(item, str): if isinstance(item, str):
items_to_add.append(item.strip()) items_to_add.append(item.strip())
else: else:
items_to_add.append(json.dumps(item)) items_to_add.append(json.dumps(item))
items_to_add = [item for item in items_to_add if item] items_to_add = [item for item in items_to_add if item]
pipe = self.redis.pipeline()
for item in items_to_add:
pipe.rpush(queue_name, item)
count += 1
if count > 0 and count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {count} items...")
pipe.execute()
except (IOError, json.JSONDecodeError) as e: except (IOError, json.JSONDecodeError) as e:
logger.error(f"Failed to read or parse JSON file '{file_path}': {e}") logger.error(f"Failed to read or parse JSON file '{file_path}': {e}")
return 0 return 0
@ -111,24 +182,51 @@ class QueueManager:
logger.info("Reading items from text file (one per line).") logger.info("Reading items from text file (one per line).")
try: try:
with open(file_path, 'r', encoding='utf-8') as f: with open(file_path, 'r', encoding='utf-8') as f:
pipe = self.redis.pipeline() all_lines = f.readlines()
for line in f:
total_items_in_file = len(all_lines)
if current_start_index >= total_items_in_file:
logger.info(f"Start index ({current_start_index + 1}) is past the end of the file ({total_items_in_file} lines). Nothing to push.")
return 0
lines_to_process = all_lines[current_start_index:]
if limit is not None and limit >= 0:
lines_to_process = lines_to_process[:limit]
for line in lines_to_process:
item = line.strip() item = line.strip()
if item: if item:
if wrap_key: if wrap_key:
payload = json.dumps({wrap_key: item}) payload = json.dumps({wrap_key: item})
else: else:
payload = item payload = item
pipe.rpush(queue_name, payload) items_to_add.append(payload)
count += 1
if count > 0 and count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {count} items...")
pipe.execute()
except IOError as e: except IOError as e:
logger.error(f"Failed to read file '{file_path}': {e}") logger.error(f"Failed to read file '{file_path}': {e}")
return 0 return 0
if items_to_add:
pipe = self.redis.pipeline()
for item in items_to_add:
pipe.rpush(queue_name, item)
count += 1
if count > 0 and count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {count} of {len(items_to_add)} items...")
pipe.execute()
if auto_shift and state_key:
new_index = current_start_index + count
# Don't save a new index if we've reached the end of the file.
# This allows re-running the command to start from the beginning again.
if new_index >= total_items_in_file:
self.redis.delete(state_key)
logger.info(f"Auto-shift: Reached end of file. Cleared saved position for '{os.path.basename(file_path)}'. Next run will start from the beginning.")
else:
self.redis.set(state_key, new_index)
logger.info(f"Auto-shift: Saved next start position for '{os.path.basename(file_path)}' as line {new_index + 1}.")
logger.info(f"Finished. Pushed a total of {count} items to '{queue_name}'.") logger.info(f"Finished. Pushed a total of {count} items to '{queue_name}'.")
return count return count
@ -230,7 +328,11 @@ def add_queue_manager_parser(subparsers):
# Push command # Push command
push_parser = subparsers.add_parser('push', help='Push items to a queue from a file, a generator, or a static payload.', parents=[common_parser]) push_parser = subparsers.add_parser('push', help='Push items to a queue from a file, a generator, or a static payload.', parents=[common_parser])
push_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '<env>_stress_inbox'.") push_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '<env>_stress_inbox'.")
push_parser.add_argument('--count', type=int, default=None, help='Number of items to push. For --from-file, limits the number of lines pushed. For other sources, specifies how many items to generate/push (defaults to 1).')
shift_group = push_parser.add_mutually_exclusive_group()
shift_group.add_argument('--start', type=int, help='For --from-file, start pushing from this line number (1-based).')
shift_group.add_argument('--auto-shift', action='store_true', help="For --from-file, automatically resume from where the last push left off. State is stored in Redis.")
source_group = push_parser.add_mutually_exclusive_group(required=True) source_group = push_parser.add_mutually_exclusive_group(required=True)
source_group.add_argument('--from-file', dest='file_path', help='Path to a file containing items to add (one per line, or a JSON array).') source_group.add_argument('--from-file', dest='file_path', help='Path to a file containing items to add (one per line, or a JSON array).')
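The new flags are intended for incremental feeding of a large URL file. A sketch, assuming the tool is invoked as `python -m ytops_client.cli queue push` (the sub-command path is not shown in this hunk):

```bash
# Push 500 lines per run and let --auto-shift remember the next start line in Redis
python -m ytops_client.cli queue push dev_stress_inbox \
  --from-file urls.txt --count 500 --auto-shift

# One-off run starting at a specific line instead (1-based); --start and
# --auto-shift are mutually exclusive
python -m ytops_client.cli queue push dev_stress_inbox \
  --from-file urls.txt --start 1001 --count 500
```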
@ -285,10 +387,26 @@ def main_queue_manager(args):
print(f"INFO: No queue name specified, defaulting to '{default_queue_name}' based on --env='{args.env}'.", file=sys.stderr) print(f"INFO: No queue name specified, defaulting to '{default_queue_name}' based on --env='{args.env}'.", file=sys.stderr)
if args.queue_command == 'list': if args.queue_command == 'list':
queues_from_redis = manager.list_queues(args.pattern)
# Discover queues from policy files
expected_queues_from_policies = _find_configured_queues(env=args.env)
# Merge Redis results with policy-defined queues
all_queues_map = {q['name']: q for q in queues_from_redis}
for q_name in expected_queues_from_policies:
if q_name not in all_queues_map:
# Only add if it matches the pattern filter
if fnmatch.fnmatch(q_name, args.pattern):
all_queues_map[q_name] = {'name': q_name, 'size': 0}
queues = sorted(list(all_queues_map.values()), key=lambda x: x['name'])
if not queues: if not queues:
print(f"No queues found matching pattern '{args.pattern}'.") print(f"No queues found matching pattern '{args.pattern}'.")
return 0 return 0
if tabulate: if tabulate:
print(tabulate(queues, headers='keys', tablefmt='grid')) print(tabulate(queues, headers='keys', tablefmt='grid'))
else: else:
@ -309,16 +427,23 @@ def main_queue_manager(args):
if not os.path.exists(args.file_path): if not os.path.exists(args.file_path):
print(f"Error: File not found at '{args.file_path}'", file=sys.stderr) print(f"Error: File not found at '{args.file_path}'", file=sys.stderr)
return 1 return 1
manager.push_from_file(
args.queue_name,
args.file_path,
args.wrap_file_line_in_json,
limit=args.count,
start_index=args.start,
auto_shift=args.auto_shift
)
elif args.payload_json:
count = args.count if args.count is not None else 1
manager.push_static(args.queue_name, args.payload_json, count)
elif args.generate_payload_prefix:
count = args.count if args.count is not None else 1
if count <= 0:
print("Error: --count must be 1 or greater for --generate-payload-prefix.", file=sys.stderr)
return 1
manager.push_generated(args.queue_name, args.generate_payload_prefix, count)
return 0 return 0
elif args.queue_command == 'clear': elif args.queue_command == 'clear':

View File

@ -167,6 +167,7 @@ Overridable Policy Parameters via --set:
parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.') parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.')
parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.') parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.')
parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)") parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)")
parser.add_argument('--workers', type=int, help='Shortcut to override the total number of workers, capping any discovery logic.')
parser.add_argument('--profile-prefix', '--user-prefix', dest='profile_prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages. Can be a comma-separated list.") parser.add_argument('--profile-prefix', '--user-prefix', dest='profile_prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages. Can be a comma-separated list.")
parser.add_argument('--start-from-url-index', type=int, help='Start processing from this line number (1-based) in the urls_file. Overrides saved state.') parser.add_argument('--start-from-url-index', type=int, help='Start processing from this line number (1-based) in the urls_file. Overrides saved state.')
parser.add_argument('--expire-time-shift-minutes', type=int, help="Consider URLs expiring in N minutes as expired. Overrides policy.") parser.add_argument('--expire-time-shift-minutes', type=int, help="Consider URLs expiring in N minutes as expired. Overrides policy.")
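A hedged sketch of how these shortcuts compose; `<stress-policy-tool>` stands in for the real entry point, which is not shown in this diff.

```bash
# Hypothetical: cap discovery at 4 workers, restrict profile prefixes, resume
# from line 101 of the urls_file, and override one policy value inline
<stress-policy-tool> --workers 4 --profile-prefix "userA,userB" \
  --start-from-url-index 101 --set queue_policy.batch_size=5
```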

View File

@ -952,23 +952,41 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
failure_rate = dummy_settings.get('download_failure_rate', 0.0) failure_rate = dummy_settings.get('download_failure_rate', 0.0)
skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0)
# In dummy mode, prioritize the format from the task file, then from the policy. # In dummy mode, prioritize the format from the task metadata.
format_selection = info_data.get('_ytops_download_format') formats_to_test = None
source_of_format = "task file" source_of_format = "unknown"
if not format_selection:
format_selection = d_policy.get('formats', '') # Prioritize format from task metadata, which supports per-format and per-url tasks.
metadata = info_data.get('_ytops_metadata', {})
formats_requested = metadata.get('formats_requested')
if formats_requested is not None:
formats_to_test = formats_requested
source_of_format = "task file metadata (_ytops_metadata.formats_requested)"
if formats_to_test is None:
# Fallback for older task formats or different workflows
format_selection_str = info_data.get('_ytops_download_format')
if format_selection_str:
source_of_format = "task file (_ytops_download_format)"
formats_to_test = [f.strip() for f in format_selection_str.split(',') if f.strip()]
if formats_to_test is None:
format_selection_str = d_policy.get('formats', '')
if format_selection_str:
source_of_format = "policy (download_policy.formats)" source_of_format = "policy (download_policy.formats)"
formats_to_test = [f.strip() for f in format_selection_str.split(',') if f.strip()]
if not format_selection: if formats_to_test is None:
ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {}) ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {})
format_selection = ytdlp_config_overrides.get('format', '') format_selection_str = ytdlp_config_overrides.get('format', '')
if format_selection_str:
source_of_format = "policy (ytdlp_config_overrides.format)" source_of_format = "policy (ytdlp_config_overrides.format)"
formats_to_test = [f.strip() for f in format_selection_str.split(',') if f.strip()]
if not format_selection: if formats_to_test is None:
logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download.") logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download.")
formats_to_test = ['dummy_format'] formats_to_test = ['dummy_format']
else: else:
formats_to_test = [f.strip() for f in format_selection.split(',') if f.strip()]
logger.info(f"[Worker {worker_id}] DUMMY: Simulating downloads for formats (from {source_of_format}): {', '.join(formats_to_test)}") logger.info(f"[Worker {worker_id}] DUMMY: Simulating downloads for formats (from {source_of_format}): {', '.join(formats_to_test)}")
for format_id in formats_to_test: for format_id in formats_to_test:
@ -1022,14 +1040,47 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========") logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========")
# --- Airflow Directory Logic (Dummy Mode) ---
success = downloads_processed_in_task > 0
if success and d_policy.get('output_to_airflow_ready_dir'):
try: try:
video_id = info_data.get('id')
if not video_id:
logger.error(f"[{profile_name}] DUMMY: Could not find video ID in '{claimed_task_path_host.name}' for moving files.")
else:
# --- Prepare destination directory ---
now = datetime.now()
rounded_minute = (now.minute // 10) * 10
timestamp_str = now.strftime('%Y%m%dT%H') + f"{rounded_minute:02d}"
base_path = d_policy.get('airflow_ready_dir_base_path', 'downloadfiles/videos/ready')
if not os.path.isabs(base_path):
base_path = os.path.join(sp_utils._PROJECT_ROOT, base_path)
final_dir_base = os.path.join(base_path, timestamp_str)
final_dir_path = os.path.join(final_dir_base, video_id)
os.makedirs(final_dir_path, exist_ok=True)
# --- Copy info.json ---
new_info_json_name = f"info_{video_id}.json"
dest_info_json_path = os.path.join(final_dir_path, new_info_json_name)
if not os.path.exists(dest_info_json_path):
shutil.copy(str(claimed_task_path_host), dest_info_json_path)
logger.info(f"[{profile_name}] DUMMY: Copied info.json to {dest_info_json_path}")
except Exception as e:
logger.error(f"[{profile_name}] DUMMY: Failed during post-download processing for Airflow: {e}", exc_info=True)
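For clarity on the resulting layout (values illustrative): a dummy task for video `abc123` processed at 11:07 lands in the 10-minute bucket directory under the default base path.

```bash
# Expected location under the default base path (illustrative id and timestamp)
ls downloadfiles/videos/ready/20260116T1100/abc123/
# info_abc123.json
```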
# In dummy mode, we handle file cleanup and continue to the finally block.
try:
if d_policy.get('remove_source_info_json'):
claimed_task_path_host.unlink()
logger.debug(f"DUMMY MODE: Removed processed task file '{claimed_task_path_host.name}'.")
else:
base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0]
processed_path = Path(f"{base_path_str}.processed") processed_path = Path(f"{base_path_str}.processed")
claimed_task_path_host.rename(processed_path) claimed_task_path_host.rename(processed_path)
logger.debug(f"DUMMY MODE: Renamed processed task file to '{processed_path.name}'.") logger.debug(f"DUMMY MODE: Renamed processed task file to '{processed_path.name}'.")
except (OSError, IndexError) as e: except (OSError, IndexError) as e:
logger.error(f"DUMMY MODE: Failed to clean up processed task file '{claimed_task_path_host}': {e}")
continue # Skip to finally block continue # Skip to finally block
@ -1329,15 +1380,19 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
# 6. Clean up task file # 6. Clean up task file
if not queue_policy: if not queue_policy:
# File-based mode: rename to .processed or remove
try: try:
if success and d_policy.get('remove_source_info_json'):
claimed_task_path_host.unlink()
logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Removed processed task file.")
else:
# The claimed_task_path_host has a .LOCKED suffix, remove it before adding .processed # The claimed_task_path_host has a .LOCKED suffix, remove it before adding .processed
base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0] base_path_str = str(claimed_task_path_host).rsplit('.LOCKED.', 1)[0]
processed_path = Path(f"{base_path_str}.processed") processed_path = Path(f"{base_path_str}.processed")
claimed_task_path_host.rename(processed_path) claimed_task_path_host.rename(processed_path)
logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Renamed processed task file to '{processed_path.name}'.") logger.debug(f"[{sp_utils.get_display_name(claimed_task_path_host)}] Renamed processed task file to '{processed_path.name}'.")
except (OSError, IndexError) as e: except (OSError, IndexError) as e:
logger.error(f"Failed to clean up processed task file '{claimed_task_path_host}': {e}")
elif d_policy.get('rename_source_info_json_on_success'): elif d_policy.get('rename_source_info_json_on_success'):
# Queue-based mode: respect rename policy # Queue-based mode: respect rename policy
source_path_to_rename = task.get('info_json_path') source_path_to_rename = task.get('info_json_path')
@ -1372,11 +1427,25 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
# but the task is being finalized, we must assume all potential downloads for this task # but the task is being finalized, we must assume all potential downloads for this task
# are "processed" to prevent the auth profile from getting stuck. # are "processed" to prevent the auth profile from getting stuck.
if downloads_processed_in_task == 0: if downloads_processed_in_task == 0:
logger.warning(f"[Worker {worker_id}] No downloads were counted for this task. Using task metadata to determine decrement count to avoid stuck profile.")
ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {})
decrement_count = 1 # Default to 1 to be safe
metadata = info_data.get('_ytops_metadata', {})
granularity = metadata.get('download_task_granularity')
formats_requested = metadata.get('formats_requested') # Can be None
if granularity == 'per_format':
# Each task file represents one format group, so decrement by 1.
decrement_count = 1
elif granularity == 'per_url':
# The task file represents all formats for a URL.
decrement_count = len(formats_requested) if formats_requested else 1
else:
# No granularity info, this may be an older task file.
# Assume it's a single download task.
decrement_count = 1
downloads_processed_in_task = decrement_count
logger.warning(f"[Worker {worker_id}] Decrementing by fallback count: {downloads_processed_in_task}") logger.warning(f"[Worker {worker_id}] Decrementing by fallback count: {downloads_processed_in_task}")
if downloads_processed_in_task > 0: if downloads_processed_in_task > 0:

View File

@ -30,192 +30,18 @@ from .queue_provider import RedisQueueProvider
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock): def _process_single_auth_result(success, info_data, stderr, retcode, task, task_id, profile_name, proxy_url, worker_id, policy, state_manager, args, profile_manager_instance, info_json_path=None):
"""Worker function for processing authentication tasks from a queue in batches.""" """
owner_id = f"queue-auth-worker-{worker_id}" Helper to process the result of a single authentication attempt, whether it was
part of a batch or a single run.
"""
settings = policy.get('settings', {}) settings = policy.get('settings', {})
exec_control = policy.get('execution_control', {})
gen_policy = policy.get('info_json_generation_policy', {})
queue_policy = policy.get('queue_policy', {}) queue_policy = policy.get('queue_policy', {})
gen_policy = policy.get('info_json_generation_policy', {})
task_granularity = gen_policy.get('download_task_granularity', 'per_format') task_granularity = gen_policy.get('download_task_granularity', 'per_format')
profile_prefix = gen_policy.get('profile_prefix')
if not profile_prefix:
logger.error(f"[Worker {worker_id}] Queue auth mode requires 'info_json_generation_policy.profile_prefix'. Worker exiting.")
return []
save_dir = settings.get('save_info_json_dir') save_dir = settings.get('save_info_json_dir')
if not save_dir and queue_policy.get('formats_to_download'):
save_dir = os.path.join('run', 'stress_policy', 'info_jsons')
logger.info(f"[Worker {worker_id}] 'formats_to_download' is set and 'save_info_json_dir' is not, defaulting to '{save_dir}'")
if save_dir:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"[Worker {worker_id}] Will save info.json files to '{save_dir}'")
batch_size = queue_policy.get('batch_size', 1)
logger.info(f"[Worker {worker_id}] Auth worker configured to process tasks in batches of {batch_size}.")
task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
tasks = []
try:
# 1. Get a batch of tasks from the queue FIRST
tasks = state_manager.get_auth_tasks_batch(batch_size)
if not tasks:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
base_log_msg = f"[Worker {worker_id}] No tasks available in queue, polling."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stdout, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stdout)
logger.info(base_log_msg)
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval)
continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
# 2. Lock a profile, trying any of the specified prefixes
locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
# Fallback for empty/no prefix, which means lock any available profile
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if not locked_profile:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.warning(f"[Worker {worker_id}] No profiles available for {len(tasks)} task(s). Re-queueing and sleeping for {polling_interval}s.")
state_manager.add_auth_tasks_batch(tasks)
time.sleep(polling_interval)
continue
profile_name = locked_profile['name']
proxy_url = locked_profile['proxy']
logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' to process a batch of {len(tasks)} tasks.")
# 3. Process each task in the batch
for task in tasks:
if state_manager.shutdown_event.is_set():
logger.info(f"[Worker {worker_id}] Shutdown requested, stopping batch processing for profile '{profile_name}'.")
break
temp_task_dir = None
task_id = None
url = None
try:
temp_task_dir = tempfile.mkdtemp(prefix=f"queue-auth-{worker_id}-")
task_id = task.get('id') or task.get('task_id')
if not task_id:
task_id = f"task_{worker_id}_{task_counter}"
task_counter += 1
task['task_id'] = task_id
url = task.get('url') url = task.get('url')
if not url:
logger.error(f"[Worker {worker_id}] Task {task_id} has no URL. Skipping.")
state_manager.report_auth_skipped(task_id, {"error": "No URL in task", "task": task})
continue
logger.info(f"[Worker {worker_id}] [{profile_name}] Processing task {task_id}: {url}")
state_manager.mark_auth_in_progress(task_id, owner_id)
# --- Main processing logic for a single task ---
success, info_data, stderr, retcode = False, None, "", 0
if args.dummy or args.dummy_batch:
logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY MODE: Simulating auth for {url}")
dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {})
min_seconds = dummy_settings.get('auth_min_seconds', 0.1)
max_seconds = dummy_settings.get('auth_max_seconds', 0.5)
failure_rate = args.dummy_auth_failure_rate or dummy_settings.get('auth_failure_rate', 0.0)
skipped_rate = args.dummy_auth_skipped_failure_rate or dummy_settings.get('auth_skipped_failure_rate', 0.0)
time.sleep(random.uniform(min_seconds, max_seconds))
rand_val = random.random()
if rand_val < skipped_rate:
stderr = "Dummy skipped failure"
elif rand_val < (skipped_rate + failure_rate):
stderr = "Dummy fatal failure"
else:
success = True
# In dummy mode, robustly extract ID from URL to avoid 'unknown_video_id'
try:
# First, try the robust library function
video_id = sp_utils.get_video_id(url)
if not video_id and url:
# Fallback parsing for dummy URLs like "...?v=dummy_0099"
parsed_url = urlparse(url)
video_id = parse_qs(parsed_url.query).get('v', [None])[0]
# Additional fallback for URLs like "youtube.com/watch?v=dummy_0028"
if not video_id and 'dummy_' in url:
dummy_match = re.search(r'dummy_\d+', url)
if dummy_match:
video_id = dummy_match.group(0)
except Exception:
video_id = None # Ensure video_id is None on parsing failure
# If all extraction methods fail, use the task_id as a reliable fallback.
if not video_id:
logger.debug(f"Could not extract video ID from URL '{url}', using task ID '{task_id}' for dummy data.")
video_id = task_id
info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]}
else:
client, req_params = state_manager.get_client_for_request(profile_name, gen_policy)
cmd = [
sys.executable, '-m', 'ytops_client.cli', 'get-info',
'--client', client or '', '--profile', profile_name
]
if proxy_url:
cmd.extend(['--proxy', proxy_url])
if req_params:
cmd.extend(['--request-params', json.dumps(req_params)])
extra_args = gen_policy.get('extra_args')
if extra_args:
cmd.extend(shlex.split(extra_args))
# The URL must be the last positional argument
cmd.append(url)
logger.info(f"[Worker {worker_id}] Running command: {' '.join(shlex.quote(s) for s in cmd)}")
retcode, stdout, stderr = run_command(
cmd, running_processes, process_lock, stream_output=args.verbose,
stream_prefix=f"[Worker {worker_id} | get-info] "
)
success = (retcode == 0)
if success:
info_json_path = next((line.strip() for line in stdout.strip().split('\n') if line.endswith('.json') and os.path.exists(line.strip())), None)
if info_json_path:
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
info_data = json.load(f)
except (IOError, json.JSONDecodeError) as e:
logger.error(f"[Worker {worker_id}] Failed to read/parse info.json from get-info: {e}")
success = False
stderr += f"\nFailed to read/parse info.json: {e}"
else:
logger.error(f"[Worker {worker_id}] Command succeeded but no info.json path found in output.")
success = False
stderr += "\nNo info.json path in output"
# --- Result processing for a single task ---
if success and info_data:
try:
auth_env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '')
@ -360,13 +186,14 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
else:
is_bot_error = "Sign in to confirm you're not a bot" in stderr
is_timeout_error = "Read timed out" in stderr
is_proxy_error = "ProxyError" in stderr or "Unable to connect to proxy" in stderr
is_unavailable = "This video is unavailable" in stderr or "Video unavailable" in stderr
is_private = "This video is private" in stderr or "Private video" in stderr
is_deleted = "This video has been removed" in stderr
is_dummy_skipped = "Dummy skipped failure" in stderr
if is_unavailable or is_private or is_deleted or is_dummy_skipped or is_proxy_error:
reason = "Video unavailable" if is_unavailable else "Private video" if is_private else "Video removed" if is_deleted else "Proxy error" if is_proxy_error else "Dummy skipped"
logger.warning(f"[Worker {worker_id}] [{profile_name}] Auth skipped for {url}: {reason}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_skipped(task_id, {"url": url, "reason": reason, "stderr": stderr})
@ -376,14 +203,309 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_failure(task_id, {"url": url, "error_type": error_type, "stderr": stderr, "exit_code": retcode})
def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manager_instance, running_processes, process_lock):
"""Worker function for processing authentication tasks from a queue in batches."""
owner_id = f"queue-auth-worker-{worker_id}"
settings = policy.get('settings', {})
exec_control = policy.get('execution_control', {})
gen_policy = policy.get('info_json_generation_policy', {})
queue_policy = policy.get('queue_policy', {})
docker_policy = policy.get('direct_docker_cli_policy', {})
task_granularity = gen_policy.get('download_task_granularity', 'per_format')
profile_prefix = gen_policy.get('profile_prefix')
if not profile_prefix:
logger.error(f"[Worker {worker_id}] Queue auth mode requires 'info_json_generation_policy.profile_prefix'. Worker exiting.")
return []
save_dir = settings.get('save_info_json_dir')
if not save_dir and queue_policy.get('formats_to_download'):
save_dir = os.path.join('run', 'stress_policy', 'info_jsons')
logger.info(f"[Worker {worker_id}] 'formats_to_download' is set and 'save_info_json_dir' is not, defaulting to '{save_dir}'")
if save_dir:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"[Worker {worker_id}] Will save info.json files to '{save_dir}'")
batch_size = queue_policy.get('batch_size', 1)
logger.info(f"[Worker {worker_id}] Auth worker configured to process tasks in batches of {batch_size}.")
task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
tasks = []
try:
# 1. Get a batch of tasks from the queue FIRST
raw_tasks = state_manager.get_auth_tasks_batch(batch_size)
if not raw_tasks:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
base_log_msg = f"[Worker {worker_id}] No tasks available in queue, polling."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stdout, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stdout)
logger.info(base_log_msg)
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval)
continue
# Normalize tasks to handle both raw URL strings and JSON objects
tasks = []
for raw_task in raw_tasks:
if isinstance(raw_task, str):
tasks.append({'url': raw_task})
elif isinstance(raw_task, dict) and raw_task.get('url'):
tasks.append(raw_task)
else:
logger.warning(f"[Worker {worker_id}] Skipping invalid task of type {type(raw_task)}: {raw_task}")
if not tasks:
logger.warning(f"[Worker {worker_id}] All tasks in batch were invalid. Waiting for next batch.")
time.sleep(1)
continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
# 2. Lock a profile, trying any of the specified prefixes
locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
# Fallback for empty/no prefix, which means lock any available profile
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if not locked_profile:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.warning(f"[Worker {worker_id}] No profiles available for {len(tasks)} task(s). Re-queueing and sleeping for {polling_interval}s.")
state_manager.add_auth_tasks_batch(tasks)
time.sleep(polling_interval)
continue
profile_name = locked_profile['name']
proxy_url = locked_profile['proxy']
logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' to process a batch of {len(tasks)} tasks.")
# 3. Process the entire batch
if args.dummy or args.dummy_batch:
# Dummy mode is not optimized for batching, so we process tasks individually.
logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY MODE: Processing {len(tasks)} tasks individually.")
for task in tasks:
if state_manager.shutdown_event.is_set():
break
task_id = task.get('id') or task.get('task_id') or f"task_{worker_id}_{task_counter}"
url = task.get('url')
# Dummy simulation logic copied from single-task implementation
logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY MODE: Simulating auth for {url}")
dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {})
min_seconds = dummy_settings.get('auth_min_seconds', 0.1)
max_seconds = dummy_settings.get('auth_max_seconds', 0.5)
failure_rate = args.dummy_auth_failure_rate or dummy_settings.get('auth_failure_rate', 0.0)
skipped_rate = args.dummy_auth_skipped_failure_rate or dummy_settings.get('auth_skipped_failure_rate', 0.0)
time.sleep(random.uniform(min_seconds, max_seconds))
success, info_data, stderr, retcode = False, None, "", 0
rand_val = random.random()
if rand_val < skipped_rate:
stderr = "Dummy skipped failure"
elif rand_val < (skipped_rate + failure_rate):
stderr = "Dummy fatal failure"
else:
success = True
video_id = sp_utils.get_video_id(url) or task_id
info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]}
_process_single_auth_result(
success, info_data, stderr, retcode, task, task_id, profile_name, proxy_url,
worker_id, policy, state_manager, args, profile_manager_instance
)
else:
# BATCH MODE: Process all tasks in a single Docker run
batch_temp_dir = None
try:
# Create a temporary directory for this batch's files
host_mount_path = docker_policy.get('docker_host_mount_path')
temp_parent_dir = host_mount_path if host_mount_path and os.path.isdir(host_mount_path) else None
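# Creating the batch dir under the docker host mount (when configured) means the same
# files are visible inside the container under docker_container_mount_path.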
batch_temp_dir = tempfile.mkdtemp(prefix=f"queue-auth-batch-{worker_id}-", dir=temp_parent_dir)
task_dir_name = os.path.basename(batch_temp_dir)
task_dir_container = os.path.join(docker_policy.get('docker_container_mount_path'), task_dir_name)
# Map URLs to their original tasks for post-processing
url_to_task_map = {task.get('url'): task for task in tasks if task.get('url')}
# Create a batch file with all URLs
batch_file_host_path = os.path.join(batch_temp_dir, 'batch_urls.txt')
with open(batch_file_host_path, 'w', encoding='utf-8') as f:
for url in url_to_task_map.keys():
f.write(f"{url}\n")
# Mark all tasks in the batch as in-progress
for task in tasks:
task_id = task.get('id') or task.get('task_id')
if not task_id:
# Generate a temporary, worker-unique ID if none exists.
task_id = f"task_{worker_id}_{int(time.time()*1000)}_{task_counter}"
task['task_id'] = task_id # Store it back for later stages
task_counter += 1
logger.warning(f"[Worker {worker_id}] Task missing 'id' or 'task_id'. Generated a temporary one: {task_id}. Task content: {task}")
state_manager.mark_auth_in_progress(task_id, owner_id)
# Docker batch processing logic
image_name = docker_policy.get('docker_image_name')
if not image_name:
raise ValueError("'direct_docker_cli_policy.docker_image_name' is not defined.")
network_name = docker_policy.get('docker_network_name')
container_mount_path = docker_policy.get('docker_container_mount_path')
host_cache_path = docker_policy.get('docker_host_cache_path')
container_cache_path = docker_policy.get('docker_container_cache_path')
volumes = {}
if host_mount_path and container_mount_path:
volumes[str(Path(host_mount_path).resolve())] = {'bind': container_mount_path, 'mode': 'rw'}
if host_cache_path and container_cache_path:
Path(host_cache_path).mkdir(parents=True, exist_ok=True)
volumes[str(Path(host_cache_path).resolve())] = {'bind': container_cache_path, 'mode': 'rw'}
environment = {}
if host_cache_path and container_cache_path:
environment['XDG_CACHE_HOME'] = container_cache_path
# --- Build config file content ---
base_config_content = ""
base_config_file = docker_policy.get('ytdlp_config_file')
if base_config_file:
config_path_to_read = Path(base_config_file)
if not config_path_to_read.exists():
config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file
if config_path_to_read.exists():
try:
with open(config_path_to_read, 'r', encoding='utf-8') as f:
base_config_content = f.read()
except IOError as e:
logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}")
else:
logger.error(f"[Worker {worker_id}] Could not find ytdlp_config_file: '{base_config_file}'")
config_overrides = docker_policy.get('ytdlp_config_overrides', {}).copy()
config_overrides['proxy'] = proxy_url
config_overrides['batch-file'] = os.path.join(task_dir_container, 'batch_urls.txt')
config_overrides['output'] = os.path.join(task_dir_container, '%(id)s')
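# Note: 'batch-file' and 'output' are container-side paths; actually writing the
# .info.json files is assumed to be enabled by the base config or ytdlp_raw_args.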
overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides)
raw_args_from_policy = docker_policy.get('ytdlp_raw_args', [])
raw_args_content = '\n'.join(raw_args_from_policy)
config_content = f"{base_config_content.strip()}\n\n# --- Overrides from policy ---\n{overrides_content}"
if raw_args_content:
config_content += f"\n\n# --- Raw args from policy ---\n{raw_args_content}"
# Write the combined config to a file in the temp batch dir
ytdlp_config_dir_host = os.path.join(batch_temp_dir, 'yt-dlp')
os.makedirs(ytdlp_config_dir_host, exist_ok=True)
temp_config_file_host = os.path.join(ytdlp_config_dir_host, 'config')
with open(temp_config_file_host, 'w', encoding='utf-8') as f:
f.write(config_content)
# The command tells yt-dlp exactly where to find the config file we created.
cmd = ['yt-dlp', '--config-locations', os.path.join(task_dir_container, 'yt-dlp/config')]
processed_urls = set()
activity_lock = threading.Lock()
def log_parser_callback(line):
# Not thread-safe, but only one thread writes to this set
nonlocal processed_urls
if '[info] Writing video metadata as JSON to:' in line:
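# Illustrative example of a matched line (path is container-side):
#   [info] Writing video metadata as JSON to: /data/queue-auth-batch-0-xxxx/VIDEOID.info.json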
try:
path_match = re.search(r"Writing video metadata as JSON to: '?([^']+)'?$", line)
if not path_match:
path_match = re.search(r"Writing video metadata as JSON to: (.*)$", line)
if path_match:
container_file_path = path_match.group(1).strip()
if container_file_path.startswith(container_mount_path):
relative_path = os.path.relpath(container_file_path, container_mount_path)
host_file_path = os.path.join(host_mount_path, relative_path)
# Wait for file to exist
for _ in range(5):
if os.path.exists(host_file_path): break
time.sleep(0.1)
if os.path.exists(host_file_path):
with open(host_file_path, 'r', encoding='utf-8') as f:
info_data = json.load(f)
original_url = info_data.get('original_url') or info_data.get('webpage_url')
if original_url in url_to_task_map:
with activity_lock:
if original_url not in processed_urls:
processed_urls.add(original_url)
task = url_to_task_map[original_url]
task_id = task.get('id') or task.get('task_id')
_process_single_auth_result(
True, info_data, "", 0, task, task_id, profile_name, proxy_url,
worker_id, policy, state_manager, args, profile_manager_instance,
info_json_path=host_file_path
)
else:
logger.warning(f"[Worker {worker_id}] Could not map info.json '{Path(host_file_path).name}' with URL '{original_url}' to an original task. Skipping.")
except (IOError, json.JSONDecodeError, KeyError) as e:
logger.error(f"[Worker {worker_id}] Error during live post-processing from log line: {e}")
return False # Don't stop the container
logger.info(f"[Worker {worker_id}] [{profile_name}] Running Docker container for batch of {len(tasks)} URLs.")
retcode, stdout, stderr, _ = run_docker_container(
image_name=image_name, command=cmd, volumes=volumes,
environment=environment, network_name=network_name,
stream_prefix=f"[Worker {worker_id} | Docker] ",
log_callback=log_parser_callback
)
# --- Post-processing for failed URLs ---
unprocessed_urls = set(url_to_task_map.keys()) - processed_urls
if unprocessed_urls:
logger.warning(f"[Worker {worker_id}] [{profile_name}] {len(unprocessed_urls)} URLs from the batch did not produce an info.json and will be marked as failed.")
for url in unprocessed_urls:
task = url_to_task_map[url]
task_id = task.get('id') or task.get('task_id')
_process_single_auth_result(
False, None, stderr, retcode, task, task_id, profile_name, proxy_url,
worker_id, policy, state_manager, args, profile_manager_instance
)
except Exception as e:
logger.error(f"[Worker {worker_id}] [{profile_name}] Unexpected error during batch processing: {e}", exc_info=True)
for task in tasks:
task_id = task.get('id') or task.get('task_id')
_process_single_auth_result(
False, None, f"Unexpected batch error: {str(e)}", -1, task, task_id, profile_name, proxy_url,
worker_id, policy, state_manager, args, profile_manager_instance
)
finally:
if batch_temp_dir and os.path.exists(batch_temp_dir):
shutil.rmtree(batch_temp_dir)
for task in tasks:
task_id = task.get('id') or task.get('task_id')
if task_id:
state_manager.remove_auth_in_progress(task_id)
@ -437,24 +559,41 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
settings = policy.get('settings', {})
exec_control = policy.get('execution_control', {})
d_policy = policy.get('download_policy', {})
direct_policy = policy.get('direct_docker_cli_policy', {})
queue_policy = policy.get('queue_policy', {})
profile_prefix = d_policy.get('profile_prefix')
if not profile_prefix:
logger.error(f"[Worker {worker_id}] Queue download mode requires a profile prefix, but none was provided to the worker. Check 'execution_control' in policy. Worker exiting.")
return []
# --- Docker specific config for download worker ---
image_name = direct_policy.get('docker_image_name')
host_mount_path = direct_policy.get('docker_host_mount_path')
container_mount_path = direct_policy.get('docker_container_mount_path')
host_download_path = direct_policy.get('docker_host_download_path')
container_download_path = direct_policy.get('docker_container_download_path')
network_name = direct_policy.get('docker_network_name')
if not all([image_name, host_mount_path, container_mount_path, host_download_path, container_download_path]):
logger.error(f"[Worker {worker_id}] Queue docker download mode requires all docker_* keys in 'direct_docker_cli_policy'. Worker exiting.")
return []
try:
os.makedirs(host_mount_path, exist_ok=True)
os.makedirs(host_download_path, exist_ok=True)
except OSError as e:
logger.error(f"[Worker {worker_id}] Could not create required host directories: {e}. Worker exiting.")
return []
task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
temp_task_dir = None # Used by dummy mode
temp_config_dir_host = None # Used by docker mode
was_banned_by_parser = False
task = None
auth_profile_name, auth_env = None, None
@ -594,105 +733,132 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
# race conditions between workers processing different formats for the
# same video. The files can be cleaned up manually between runs.
else:
retcode = -1; stdout = ""; stderr = ""; stop_reason = None
live_success_count, live_failure_count, live_tolerated_count = 0, 0, 0
'--load-info-json', info_json_path,
'-f', format_id
]
if proxy_url:
cmd.extend(['--proxy', proxy_url])
if output_dir:
cmd.extend(['--output-dir', output_dir])
extra_args = d_policy.get('extra_args')
if extra_args:
cmd.extend(shlex.split(extra_args))
logger.info(f"[Worker {worker_id}] Running command: {' '.join(shlex.quote(s) for s in cmd)}") # --- Pre-flight checks ---
retcode, stdout, stderr = run_command( if d_policy.get('check_url_expiration', True):
cmd, running_processes, process_lock, stream_output=args.verbose, try:
stream_prefix=f"[Worker {worker_id} | download] " with open(info_json_path, 'r', encoding='utf-8') as f: info_data = json.load(f)
first_format = next((f for f in info_data.get('formats', []) if 'url' in f), None)
if first_format:
status, _ = sp_utils.check_url_expiry(first_format['url'], d_policy.get('expire_time_shift_minutes', 0))
if status == 'expired': stderr = "Download URL is expired"
except (IOError, json.JSONDecodeError) as e: stderr = f"Could not read info.json: {e}"
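# If stderr was set here (expired URL or unreadable info.json), the docker run below is
# skipped and the task is later reported as skipped/failed based on the stderr text.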
if not stderr:
relative_task_path = os.path.relpath(os.path.abspath(info_json_path), host_mount_path)
task_path_container = os.path.join(container_mount_path, relative_task_path)
temp_config_dir_host = tempfile.mkdtemp(prefix=f"docker-dl-config-{worker_id}-", dir=host_mount_path)
config_dir_container = os.path.join(container_mount_path, os.path.basename(temp_config_dir_host))
base_config_content = ""
base_config_file = direct_policy.get('ytdlp_config_file')
if base_config_file:
config_path_to_read = Path(base_config_file)
if not config_path_to_read.exists(): config_path_to_read = Path(sp_utils._PROJECT_ROOT) / base_config_file
if config_path_to_read.exists():
try:
with open(config_path_to_read, 'r', encoding='utf-8') as f: base_config_content = f.read()
except IOError as e: logger.error(f"[Worker {worker_id}] Could not read ytdlp_config_file '{config_path_to_read}': {e}")
config_overrides = direct_policy.get('ytdlp_config_overrides', {}).copy()
config_overrides.update({
'proxy': proxy_url, 'load-info-json': task_path_container,
'output': os.path.join(container_download_path, '%(id)s.f%(format_id)s.%(ext)s'),
'format': format_id, 'no-cache-dir': True
})
overrides_content = sp_utils._config_dict_to_flags_file_content(config_overrides)
raw_args_content = '\n'.join(direct_policy.get('ytdlp_raw_args', []))
config_content = f"{base_config_content.strip()}\n\n# --- Overrides ---\n{overrides_content}\n{raw_args_content}"
ytdlp_config_dir_host = os.path.join(temp_config_dir_host, 'yt-dlp')
os.makedirs(ytdlp_config_dir_host, exist_ok=True)
with open(os.path.join(ytdlp_config_dir_host, 'config'), 'w', encoding='utf-8') as f: f.write(config_content)
volumes = {
os.path.abspath(host_mount_path): {'bind': container_mount_path, 'mode': 'ro'},
os.path.abspath(host_download_path): {'bind': container_download_path, 'mode': 'rw'}
}
command = ['yt-dlp', '--config-locations', os.path.join(config_dir_container, 'yt-dlp/config')]
activity_lock = threading.Lock()
tolerated_error_patterns = direct_policy.get('tolerated_error_patterns', [])
fatal_error_patterns = direct_policy.get('fatal_error_patterns', [])
def log_parser_callback(line):
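# Callback contract as used by run_docker_container here: returning True stops the
# container early (e.g. after banning the profile on a fatal error), False keeps it running.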
nonlocal live_success_count, live_failure_count, live_tolerated_count, was_banned_by_parser
if '[download] 100% of' in line or 'has already been downloaded' in line:
with activity_lock: live_success_count += 1; profile_manager_instance.record_activity(profile_name, 'download', is_dummy=False)
return False
for pattern in fatal_error_patterns:
if re.search(pattern, line, re.IGNORECASE):
with activity_lock:
live_failure_count += 1; profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=False)
if direct_policy.get('ban_on_fatal_error_in_batch'):
profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during download')
was_banned_by_parser = True; return True
return False
if 'ERROR:' not in line: return False
for pattern in tolerated_error_patterns:
if re.search(pattern, line, re.IGNORECASE):
with activity_lock: live_tolerated_count += 1; profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=False)
return False
with activity_lock: live_failure_count += 1; profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=False)
return False
retcode, stdout, stderr, stop_reason = run_docker_container(
image_name=image_name, command=command, volumes=volumes, stream_prefix=f"[Worker {worker_id} | docker-ytdlp] ",
network_name=network_name, log_callback=log_parser_callback, profile_manager=profile_manager_instance,
profile_name=profile_name, environment={}
) )
success = live_success_count > 0 or (retcode == 0 and not stop_reason and live_failure_count == 0)
if success:
downloaded_filepath = None
if d_policy.get('output_to_airflow_ready_dir'):
base_path = d_policy.get('airflow_ready_dir_base_path')
if not base_path:
logger.error(f"[Worker {worker_id}] 'output_to_airflow_ready_dir' is true but 'airflow_ready_dir_base_path' is not set.")
else:
try:
ts = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S_%f')
final_dir = os.path.join(base_path, ts + '_' + (video_id or 'unknown_video'))
os.makedirs(final_dir, exist_ok=True)
if info_json_path and os.path.exists(info_json_path):
shutil.copy(info_json_path, os.path.join(final_dir, os.path.basename(info_json_path)))
downloaded_files = [f for f in os.listdir(host_download_path) if not f.endswith(('.part', '.ytdl'))]
if not downloaded_files: logger.warning(f"No media files found in '{host_download_path}' for video '{video_id}'.")
for filename in downloaded_files:
final_media_path = os.path.join(final_dir, filename)
shutil.move(os.path.join(host_download_path, filename), final_media_path)
if downloaded_filepath is None: downloaded_filepath = final_media_path
except Exception as e: logger.error(f"Failed to move files to Airflow-ready dir: {e}", exc_info=True)
state_manager.report_download_success(task_id, {
"video_id": video_id, "url": url, "format_id": format_id, "profile_name": profile_name, "proxy_url": proxy_url,
"downloaded_filepath": downloaded_filepath, "info_json_path": info_json_path
})
logger.info(f"[Worker {worker_id}] Download successful: {video_id or url} format {format_id}")
if d_policy.get('rename_source_info_json_on_success'):
source_path = task.get('info_json_path')
if source_path and os.path.exists(source_path):
try: shutil.move(source_path, source_path + ".processed")
except Exception as e: logger.warning(f"Could not rename source info.json '{source_path}': {e}")
else:
is_unavailable = "unavailable" in stderr.lower() or "expired" in stderr.lower()
is_format_error = "requested format not available" in stderr
if is_unavailable or is_format_error:
reason = "Video/URL unavailable or expired" if is_unavailable else "Format not available"
state_manager.report_download_skipped(task_id, {"video_id": video_id, "url": url, "format_id": format_id, "reason": reason, "stderr": stderr})
else:
state_manager.report_download_failure(task_id, {
"video_id": video_id, "url": url, "format_id": format_id, "error_type": f"Exit code {retcode}",
"stderr": stderr, "exit_code": retcode, "original_task": task
})
except Exception as e:
@ -722,6 +888,9 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in task metadata. Pending downloads counter will not be decremented.") logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in task metadata. Pending downloads counter will not be decremented.")
if locked_profile: if locked_profile:
if was_banned_by_parser:
logger.info(f"[Worker {worker_id}] Profile '{locked_profile['name']}' was banned by log parser. Skipping unlock.")
else:
cooldown = None cooldown = None
cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds')
if cooldown_config: if cooldown_config:
@ -732,7 +901,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
elif isinstance(val, int):
cooldown = val
except (json.JSONDecodeError, TypeError):
if isinstance(cooldown_config, str) and cooldown_config.isdigit():
cooldown = int(cooldown_config)
if cooldown:
@ -747,6 +916,9 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
if temp_task_dir and os.path.exists(temp_task_dir):
shutil.rmtree(temp_task_dir)
if temp_config_dir_host and os.path.exists(temp_config_dir_host):
shutil.rmtree(temp_config_dir_host)
if task and task_id:
state_manager.remove_download_in_progress(task_id)

View File

@ -130,6 +130,47 @@ process_lock = threading.Lock()
logger = logging.getLogger('stress_policy_tool')
def _discover_worker_pools(discovery_config, manager_for_discovery):
"""
Discovers worker pools by scanning profile prefixes in Redis.
Returns a list of worker pool configurations or None on error.
"""
discovery_pattern = discovery_config.get('profile_prefix_pattern')
workers_per_group = discovery_config.get('workers_per_profile_group', 1)
if not discovery_pattern:
logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.")
return None
logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
try:
all_profiles = manager_for_discovery.list_profiles()
found_prefixes = set()
for profile in all_profiles:
profile_name = profile['name']
if fnmatch.fnmatch(profile_name, discovery_pattern):
# Assuming standard name format like 'user31_001', extract 'user31'
prefix = profile_name.rsplit('_', 1)[0]
found_prefixes.add(prefix)
if not found_prefixes:
logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
return []
else:
worker_pools = []
for prefix in sorted(list(found_prefixes)):
worker_pools.append({
'profile_prefix': prefix,
'workers': workers_per_group
})
logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
logger.info("Note: Profile group discovery runs once at startup. A restart is required to detect new profile groups.")
return worker_pools
except Exception as e:
logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
return None
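# Illustrative policy snippet consumed by _discover_worker_pools (keys are the ones read
# above; the pattern value and nesting are an example only):
#
#   execution_control:
#     worker_pool_discovery:
#       profile_prefix_pattern: "user*_*"
#       workers_per_profile_group: 2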
def main_stress_policy(args):
"""Main logic for the 'stress-policy' command."""
if args.list_policies:
@ -362,10 +403,21 @@ def main_stress_policy(args):
logger.info(f"\nSignal {signum} received, shutting down gracefully...") logger.info(f"\nSignal {signum} received, shutting down gracefully...")
shutdown_event.set() shutdown_event.set()
# Propagate signal to child processes to allow them to shut down gracefully.
with process_lock:
if running_processes:
logger.info(f"Propagating signal to {len(running_processes)} running subprocess(es)...")
for p in running_processes:
try:
# Send the same signal to the entire process group.
os.killpg(os.getpgid(p.pid), signum)
except (ProcessLookupError, PermissionError):
pass # Process already finished or we lack permissions
# Save state immediately to prevent loss on interrupt.
logger.info("Attempting to save state before shutdown...")
state_manager.close()
logger.info("Shutdown requested. Signalling in-progress tasks to terminate gracefully. No new tasks will be started. Press Ctrl+C again to force exit.")
else:
logger.info("Second signal received, forcing exit.")
# On second signal, forcefully terminate subprocesses.
@ -579,6 +631,7 @@ def main_stress_policy(args):
logger.info(f"Starting/resuming from URL index {start_index + 1}.") logger.info(f"Starting/resuming from URL index {start_index + 1}.")
# The worker's get_next_url_batch will respect this starting index. # The worker's get_next_url_batch will respect this starting index.
logger.info(f"Task source file: {os.path.abspath(urls_file)}")
sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list) sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list)
if args.dry_run: return 0 if args.dry_run: return 0
@ -586,13 +639,8 @@ def main_stress_policy(args):
worker_pools = exec_control.get('worker_pools', [])
discovery_config = exec_control.get('worker_pool_discovery')
if discovery_config and not worker_pools:
logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
direct_policy = policy.get('direct_batch_cli_policy', {})
use_env = direct_policy.get('use_profile_env', 'auth')
manager_for_discovery = profile_managers.get(use_env)
@ -601,35 +649,10 @@ def main_stress_policy(args):
logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
return 1 return 1
if not discovery_pattern: discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") if discovered_pools is None:
return 1 return 1 # An error occurred during discovery
worker_pools = discovered_pools
logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
try:
all_profiles = manager_for_discovery.list_profiles()
found_prefixes = set()
for profile in all_profiles:
profile_name = profile['name']
if fnmatch.fnmatch(profile_name, discovery_pattern):
# Assuming standard name format like 'user31_001', extract 'user31'
prefix = profile_name.rsplit('_', 1)[0]
found_prefixes.add(prefix)
if not found_prefixes:
logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
worker_pools = []
else:
worker_pools = []
for prefix in sorted(list(found_prefixes)):
worker_pools.append({
'profile_prefix': prefix,
'workers': workers_per_group
})
logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
except Exception as e:
logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
return 1
if not worker_pools and exec_control.get('workers'):
# Fallback for legacy 'workers: N' config
@ -641,24 +664,60 @@ def main_stress_policy(args):
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.")
cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()}
original_pool_count = len(worker_pools)
filtered_pools = []
for pool in worker_pools:
pool_prefixes_str = pool.get('profile_prefix', '')
if not pool_prefixes_str:
continue
pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()}
if pool_prefixes.intersection(cli_prefixes):
filtered_pools.append(pool)
if len(filtered_pools) < original_pool_count:
logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.")
worker_pools = filtered_pools
if not worker_pools:
logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.")
worker_specs = []
if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1:
logger.info("Single worker mode: aggregating all discovered profile groups for the worker.")
all_prefixes = []
for pool in worker_pools:
prefix_str = pool.get('profile_prefix', '')
if prefix_str:
all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip())
final_prefix_str = ','.join(sorted(list(set(all_prefixes))))
logger.info(f"Single worker will manage profile groups: {final_prefix_str}")
worker_policy = deepcopy(policy)
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = final_prefix_str
worker_specs.append({
'func': run_direct_batch_worker,
'args': (0, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
})
else:
worker_id_counter = 0
for pool in worker_pools:
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
if not prefix_str:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
for i in range(pool_workers):
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
worker_policy = deepcopy(policy)
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_direct_batch_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
@ -742,6 +801,7 @@ def main_stress_policy(args):
if start_index > 0:
logger.info(f"Starting/resuming from URL index {start_index + 1}.")
logger.info(f"Task source file: {os.path.abspath(urls_file)}")
sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list)
if args.dry_run: return 0
@ -749,13 +809,8 @@ def main_stress_policy(args):
worker_pools = exec_control.get('worker_pools', [])
discovery_config = exec_control.get('worker_pool_discovery')
if discovery_config and not worker_pools:
logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
direct_policy = policy.get('direct_docker_cli_policy', {})
use_env = direct_policy.get('use_profile_env', 'auth' if mode == 'fetch_only' else 'download')
manager_for_discovery = profile_managers.get(use_env)
@ -764,35 +819,10 @@ def main_stress_policy(args):
logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
return 1 return 1
if not discovery_pattern: discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") if discovered_pools is None:
return 1 return 1 # An error occurred
worker_pools = discovered_pools
logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
try:
all_profiles = manager_for_discovery.list_profiles()
found_prefixes = set()
for profile in all_profiles:
profile_name = profile['name']
if fnmatch.fnmatch(profile_name, discovery_pattern):
# Assuming standard name format like 'user31_001', extract 'user31'
prefix = profile_name.rsplit('_', 1)[0]
found_prefixes.add(prefix)
if not found_prefixes:
logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
worker_pools = []
else:
worker_pools = []
for prefix in sorted(list(found_prefixes)):
worker_pools.append({
'profile_prefix': prefix,
'workers': workers_per_group
})
logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
except Exception as e:
logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
return 1
if not worker_pools and exec_control.get('workers'):
# Fallback for legacy 'workers: N' config
@ -804,22 +834,59 @@ def main_stress_policy(args):
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.")
cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()}
original_pool_count = len(worker_pools)
filtered_pools = []
for pool in worker_pools:
pool_prefixes_str = pool.get('profile_prefix', '')
if not pool_prefixes_str:
continue
pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()}
if pool_prefixes.intersection(cli_prefixes):
filtered_pools.append(pool)
if len(filtered_pools) < original_pool_count:
logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.")
worker_pools = filtered_pools
if not worker_pools:
logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.")
worker_specs = []
if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1:
logger.info("Single worker mode: aggregating all discovered profile groups for the worker.")
all_prefixes = []
for pool in worker_pools:
prefix_str = pool.get('profile_prefix', '')
if prefix_str:
all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip())
final_prefix_str = ','.join(sorted(list(set(all_prefixes))))
logger.info(f"Single worker will manage profile groups: {final_prefix_str}")
worker_policy = deepcopy(policy)
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = final_prefix_str
worker_specs.append({
'func': run_direct_docker_worker,
'args': (0, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
})
else:
worker_id_counter = 0
for pool in worker_pools:
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
if not prefix_str:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
@ -847,6 +914,7 @@ def main_stress_policy(args):
logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}") logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}")
return 1 return 1
logger.info(f"Task source directory: {os.path.abspath(info_json_dir)}")
sp_utils.display_effective_policy(policy, policy_name, args, sources=[]) sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0 if args.dry_run: return 0
@ -854,13 +922,8 @@ def main_stress_policy(args):
worker_pools = exec_control.get('worker_pools', [])
discovery_config = exec_control.get('worker_pool_discovery')
if discovery_config and not worker_pools:
logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
direct_policy = policy.get('direct_docker_cli_policy', {})
use_env = direct_policy.get('use_profile_env', 'auth' if mode == 'fetch_only' else 'download')
manager_for_discovery = profile_managers.get(use_env)
@ -869,35 +932,10 @@ def main_stress_policy(args):
logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.") logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
return 1 return 1
if not discovery_pattern: discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.") if discovered_pools is None:
return 1 return 1 # An error occurred
worker_pools = discovered_pools
logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
try:
all_profiles = manager_for_discovery.list_profiles()
found_prefixes = set()
for profile in all_profiles:
profile_name = profile['name']
if fnmatch.fnmatch(profile_name, discovery_pattern):
# Assuming standard name format like 'user31_001', extract 'user31'
prefix = profile_name.rsplit('_', 1)[0]
found_prefixes.add(prefix)
if not found_prefixes:
logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
worker_pools = []
else:
worker_pools = []
for prefix in sorted(list(found_prefixes)):
worker_pools.append({
'profile_prefix': prefix,
'workers': workers_per_group
})
logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
except Exception as e:
logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
return 1
if not worker_pools and exec_control.get('workers'):
# Fallback for legacy 'workers: N' config
@ -909,22 +947,59 @@ def main_stress_policy(args):
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.")
cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()}
original_pool_count = len(worker_pools)
filtered_pools = []
for pool in worker_pools:
pool_prefixes_str = pool.get('profile_prefix', '')
if not pool_prefixes_str:
continue
pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()}
if pool_prefixes.intersection(cli_prefixes):
filtered_pools.append(pool)
if len(filtered_pools) < original_pool_count:
logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.")
worker_pools = filtered_pools
if not worker_pools:
logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.")
worker_specs = []
if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1:
logger.info("Single worker mode: aggregating all discovered profile groups for the worker.")
all_prefixes = []
for pool in worker_pools:
prefix_str = pool.get('profile_prefix', '')
if prefix_str:
all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip())
final_prefix_str = ','.join(sorted(list(set(all_prefixes))))
logger.info(f"Single worker will manage profile groups: {final_prefix_str}")
worker_policy = deepcopy(policy)
worker_policy.setdefault('download_policy', {})['profile_prefix'] = final_prefix_str
worker_specs.append({
'func': run_direct_docker_download_worker,
'args': (0, worker_policy, state_manager, args, profile_manager_instance, running_processes, process_lock)
})
else:
worker_id_counter = 0 worker_id_counter = 0
for pool in worker_pools: for pool in worker_pools:
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
pool_workers = pool.get('workers', 1) pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '') prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] if not prefix_str:
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue continue
# Each worker in the pool gets the full list of prefixes from the pool configuration. # Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers): for i in range(pool_workers):
if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
break
worker_policy = deepcopy(policy) worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes. # The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
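Note the semantic change to `--profile-prefix`: it no longer overwrites every pool's prefix; it now filters out pools whose prefix set does not intersect the CLI value, and a matching pool keeps its full prefix list. A self-contained illustration of the intersection check, using made-up pool data:

```python
pools = [
    {'profile_prefix': 'user31,user32', 'workers': 2},  # invented example pools
    {'profile_prefix': 'user40', 'workers': 1},
]
cli_prefixes = {'user32'}  # e.g. --profile-prefix user32

# A pool survives if any of its prefixes matches a CLI prefix.
filtered = [
    p for p in pools
    if {s.strip() for s in p['profile_prefix'].split(',') if s.strip()} & cli_prefixes
]
print(filtered)  # -> [{'profile_prefix': 'user31,user32', 'workers': 2}]
```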
@@ -968,6 +1043,7 @@ def main_stress_policy(args):
             logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}")
             return 1
+    logger.info(f"Task source directory: {os.path.abspath(info_json_dir)}")
     sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
     if args.dry_run: return 0
@@ -975,48 +1051,18 @@ def main_stress_policy(args):
     worker_pools = exec_control.get('worker_pools', [])
     discovery_config = exec_control.get('worker_pool_discovery')
-    if discovery_config:
-        if worker_pools:
-            logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.")
-        discovery_pattern = discovery_config.get('profile_prefix_pattern')
-        workers_per_group = discovery_config.get('workers_per_profile_group', 1)
+    if discovery_config and not worker_pools:
+        logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
         manager_for_discovery = profile_managers.get('download')
         if not manager_for_discovery:
             logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
             return 1
-        if not discovery_pattern:
-            logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.")
-            return 1
-        logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
-        try:
-            all_profiles = manager_for_discovery.list_profiles()
-            found_prefixes = set()
-            for profile in all_profiles:
-                profile_name = profile['name']
-                if fnmatch.fnmatch(profile_name, discovery_pattern):
-                    # Assuming standard name format like 'user31_001', extract 'user31'
-                    prefix = profile_name.rsplit('_', 1)[0]
-                    found_prefixes.add(prefix)
-            if not found_prefixes:
-                logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
-                worker_pools = []
-            else:
-                worker_pools = []
-                for prefix in sorted(list(found_prefixes)):
-                    worker_pools.append({
-                        'profile_prefix': prefix,
-                        'workers': workers_per_group
-                    })
-                logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
-        except Exception as e:
-            logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
-            return 1
+        discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
+        if discovered_pools is None:
+            return 1  # An error occurred
+        worker_pools = discovered_pools
     if not worker_pools and exec_control.get('workers'):
         # Fallback for legacy 'workers: N' config
@@ -1028,22 +1074,59 @@ def main_stress_policy(args):
             return 1
     if args.profile_prefix:
-        logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
+        logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set. Filtering worker pools to match this prefix.")
+        cli_prefixes = {p.strip() for p in args.profile_prefix.split(',') if p.strip()}
+        original_pool_count = len(worker_pools)
+        filtered_pools = []
         for pool in worker_pools:
-            pool['profile_prefix'] = args.profile_prefix
+            pool_prefixes_str = pool.get('profile_prefix', '')
+            if not pool_prefixes_str:
+                continue
+            pool_prefixes = {p.strip() for p in pool_prefixes_str.split(',') if p.strip()}
+            if pool_prefixes.intersection(cli_prefixes):
+                filtered_pools.append(pool)
+        if len(filtered_pools) < original_pool_count:
+            logger.info(f"Filtered {original_pool_count - len(filtered_pools)} pool(s) out based on --profile-prefix.")
+        worker_pools = filtered_pools
+        if not worker_pools:
+            logger.warning(f"After filtering with --profile-prefix '{args.profile_prefix}', no worker pools remain.")
     worker_specs = []
+    if hasattr(args, 'workers') and args.workers == 1 and len(worker_pools) > 1:
+        logger.info("Single worker mode: aggregating all discovered profile groups for the worker.")
+        all_prefixes = []
+        for pool in worker_pools:
+            prefix_str = pool.get('profile_prefix', '')
+            if prefix_str:
+                all_prefixes.extend(p.strip() for p in prefix_str.split(',') if p.strip())
+        final_prefix_str = ','.join(sorted(list(set(all_prefixes))))
+        logger.info(f"Single worker will manage profile groups: {final_prefix_str}")
+        worker_policy = deepcopy(policy)
+        worker_policy.setdefault('download_policy', {})['profile_prefix'] = final_prefix_str
+        worker_specs.append({
+            'func': run_direct_download_worker,
+            'args': (0, worker_policy, state_manager, args, download_manager, running_processes, process_lock)
+        })
+    else:
         worker_id_counter = 0
         for pool in worker_pools:
+            if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+                break
             pool_workers = pool.get('workers', 1)
             prefix_str = pool.get('profile_prefix', '')
-            prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
-            if not prefixes:
+            if not prefix_str:
                 logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
                 continue
             # Each worker in the pool gets the full list of prefixes from the pool configuration.
             for i in range(pool_workers):
+                if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+                    break
                 worker_policy = deepcopy(policy)
                 # The worker functions will now handle a comma-separated list of prefixes.
                 worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
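The single-worker branch above merges every pool's prefixes into one comma-separated string for the lone worker. A small illustration of the aggregation (profile group names are invented):

```python
pools = [{'profile_prefix': 'user31'}, {'profile_prefix': 'user32,user33'}]

all_prefixes = []
for pool in pools:
    s = pool.get('profile_prefix', '')
    if s:
        all_prefixes.extend(p.strip() for p in s.split(',') if p.strip())

# Deduplicate and sort so the worker gets a stable, canonical prefix list.
final_prefix_str = ','.join(sorted(set(all_prefixes)))
print(final_prefix_str)  # -> 'user31,user32,user33'
```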
@@ -1129,48 +1212,18 @@ def main_stress_policy(args):
     worker_pools = exec_control.get('worker_pools', [])
     discovery_config = exec_control.get('worker_pool_discovery')
-    if discovery_config:
-        if worker_pools:
-            logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.")
-        discovery_pattern = discovery_config.get('profile_prefix_pattern')
-        workers_per_group = discovery_config.get('workers_per_profile_group', 1)
+    if discovery_config and not worker_pools:
+        logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
         manager_for_discovery = profile_managers.get('auth')
         if not manager_for_discovery:
             logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
             return 1
-        if not discovery_pattern:
-            logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.")
-            return 1
-        logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
-        try:
-            all_profiles = manager_for_discovery.list_profiles()
-            found_prefixes = set()
-            for profile in all_profiles:
-                profile_name = profile['name']
-                if fnmatch.fnmatch(profile_name, discovery_pattern):
-                    # Assuming standard name format like 'user31_001', extract 'user31'
-                    prefix = profile_name.rsplit('_', 1)[0]
-                    found_prefixes.add(prefix)
-            if not found_prefixes:
-                logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
-                worker_pools = []
-            else:
-                worker_pools = []
-                for prefix in sorted(list(found_prefixes)):
-                    worker_pools.append({
-                        'profile_prefix': prefix,
-                        'workers': workers_per_group
-                    })
-                logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
-        except Exception as e:
-            logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
-            return 1
+        discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
+        if discovered_pools is None:
+            return 1  # An error occurred
+        worker_pools = discovered_pools
     if not worker_pools and exec_control.get('workers'):
         # Fallback for legacy 'workers: N' config
@@ -1189,6 +1242,8 @@ def main_stress_policy(args):
     worker_specs = []
     worker_id_counter = 0
     for pool in worker_pools:
+        if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+            break
         pool_workers = pool.get('workers', 1)
         prefix_str = pool.get('profile_prefix', '')
         prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
@@ -1198,6 +1253,8 @@ def main_stress_policy(args):
         # Each worker in the pool gets the full list of prefixes from the pool configuration.
         for i in range(pool_workers):
+            if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+                break
             worker_policy = deepcopy(policy)
             # The worker functions will now handle a comma-separated list of prefixes.
             worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
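The repeated `args.workers` guard caps the total number of worker specs both at pool boundaries and before each worker inside a pool. A toy model of the capping logic (not the real spec-building code) shows the effect: three pools of two workers with a cap of four yield specs from only the first two pools:

```python
def build_specs(pools, max_workers=None):
    """Toy model of the capping behaviour: stop once max_workers specs exist."""
    specs = []
    for pool in pools:
        if max_workers is not None and len(specs) >= max_workers:
            break
        for _ in range(pool.get('workers', 1)):
            if max_workers is not None and len(specs) >= max_workers:
                break
            specs.append(pool['profile_prefix'])
    return specs


print(build_specs([{'profile_prefix': 'a', 'workers': 2},
                   {'profile_prefix': 'b', 'workers': 2},
                   {'profile_prefix': 'c', 'workers': 2}], max_workers=4))
# -> ['a', 'a', 'b', 'b']
```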
@@ -1285,48 +1342,18 @@ def main_stress_policy(args):
     worker_pools = exec_control.get('worker_pools', [])
     discovery_config = exec_control.get('worker_pool_discovery')
-    if discovery_config:
-        if worker_pools:
-            logger.warning("Both 'worker_pools' and 'worker_pool_discovery' are defined. 'worker_pool_discovery' will take precedence.")
-        discovery_pattern = discovery_config.get('profile_prefix_pattern')
-        workers_per_group = discovery_config.get('workers_per_profile_group', 1)
+    if discovery_config and not worker_pools:
+        logger.info("Explicit 'worker_pools' not defined. Using 'worker_pool_discovery' as a fallback.")
         manager_for_discovery = profile_managers.get('download')
         if not manager_for_discovery:
             logger.error(f"Could not determine profile manager for worker pool discovery in mode '{orchestration_mode}/{mode}'.")
             return 1
-        if not discovery_pattern:
-            logger.error("'worker_pool_discovery' is missing required key 'profile_prefix_pattern'.")
-            return 1
-        logger.info(f"Discovering worker pools from profile prefixes matching '{discovery_pattern}'...")
-        try:
-            all_profiles = manager_for_discovery.list_profiles()
-            found_prefixes = set()
-            for profile in all_profiles:
-                profile_name = profile['name']
-                if fnmatch.fnmatch(profile_name, discovery_pattern):
-                    # Assuming standard name format like 'user31_001', extract 'user31'
-                    prefix = profile_name.rsplit('_', 1)[0]
-                    found_prefixes.add(prefix)
-            if not found_prefixes:
-                logger.warning(f"Worker pool discovery found no profiles matching pattern '{discovery_pattern}'. No workers will be started.")
-                worker_pools = []
-            else:
-                worker_pools = []
-                for prefix in sorted(list(found_prefixes)):
-                    worker_pools.append({
-                        'profile_prefix': prefix,
-                        'workers': workers_per_group
-                    })
-                logger.info(f"Discovered {len(found_prefixes)} profile groups, creating {workers_per_group} worker(s) for each: {', '.join(sorted(list(found_prefixes)))}")
-        except Exception as e:
-            logger.error(f"Failed to discover profile groups from Redis: {e}", exc_info=True)
-            return 1
+        discovered_pools = _discover_worker_pools(discovery_config, manager_for_discovery)
+        if discovered_pools is None:
+            return 1  # An error occurred
+        worker_pools = discovered_pools
     if not worker_pools and exec_control.get('workers'):
         # Fallback for legacy 'workers: N' config
@@ -1345,6 +1372,8 @@ def main_stress_policy(args):
     worker_specs = []
     worker_id_counter = 0
     for pool in worker_pools:
+        if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+            break
         pool_workers = pool.get('workers', 1)
         prefix_str = pool.get('profile_prefix', '')
         prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
@@ -1354,6 +1383,8 @@ def main_stress_policy(args):
         # Each worker in the pool gets the full list of prefixes from the pool configuration.
         for i in range(pool_workers):
+            if hasattr(args, 'workers') and args.workers is not None and len(worker_specs) >= args.workers:
+                break
             worker_policy = deepcopy(policy)
             # The worker functions will now handle a comma-separated list of prefixes.
             worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
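For reference, the execution-control keys consumed by these hunks can be combined as follows. The key names come from the code above; the values and the glob pattern are purely illustrative, shown as the Python dict the code reads (in practice this lives in a YAML policy file):

```python
# Illustrative execution-control section, mirroring the keys read above.
exec_control = {
    # Either list pools explicitly ...
    'worker_pools': [
        {'profile_prefix': 'user31', 'workers': 2},
        {'profile_prefix': 'user32,user33', 'workers': 1},
    ],
    # ... or, when 'worker_pools' is empty, fall back to discovery from Redis:
    'worker_pool_discovery': {
        'profile_prefix_pattern': 'user*_*',   # illustrative glob
        'workers_per_profile_group': 1,
    },
    # Legacy fallback used only when neither of the above yields pools:
    'workers': 4,
}
```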