diff --git a/ansible/group_vars/all/vault.yml b/ansible/group_vars/all/vault.yml index c088bdf..705b721 100644 --- a/ansible/group_vars/all/vault.yml +++ b/ansible/group_vars/all/vault.yml @@ -1,7 +1,5 @@ vault_redis_password: "rOhTAIlTFFylXsjhqwxnYxDChFc" -vault_postgres_password: "pgdb_pwd_A7bC2xY9zE1wV5uP" -vault_airflow_admin_password: "2r234sdfrt3q454arq45q355" -vault_flower_password: "dO4eXm7UkF81OdMvT8E2tIKFtPYPCzyzwlcZ4RyOmCsmG4qzrNFqM5sNTOT9" +vault_redis_port: 52909 vault_vnc_password: "vnc_pwd_Z5xW8cV2bN4mP7lK" vault_ss_password_1: "UCUAR7vRO/u9Zo71nfA13c+/b1MCiJpfZJo+EmEBCfA=" vault_ss_password_2: "tgtQcfjJp/A3F01g4woO0bEQoxij3CAOK/iR1OTPuF4=" diff --git a/ansible/manage-processes-tasks.yml b/ansible/manage-processes-tasks.yml new file mode 100644 index 0000000..afc4d99 --- /dev/null +++ b/ansible/manage-processes-tasks.yml @@ -0,0 +1,102 @@ +--- +# This file is managed by include_tasks and is not a standalone playbook. +# It provides generic tasks for managing a process within a tmux session. +# Required variables: +# - tmux_session_name: The name of the tmux session. +# - working_dir: The directory where the command should be executed. +# - command_to_run: The command to execute inside the tmux session. +# - process_grep_pattern: A regex pattern to identify the process for pkill/status check. +# +# Optional variables (control flags): +# - start_process: (bool) Set to true to start the process. Default: false. +# - stop_process: (bool) Set to true to stop the process. Default: false. +# - check_status: (bool) Set to true to check and display the process status. Default: false. + +- name: Ensure tmux is installed + ansible.builtin.apt: + name: tmux + state: present + become: yes + run_once: true # No need to run this on every include + +- name: Stop existing tmux session + ansible.builtin.shell: + cmd: "tmux kill-session -t {{ tmux_session_name }} 2>/dev/null || true" + when: stop_process | default(false) | bool + +- name: Kill any orphaned processes + ansible.builtin.shell: + cmd: | + PIDS=$(ps aux | grep -E "{{ process_grep_pattern }}" | grep -v "grep" | awk '{print $2}') + if [ -n "$PIDS" ]; then + kill $PIDS >/dev/null 2>&1 || true + sleep 0.5 + kill -9 $PIDS >/dev/null 2>&1 || true + fi + changed_when: false + when: stop_process | default(false) | bool + +- name: Stop existing process before starting (makes start idempotent) + block: + - name: Stop existing tmux session + ansible.builtin.shell: + cmd: "tmux kill-session -t {{ tmux_session_name }} 2>/dev/null || true" + + - name: Kill any orphaned processes + ansible.builtin.shell: + cmd: | + PIDS=$(ps aux | grep -E "{{ process_grep_pattern }}" | grep -v "grep" | awk '{print $2}') + if [ -n "$PIDS" ]; then + kill $PIDS >/dev/null 2>&1 || true + sleep 0.5 + kill -9 $PIDS >/dev/null 2>&1 || true + fi + changed_when: false + when: start_process | default(false) | bool + +- name: Ensure client script is executable + ansible.builtin.file: + path: "{{ working_dir }}/bin/ytops-client" + mode: "a+x" + when: start_process | default(false) | bool + +- name: Display command for tmux session + ansible.builtin.debug: + msg: "Command for tmux session '{{ tmux_session_name }}': {{ command_to_run }}" + when: start_process | default(false) | bool + +- name: Start process in tmux session + ansible.builtin.shell: + cmd: | + cd {{ working_dir }} + # The command is wrapped with a final sleep to keep the tmux session alive for debugging even if the process fails. + # The actual command is run in a subshell (...) 
to prevent 'exec' from terminating the parent shell. + tmux new-session -d -s {{ tmux_session_name }} \ + "if [ -f .env ]; then set -a; . ./.env; set +a; fi; \ + COMMAND_TO_RUN='{{ command_to_run | replace("'", "'\\''") }}'; \ + echo '>>> Running command:'; \ + echo "\$COMMAND_TO_RUN"; \ + echo '---'; \ + (eval "\$COMMAND_TO_RUN") ; \ + echo; echo '---'; echo 'Process exited. This tmux session will remain open for debugging.'; echo 'You can attach with: tmux attach -t {{ tmux_session_name }}'; \ + sleep 3600" + when: start_process | default(false) | bool + +- name: Check process status + ansible.builtin.shell: + cmd: | + echo "Process status on {{ inventory_hostname }} for session '{{ tmux_session_name }}':" + if tmux has-session -t {{ tmux_session_name }} 2>/dev/null; then + echo " - Tmux session '{{ tmux_session_name }}' is running" + ps aux | grep -E "{{ process_grep_pattern }}" | grep -v grep || echo " - No matching process found" + else + echo " - Tmux session '{{ tmux_session_name }}' is NOT running" + fi + register: status_check + changed_when: false + when: check_status | default(false) | bool + +- name: Display status + ansible.builtin.debug: + msg: "{{ status_check.stdout_lines }}" + when: check_status | default(false) | bool diff --git a/ansible/playbook-README.md b/ansible/playbook-README.md new file mode 100644 index 0000000..9b67956 --- /dev/null +++ b/ansible/playbook-README.md @@ -0,0 +1,219 @@ +# Ansible Playbooks Documentation + +This document provides an overview of all available playbooks, their purpose, and how to use them operationally. + +## Table of Contents + +1. [Deployment Workflow](#deployment-workflow) +2. [Initial Setup Playbooks](#initial-setup-playbooks) +3. [Operational Playbooks](#operational-playbooks) +4. [Monitoring and Inspection](#monitoring-and-inspection) +5. [Common Operations](#common-operations) + +## Deployment Workflow + +The typical deployment workflow follows these steps: + +1. **Initial Setup**: Configure machines, deploy proxies, install dependencies +2. **Master Setup**: Configure Redis, MinIO, and other infrastructure services +3. **Worker Setup**: Configure auth generators and download simulators +4. **Operational Management**: Start/stop processes, monitor status, cleanup profiles + +## Initial Setup Playbooks + +For a complete, automated installation on fresh nodes, you can use the main `full-install` playbook: + +```bash +# Run the complete installation process on all nodes +ansible-playbook ansible/playbook-full-install.yml -i ansible/inventory.green.ini +``` + +The steps below are for running each part of the installation manually. 
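+
+Before running the individual playbooks, it can also help to confirm that the generated inventory parses and that every node is reachable. A minimal pre-flight check (an optional sketch, assuming Ansible is available on the jump host and `ansible/inventory.green.ini` has already been generated) could be:
+
+```bash
+# Show the host/group structure resolved from the inventory file
+ansible-inventory -i ansible/inventory.green.ini --graph
+
+# Verify SSH connectivity and a usable Python interpreter on every node
+ansible all -i ansible/inventory.green.ini -m ping
+```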
+ +### Base Installation + +```bash +# Deploy base system requirements to all nodes +ansible-playbook ansible/playbook-base-system.yml -i ansible/inventory.green.ini +``` + +### Proxy Deployment + +```bash +# Deploy shadowsocks proxies to all nodes (as defined in cluster config) +ansible-playbook ansible/playbook-proxies.yml -i ansible/inventory.green.ini +``` + +### Code, Environment, and Dependencies + +```bash +# Sync code to all nodes +ansible-playbook ansible/playbook-stress-sync-code.yml -i ansible/inventory.green.ini + +# Generate .env files for all nodes +ansible-playbook ansible/playbook-stress-generate-env.yml -i ansible/inventory.green.ini + +# Install dependencies on all nodes +ansible-playbook ansible/playbook-stress-install-deps.yml -i ansible/inventory.green.ini +``` + +## Operational Playbooks + +### Master Node Services + +Redis and MinIO are deployed as Docker containers during the initial setup (`playbook-full-install.yml`). + +To start the policy enforcer and monitoring tmux sessions on the master node: +```bash +# Start policy enforcer and monitoring on master +ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "start_enforcer=true start_monitor=true" +``` + +To stop processes on the master node: +```bash +# Stop ONLY the policy enforcer +ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "stop_enforcer=true" + +# Stop ONLY the monitor +ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "stop_monitor=true" + +# Stop BOTH the enforcer and monitor +ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "stop_sessions=true" +``` + +### Worker Node Processes + +```bash +# Start auth generators and download simulators on all workers +ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=start" + +# Stop all processes on workers +ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=stop" + +# Check status of all worker processes +ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=status" +``` + +### Profile Management + +```bash +# Clean up all profiles +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" + +# Clean up specific profile prefix +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "profile_prefix=user1" +``` + +## Monitoring and Inspection + +### Status Checks + +```bash +# Check status of all processes on all nodes +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=status" + +# Check enforcer status on master +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=check-enforcer" + +# Check profile status +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=profile-status" +``` + +### Log Inspection + +```bash +# View tmux session output on a specific node +ssh user@hostname +tmux attach -t stress-auth-user1,user2 # For auth generator +tmux attach -t stress-download-user1,user2 # For download simulator +tmux attach -t stress-enforcer # For policy enforcer on master +``` + +## Common Operations + +### Code and Policy Updates + +First, sync your local changes to the jump host from your development machine: 
+```bash +# Sync project to jump host +./tools/sync-to-jump.sh +``` + +Then, from the jump host, you can sync code or policies to the cluster nodes: +```bash +# Sync all application code (Python sources, scripts, etc.) +ansible-playbook ansible/playbook-stress-sync-code.yml -i ansible/inventory.green.ini + +# Sync only policies and CLI configs +ansible-playbook ansible/playbook-stress-sync-configs.yml -i ansible/inventory.green.ini +``` + +### Adding a New Worker + +1. Update `cluster.green.yml` with the new worker definition: + ```yaml + workers: + new-worker: + ip: x.x.x.x + port: 22 + profile_prefixes: + - "user4" + proxies: + - "sslocal-rust-1090" + ``` + +2. Regenerate inventory: + ```bash + ./tools/generate-inventory.py cluster.green.yml + ``` + +3. Run the full installation playbook, limiting it to the new worker: + ```bash + ansible-playbook ansible/playbook-full-install.yml -i ansible/inventory.green.ini --limit new-worker + ``` + +4. Start processes on the new worker: + ```bash + ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=start" --limit new-worker + ``` + +### Removing a Worker + +1. Stop all processes on the worker: + ```bash + ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=stop" --limit worker-to-remove + ``` + +2. Remove the worker from `cluster.green.yml` + +3. Regenerate inventory: + ```bash + ./tools/generate-inventory.py cluster.green.yml + ``` + +### Emergency Stop All + +```bash +# Stop all processes on all nodes +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=stop-all" +``` + +### Stopping Specific Nodes + +To stop processes on a specific worker or group of workers, you can use the `stop-nodes` action and limit the playbook run. + +```bash +# Stop all processes on a single worker (e.g., dl003) +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=stop-nodes" --limit dl003 + +# Stop all processes on all workers +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=stop-nodes" --limit workers +``` + +### Restart Enforcer and Monitoring + +```bash +# Restart monitoring and enforcer on master +ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=restart-monitoring" +``` + diff --git a/ansible/playbook-base-system.yml b/ansible/playbook-base-system.yml index 2e6c7d7..d4716b4 100644 --- a/ansible/playbook-base-system.yml +++ b/ansible/playbook-base-system.yml @@ -3,9 +3,15 @@ hosts: all gather_facts: yes vars_files: - - "group_vars/all/generated_vars.stress.yml" # Assumes generate-inventory.py was run with cluster.stress.yml - "group_vars/all/vault.yml" pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" - name: Announce base system setup ansible.builtin.debug: msg: "Starting base system setup on {{ inventory_hostname }}" diff --git a/ansible/playbook-docker-services-setup.yml b/ansible/playbook-docker-services-setup.yml index 45227fe..c7b32c6 100644 --- a/ansible/playbook-docker-services-setup.yml +++ b/ansible/playbook-docker-services-setup.yml @@ -20,12 +20,19 @@ hosts: all gather_facts: no vars_files: - - "group_vars/all/generated_vars.stress.yml" - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" tasks: - name: Define base directory for node ansible.builtin.set_fact: - base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}" + base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" - name: Ensure base directories and subdirectories exist ansible.builtin.file: @@ -41,12 +48,12 @@ - "run/docker_mount/fetched_info_jsons" become: yes -- name: "PHASE 2.2: Import playbook to install Python dependencies" - import_playbook: playbook-stress-install-deps.yml - -- name: "PHASE 2.3: Import playbook to sync local code" +- name: "PHASE 2.2: Import playbook to sync local code" import_playbook: playbook-stress-sync-code.yml +- name: "PHASE 2.3: Import playbook to install Python dependencies" + import_playbook: playbook-stress-install-deps.yml + # ------------------------------------------------------------------------------------------------- # PHASE 3: Environment and Service Configuration # Generates the .env file and starts the role-specific services on master and workers. @@ -55,11 +62,17 @@ import_playbook: playbook-stress-generate-env.yml - name: "PHASE 3.2: Master Node Services Setup" - hosts: airflow_master + hosts: master gather_facts: no + vars: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" vars_files: - - "group_vars/all/generated_vars.stress.yml" - "group_vars/all/vault.yml" + pre_tasks: + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" tasks: - name: Configure system performance and kernel settings ansible.builtin.copy: @@ -94,6 +107,16 @@ mode: '0644' become: yes + - name: Stop and remove existing containers before starting services + ansible.builtin.shell: + cmd: | + docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + become: yes + changed_when: false + ignore_errors: yes + - name: Start master services (Redis, MinIO) community.docker.docker_compose_v2: project_src: "{{ airflow_master_dir }}" @@ -148,16 +171,63 @@ environment: HOME: "/home/{{ ansible_user }}" -- name: "PHASE 3.3: Shared Storage Setup (s3fs)" - hosts: airflow_master:airflow_workers +- name: "PHASE 3.2a: Worker Node Services Setup" + hosts: workers gather_facts: no + vars: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" vars_files: - - "group_vars/all/generated_vars.stress.yml" - "group_vars/all/vault.yml" + pre_tasks: + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Template Docker Compose file for worker services + ansible.builtin.template: + src: templates/docker-compose.stress-master.j2 + dest: "{{ airflow_worker_dir }}/docker-compose.stress.yml" + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0644' + become: yes + + - name: Stop and remove existing containers before starting services + ansible.builtin.shell: + cmd: | + docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + become: yes + changed_when: false + ignore_errors: yes + + - name: Start worker services + community.docker.docker_compose_v2: + project_src: "{{ airflow_worker_dir }}" + files: + - docker-compose.stress.yml + state: present + remove_orphans: true + become: yes + +- name: "PHASE 3.3: Shared Storage Setup (s3fs)" + hosts: master:workers + gather_facts: no + vars: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" tasks: - name: Define base directory for node ansible.builtin.set_fact: - base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}" + base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" - name: Mount S3 buckets via s3fs block: @@ -176,14 +246,124 @@ mode: '0600' become: yes + - name: Check if mount points are already mounted + ansible.builtin.shell: + cmd: "mount | grep -q '{{ item.path }}'" + loop: + - { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' } + - { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' } + register: mount_check + changed_when: false + failed_when: false + + - name: Ensure mount point directories exist (only if not mounted) + ansible.builtin.file: + path: "{{ item.item.path }}" + state: directory + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0755' + loop: "{{ mount_check.results }}" + when: item.rc != 0 + become: yes + - name: Mount S3 buckets for stress testing ansible.posix.mount: src: "s3fs#{{ item.bucket }}" path: "{{ item.path }}" fstype: fuse - opts: "_netdev,allow_other,use_path_request_style,nonempty,url=http://{{ hostvars[groups['airflow_master'][0]].ansible_host }}:9000,passwd_file=/home/{{ ansible_user }}/.passwd-s3fs" + opts: "_netdev,allow_other,use_path_request_style,nonempty,url=http://{{ hostvars[groups['master'][0]].ansible_host }}:9000,passwd_file=/home/{{ ansible_user }}/.passwd-s3fs" state: mounted loop: - { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' } - { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' } become: yes + +- name: "PHASE 3.4: Import playbook to initialize Redis profiles" + import_playbook: playbook-stress-init-redis.yml + +# ------------------------------------------------------------------------------------------------- +# PHASE 4: Monitoring and Management Services Setup +# Starts monitoring, enforcer, and simulation processes. +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 4.1: Import playbook to manage monitoring and enforcer processes" + import_playbook: playbook-stress-manage-processes.yml + +- name: "PHASE 4.2: Start monitoring and enforcer services" + hosts: master + gather_facts: no + vars: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + start_monitor: true + start_enforcer: true + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Ensure tmux is installed + ansible.builtin.apt: + name: tmux + state: present + become: yes + + - name: Start profile monitoring in tmux session + ansible.builtin.shell: + cmd: | + cd {{ airflow_master_dir }} + tmux new-session -d -s stress-monitor \ + "set -a && . ./.env && set +a && \ + ./bin/ytops-client profile list \ + --auth-env sim_auth \ + --download-env sim_download \ + --live \ + --no-blink \ + --show-reasons" + when: start_monitor | default(false) | bool + + - name: Start policy enforcer in tmux session + ansible.builtin.shell: + cmd: | + cd {{ airflow_master_dir }} + tmux new-session -d -s stress-enforcer \ + "set -a && . 
./.env && set +a && \ + ./bin/ytops-client policy-enforcer \ + --policy policies/8_unified_simulation_enforcer.yaml \ + --live" + when: start_enforcer | default(false) | bool + + - name: List active tmux sessions + ansible.builtin.shell: + cmd: tmux list-sessions + register: tmux_sessions + changed_when: false + + - name: Display active sessions + ansible.builtin.debug: + msg: "Active tmux sessions: {{ tmux_sessions.stdout_lines }}" + +# ------------------------------------------------------------------------------------------------- +# PHASE 5: Simulation Workload Generation (Optional - can be run manually) +# These playbooks are available but not automatically started by default. +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 5.1: Note about simulation workload generation" + hosts: localhost + gather_facts: no + tasks: + - name: Display note about simulation playbooks + ansible.builtin.debug: + msg: | + Simulation workload generation playbooks are available: + - ansible/playbook-stress-auth-generator.yml + - ansible/playbook-stress-download-simulation.yml + + To start simulations manually, run: + ansible-playbook ansible/playbook-stress-auth-generator.yml \ + -e "start_generator=true dummy_batch=true auth_min_seconds=2 auth_max_seconds=3" + + ansible-playbook ansible/playbook-stress-download-simulation.yml \ + -e "start_download=true profile_prefix=user1 download_min_seconds=2 download_max_seconds=5" \ + --limit airflow_workers[0] diff --git a/ansible/playbook-full-install.yml b/ansible/playbook-full-install.yml new file mode 100644 index 0000000..f6d33d5 --- /dev/null +++ b/ansible/playbook-full-install.yml @@ -0,0 +1,404 @@ +--- +# This playbook provides a complete installation for fresh nodes. +# It can install either master or worker roles, or both on the same machine. +# +# Usage examples: +# # Install everything on all nodes +# ansible-playbook ansible/playbook-full-install.yml +# +# # Install only on workers +# ansible-playbook ansible/playbook-full-install.yml --limit workers +# +# # Install only on master +# ansible-playbook ansible/playbook-full-install.yml --limit master +# +# # Install both roles on a single machine +# ansible-playbook ansible/playbook-full-install.yml --limit specific-host -e "install_master=true install_worker=true" + +# ------------------------------------------------------------------------------------------------- +# PHASE 1: Base System Configuration +# Ensures all nodes have the necessary base packages, user configurations, and Docker installed. 
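+#
+# The imported playbook can also be previewed on its own before committing to a full run;
+# a dry run (an optional sketch, assuming the green inventory has already been generated) would be:
+#   ansible-playbook ansible/playbook-base-system.yml -i ansible/inventory.green.ini --check --diff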
+# ------------------------------------------------------------------------------------------------- +- name: "PHASE 1: Import base system setup playbook" + import_playbook: playbook-base-system.yml + +# ------------------------------------------------------------------------------------------------- +# PHASE 2: Generate Environment Configuration +# Creates .env files needed by all subsequent steps +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 2: Generate .env configuration" + import_playbook: playbook-stress-generate-env.yml + +# ------------------------------------------------------------------------------------------------- +# PHASE 3: Docker Network Setup +# Ensures the shared Docker network exists before building containers +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 3: Ensure Docker network exists" + hosts: all + gather_facts: no + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Create shared Docker network + community.docker.docker_network: + name: "{{ docker_network_name }}" + driver: bridge + become: yes + +# ------------------------------------------------------------------------------------------------- +# PHASE 4: Build yt-dlp Docker Image +# Builds the yt-dlp container from bin/ directory +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 4: Build yt-dlp Docker image" + hosts: all + gather_facts: no + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Define base directory for node + ansible.builtin.set_fact: + base_dir: "{{ airflow_master_dir if (inventory_hostname in groups['master'] and not (install_worker | default(false) | bool)) else airflow_worker_dir }}" + + - name: Ensure bin directory exists + ansible.builtin.file: + path: "{{ base_dir }}/bin" + state: directory + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0755' + become: yes + + - name: Check if Dockerfile exists in bin directory + ansible.builtin.stat: + path: "{{ base_dir }}/bin/Dockerfile" + register: dockerfile_stat + + - name: Build yt-dlp Docker image if Dockerfile exists + community.docker.docker_image: + name: yt-dlp-custom + tag: latest + source: build + build: + path: "{{ base_dir }}/bin" + pull: yes + state: present + force_source: yes + become: yes + when: dockerfile_stat.stat.exists + + - name: Display message if Dockerfile not found + ansible.builtin.debug: + msg: "Dockerfile not found at {{ base_dir }}/bin/Dockerfile - skipping yt-dlp image build" + when: not dockerfile_stat.stat.exists + +# ------------------------------------------------------------------------------------------------- +# PHASE 5: Sync Code and Install Dependencies +# Copies application code and installs Python dependencies +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 5.1: Sync application code" + import_playbook: playbook-stress-sync-code.yml + +- name: "PHASE 5.2: Install Python dependencies" + import_playbook: playbook-stress-install-deps.yml + +# ------------------------------------------------------------------------------------------------- +# PHASE 6: Deploy Shadowsocks Proxies +# Configures and starts proxy services +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 6: Deploy proxy services" + import_playbook: playbook-proxies.yml + +# ------------------------------------------------------------------------------------------------- +# PHASE 7: Install bgutils +# Note: Currently bgutils is deployed on master via docker-compose +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 7: Install bgutils" + import_playbook: playbook-install-bgutils.yml + +# ------------------------------------------------------------------------------------------------- +# PHASE 8: Master-Specific Services Setup +# Starts Redis, MinIO, and other master-only services +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 8: Master Node Services Setup" + hosts: master + gather_facts: no + vars: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Configure system performance and kernel settings + ansible.builtin.copy: + src: "configs/etc/sysctl.d/99-system-limits.conf" + dest: "/etc/sysctl.d/99-system-limits.conf" + owner: root + group: root + mode: '0644' + become: yes + register: sysctl_config_copy + + - name: Apply sysctl settings + ansible.builtin.command: sysctl --system + become: yes + when: sysctl_config_copy.changed + + - name: Ensure MinIO data directory exists + ansible.builtin.file: + path: "{{ airflow_master_dir }}/minio-data" + state: directory + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0755' + become: yes + + - name: Template Docker Compose file for master services + ansible.builtin.template: + src: templates/docker-compose.stress-master.j2 + dest: "{{ airflow_master_dir }}/docker-compose.stress.yml" + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0644' + become: yes + + - name: Stop and remove existing containers before starting services + ansible.builtin.shell: + cmd: | + docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + become: yes + changed_when: false + ignore_errors: yes + + - name: Start master services (Redis, MinIO) + community.docker.docker_compose_v2: + project_src: "{{ airflow_master_dir }}" + files: + - docker-compose.stress.yml + state: present + remove_orphans: true + become: yes + + - name: Wait for MinIO service to be ready + ansible.builtin.wait_for: + host: "{{ hostvars[inventory_hostname].ansible_host }}" + port: 9000 + delay: 5 + timeout: 60 + delegate_to: localhost + + - name: Download MinIO Client (mc) if not present + ansible.builtin.command: + cmd: wget https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc + creates: /usr/local/bin/mc + become: yes + + - name: Ensure MinIO Client (mc) is executable + ansible.builtin.file: + path: /usr/local/bin/mc + mode: '0755' + become: yes + + - name: Configure mc alias for local MinIO + ansible.builtin.command: > + mc alias set local http://localhost:9000 {{ vault_s3_access_key_id }} {{ vault_s3_secret_access_key }} + become: yes + become_user: "{{ ansible_user }}" + changed_when: false + environment: + HOME: "/home/{{ ansible_user }}" + + - name: Ensure S3 buckets exist in MinIO using mc + ansible.builtin.command: > + mc mb local/{{ item }} + loop: + - "stress-inputs" + - "stress-jsons" + become: yes + become_user: "{{ ansible_user }}" + register: mc_mb_result + failed_when: > + mc_mb_result.rc != 0 and + "already exists" not in mc_mb_result.stderr + changed_when: mc_mb_result.rc == 0 + environment: + HOME: "/home/{{ ansible_user }}" + +# ------------------------------------------------------------------------------------------------- +# PHASE 9: Worker-Specific Services Setup +# Starts worker-only services if needed +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 9: Worker Node Services Setup" + hosts: workers + gather_facts: no + vars: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ 
'.' + inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Template Docker Compose file for worker services + ansible.builtin.template: + src: templates/docker-compose.stress-master.j2 + dest: "{{ airflow_worker_dir }}/docker-compose.stress.yml" + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0644' + become: yes + + - name: Stop and remove existing containers before starting services + ansible.builtin.shell: + cmd: | + docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f + become: yes + changed_when: false + ignore_errors: yes + + - name: Start worker services + community.docker.docker_compose_v2: + project_src: "{{ airflow_worker_dir }}" + files: + - docker-compose.stress.yml + state: present + remove_orphans: true + become: yes + +# ------------------------------------------------------------------------------------------------- +# PHASE 10: Shared Storage Setup (s3fs) +# Mounts S3 buckets on all nodes +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 10: Shared Storage Setup (s3fs)" + hosts: master:workers + gather_facts: no + vars: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Define base directory for node + ansible.builtin.set_fact: + base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" + + - name: Mount S3 buckets via s3fs + block: + - name: Install s3fs for mounting S3 buckets + ansible.builtin.apt: + name: s3fs + state: present + become: yes + + - name: Configure s3fs credentials + ansible.builtin.copy: + content: "{{ vault_s3_access_key_id }}:{{ vault_s3_secret_access_key }}" + dest: "/home/{{ ansible_user }}/.passwd-s3fs" + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0600' + become: yes + + - name: Check if mount points are already mounted + ansible.builtin.shell: + cmd: "mount | grep -q '{{ item.path }}'" + loop: + - { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' } + - { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' } + register: mount_check + changed_when: false + failed_when: false + + - name: Ensure mount point directories exist (only if not mounted) + ansible.builtin.file: + path: "{{ item.item.path }}" + state: directory + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0755' + loop: "{{ mount_check.results }}" + when: item.rc != 0 + become: yes + + - name: Mount S3 buckets for stress testing + ansible.posix.mount: + src: "s3fs#{{ item.bucket }}" + path: "{{ item.path }}" + fstype: fuse + opts: "_netdev,allow_other,use_path_request_style,nonempty,url=http://{{ hostvars[groups['master'][0]].ansible_host }}:9000,passwd_file=/home/{{ ansible_user }}/.passwd-s3fs" + state: mounted + loop: + - { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' } + - { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' } + become: yes + +# 
------------------------------------------------------------------------------------------------- +# PHASE 11: Initialize Redis (Master Only) +# Sets up profiles and policies in Redis +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 11: Initialize Redis profiles" + import_playbook: playbook-stress-init-redis.yml + +# ------------------------------------------------------------------------------------------------- +# PHASE 12: Final Status and Next Steps +# ------------------------------------------------------------------------------------------------- +- name: "PHASE 12: Installation Complete" + hosts: localhost + gather_facts: no + tasks: + - name: Display installation completion message + ansible.builtin.debug: + msg: | + ======================================== + Full installation complete! + ======================================== + + Next steps: + + 1. Start monitoring and enforcer (on master): + ansible-playbook ansible/playbook-stress-manage-processes.yml \ + -e "start_monitor=true start_enforcer=true" + + 2. Start auth generator (on master): + ansible-playbook ansible/playbook-stress-auth-generator.yml \ + -e "start_generator=true dummy_batch=true auth_min_seconds=2 auth_max_seconds=3" + + 3. Start download simulation (on workers): + ansible-playbook ansible/playbook-stress-download-simulation.yml \ + -e "start_download=true profile_prefix=user1 download_min_seconds=2 download_max_seconds=5" \ + --limit workers + + 4. Check status: + ansible-playbook ansible/playbook-stress-control.yml -e "action=status" + + 5. Monitor profiles: + ansible-playbook ansible/playbook-stress-control.yml -e "action=profile-status" diff --git a/ansible/playbook-install-bgutils.yml b/ansible/playbook-install-bgutils.yml new file mode 100644 index 0000000..a59b3de --- /dev/null +++ b/ansible/playbook-install-bgutils.yml @@ -0,0 +1,18 @@ +--- +- name: "UTIL-SETUP: Install and configure bgutils container on workers" + hosts: workers + gather_facts: yes + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Note that bgutil-provider is now on master + ansible.builtin.debug: + msg: "The bgutil-provider service is now deployed on the master node via docker-compose and is no longer deployed on workers." diff --git a/ansible/playbook-install-cleanup-media.yml b/ansible/playbook-install-cleanup-media.yml new file mode 100644 index 0000000..129d541 --- /dev/null +++ b/ansible/playbook-install-cleanup-media.yml @@ -0,0 +1,165 @@ +--- +- name: "UTIL-SETUP: Install media cleanup script and configure cron" + hosts: all + gather_facts: yes + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Define base directory for node + ansible.builtin.set_fact: + base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" + + - name: Ensure cleanup script directory exists + ansible.builtin.file: + path: "{{ base_dir }}/bin" + state: directory + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0755' + become: yes + + - name: Copy cleanup_media.py script + ansible.builtin.copy: + src: "{{ playbook_dir }}/../yt-ops-services-debug/cleanup_media.py" + dest: "{{ base_dir }}/bin/cleanup_media.py" + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0755' + become: yes + + - name: Install s5cmd for S3 uploads + block: + - name: Check if s5cmd is already installed + ansible.builtin.stat: + path: /usr/local/bin/s5cmd + register: s5cmd_binary + + - name: Download and install s5cmd + block: + - name: Create temporary directory for s5cmd download + ansible.builtin.tempfile: + state: directory + suffix: s5cmd + register: s5cmd_temp_dir + + - name: Download s5cmd + ansible.builtin.get_url: + url: "https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_Linux-64bit.tar.gz" + dest: "{{ s5cmd_temp_dir.path }}/s5cmd.tar.gz" + mode: '0644' + + - name: Extract s5cmd + ansible.builtin.unarchive: + src: "{{ s5cmd_temp_dir.path }}/s5cmd.tar.gz" + dest: "{{ s5cmd_temp_dir.path }}" + remote_src: yes + + - name: Install s5cmd to /usr/local/bin + ansible.builtin.copy: + src: "{{ s5cmd_temp_dir.path }}/s5cmd" + dest: /usr/local/bin/s5cmd + mode: '0755' + remote_src: yes + + - name: Clean up temporary directory + ansible.builtin.file: + path: "{{ s5cmd_temp_dir.path }}" + state: absent + when: not s5cmd_binary.stat.exists + become: yes + + - name: Ensure log directory exists + ansible.builtin.file: + path: "/var/log" + state: directory + owner: root + group: root + mode: '0755' + become: yes + + - name: Create wrapper script to source .env before running cleanup + ansible.builtin.copy: + content: | + #!/bin/bash + # Wrapper script to run cleanup_media.py with environment variables from .env + + set -e + + BASE_DIR="{{ base_dir }}" + + # Source .env file if it exists + if [ -f "${BASE_DIR}/.env" ]; then + set -a + source "${BASE_DIR}/.env" + set +a + fi + + # Determine cleanup mode based on environment variable or default + CLEANUP_MODE="${CLEANUP_MODE:-{{ cleanup_settings.mode | default('s3-upload') }}}" + + # Run cleanup script + cd "${BASE_DIR}" + + if [ "$CLEANUP_MODE" = "s3-upload" ]; then + # S3 upload mode - uploads to S3 then deletes + exec python3 "${BASE_DIR}/bin/cleanup_media.py" \ + --target-dir "${BASE_DIR}/run" \ + --target-dir "${BASE_DIR}/downloadfiles" \ + --max-age {{ cleanup_settings.max_age_seconds | default(3600) }} \ + --log-file /var/log/cleanup_media.log \ + --s3-upload \ + --s3-bucket "${S3_BUCKET:-stress-media-archive}" \ + --s3-prefix "archived-media/$(hostname)" \ + --s5cmd-path /usr/local/bin/s5cmd + else + # Simple cleanup mode - just truncate and rename + exec python3 "${BASE_DIR}/bin/cleanup_media.py" \ + --target-dir "${BASE_DIR}/run" \ + --target-dir "${BASE_DIR}/downloadfiles" \ + --max-age {{ cleanup_settings.max_age_seconds | default(3600) }} \ + --log-file /var/log/cleanup_media.log + fi + dest: "{{ base_dir }}/bin/cleanup_media_wrapper.sh" + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0755' + become: yes + + - name: Configure cron job for media cleanup + ansible.builtin.cron: + name: "Media cleanup - {{ 
base_dir }}" + minute: "0" + hour: "*" + job: "{{ base_dir }}/bin/cleanup_media_wrapper.sh 2>&1 | logger -t cleanup_media" + user: "{{ ansible_user }}" + state: "{{ 'present' if cleanup_settings.enabled | default(true) | bool else 'absent' }}" + become: yes + + - name: Display installation summary + ansible.builtin.debug: + msg: | + Media cleanup script installed successfully on {{ inventory_hostname }} + + Configuration from cluster.green.yml: + - Base directory: {{ base_dir }} + - Enabled: {{ cleanup_settings.enabled | default(true) }} + - Cleanup mode: {{ cleanup_settings.mode | default('s-upload') }} + - Max age: {{ cleanup_settings.max_age_seconds | default(3600) }} seconds + - Cron schedule: Every hour (0 * * * *) + - Cron job state: {{ 'present' if cleanup_settings.enabled | default(true) | bool else 'absent' }} + - Log file: /var/log/cleanup_media.log + + Note: You can override the cleanup mode for a single run by setting the + CLEANUP_MODE environment variable before executing the wrapper script. + e.g., CLEANUP_MODE=cleanup {{ base_dir }}/bin/cleanup_media_wrapper.sh + + To view logs: + tail -f /var/log/cleanup_media.log diff --git a/ansible/playbook-proxies.yml b/ansible/playbook-proxies.yml index 61b6594..749889a 100644 --- a/ansible/playbook-proxies.yml +++ b/ansible/playbook-proxies.yml @@ -1,7 +1,17 @@ --- - name: Deploy Shadowsocks-Rust Proxy Configurations - hosts: all + hosts: workers gather_facts: yes + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" tasks: - name: Deploy Shadowsocks-Rust proxy services block: @@ -55,11 +65,18 @@ path: /srv/shadowsocks-rust/docker-compose.yaml state: absent - - name: Force stop and remove known proxy containers to prevent conflicts - community.docker.docker_container: - name: "{{ item.key }}" - state: absent + - name: Find and stop any container using the target proxy ports + ansible.builtin.shell: + cmd: | + container_id=$(docker ps -aq --filter "publish={{ item.value.local_port }}") + if [ -n "$container_id" ]; then + echo "Found container ${container_id} using port {{ item.value.local_port }}. Stopping and removing it." 
+ docker stop "${container_id}" >/dev/null 2>&1 || true + docker rm -f "${container_id}" >/dev/null 2>&1 || true + fi loop: "{{ shadowsocks_proxies | dict2items }}" + register: stop_conflicting_containers + changed_when: "'Stopping and removing it' in stop_conflicting_containers.stdout" loop_control: label: "{{ item.key }}" diff --git a/ansible/playbook-stress-auth-generator.yml b/ansible/playbook-stress-auth-generator.yml new file mode 100644 index 0000000..cda41c2 --- /dev/null +++ b/ansible/playbook-stress-auth-generator.yml @@ -0,0 +1,95 @@ +--- +- name: "STRESS-SETUP: Manage auth simulation generator" + hosts: workers + gather_facts: no + vars: + tmux_session_auth_gen: "stress-auth-{{ (profile_prefix | default('default')) | replace(',', '-') }}" + auth_policy: "policies/12_queue_auth_simulation.yaml" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Define base directory for node + ansible.builtin.set_fact: + base_dir: "{{ airflow_worker_dir }}" + + - name: Validate profile_prefix is provided when starting or stopping + ansible.builtin.fail: + msg: "profile_prefix is required when start_generator=true or stop_generator=true" + when: + - (start_generator | default(false) | bool or stop_generator | default(false) | bool) + - profile_prefix is not defined + + - name: "Display policy being used for auth generator" + ansible.builtin.debug: + msg: "Using auth generator policy: {{ auth_policy }}" + when: start_generator | default(false) | bool + + - name: Manage auth generator process + ansible.builtin.include_tasks: + file: manage-processes-tasks.yml + vars: + tmux_session_name: "{{ tmux_session_auth_gen }}" + working_dir: "{{ base_dir }}" + command_to_run: > + ./bin/ytops-client stress-policy + --policy {{ auth_policy }} + {% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %} + {% if auth_min_seconds is defined %}--set 'settings.dummy_simulation_settings.auth_min_seconds={{ auth_min_seconds }}'{% endif %} + {% if auth_max_seconds is defined %}--set 'settings.dummy_simulation_settings.auth_max_seconds={{ auth_max_seconds }}'{% endif %} + {% if batch_size is defined %}--set 'queue_policy.batch_size={{ batch_size }}'{% endif %} + {% if create_download_tasks is defined %}--set 'queue_policy.create_download_tasks={{ create_download_tasks }}'{% endif %} + {% if formats_to_download is defined %}--set 'queue_policy.formats_to_download={{ formats_to_download }}'{% endif %} + {% if profile_prefix is defined %}--set 'execution_control.worker_pools=[{"profile_prefix": "{{ profile_prefix }}", "workers": 1}]'{% endif %} + --profile-prefix {{ profile_prefix }} + process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ auth_policy }}.*--profile-prefix {{ profile_prefix }}" + start_process: "{{ start_generator | default(false) | bool }}" + stop_process: "{{ stop_generator | default(false) | bool }}" + check_status: "{{ vars.check_status | default(false) | bool }}" + + - name: List active tmux sessions + ansible.builtin.shell: + cmd: tmux list-sessions 2>/dev/null || true + register: tmux_sessions + changed_when: false + + - name: Display active sessions + ansible.builtin.debug: + msg: "Active tmux sessions: 
{{ tmux_sessions.stdout_lines }}" + + - name: Check tmux session output for errors + block: + - name: Wait for a moment for the process to start + ansible.builtin.pause: + seconds: 2 + + - name: Capture tmux pane content + ansible.builtin.shell: + cmd: "tmux capture-pane -p -t {{ tmux_session_auth_gen }}" + register: tmux_output + changed_when: false + ignore_errors: true + + - name: Display tmux pane content if session exists + ansible.builtin.debug: + msg: "Initial output from tmux session '{{ tmux_session_auth_gen }}':" + when: tmux_output.rc == 0 + + - name: Show output lines if session exists + ansible.builtin.debug: + var: tmux_output.stdout_lines + when: tmux_output.rc == 0 + + - name: Report if session not found + ansible.builtin.debug: + msg: "Tmux session '{{ tmux_session_auth_gen }}' was not found. It may have exited immediately upon starting." + when: tmux_output.rc != 0 + + when: start_generator | default(false) | bool diff --git a/ansible/playbook-stress-control.yml b/ansible/playbook-stress-control.yml new file mode 100644 index 0000000..0bdfb9f --- /dev/null +++ b/ansible/playbook-stress-control.yml @@ -0,0 +1,288 @@ +--- +- name: "STRESS-SETUP: Unified control for stress test processes" + hosts: all + gather_facts: no + vars: + # Default action is status check + action: "status" + setup_policy: "policies/6_profile_setup_policy.yaml" + enforcer_policy: "policies/8_unified_simulation_enforcer.yaml" + master_only_actions: + - "check-enforcer" + - "profile-status" + - "run-command" + - "cleanup-profiles" + - "restart-monitoring" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + when: action not in master_only_actions or inventory_hostname in groups['master'] + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" + when: action not in master_only_actions or inventory_hostname in groups['master'] + tasks: + - name: Define base directory for node + ansible.builtin.set_fact: + base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" + when: action not in master_only_actions or inventory_hostname in groups['master'] + + - name: Check running processes (status action) + ansible.builtin.shell: + cmd: | + echo "=== Process status on {{ inventory_hostname }} ===" + echo "1. Tmux sessions:" + tmux list-sessions 2>/dev/null || echo "No tmux sessions" + echo "" + echo "2. ytops-client processes:" + ps aux | grep -E "ytops-client.*(profile|policy-enforcer|stress-policy)" | grep -v grep || echo "No ytops-client processes" + echo "" + echo "3. Python processes related to stress test:" + ps aux | grep -E "python.*(ytops|stress)" | grep -v grep || echo "No related python processes" + register: status_output + changed_when: false + when: action == "status" + + - name: Display status + ansible.builtin.debug: + msg: "{{ status_output.stdout_lines }}" + when: action == "status" + + - name: Check enforcer status (check-enforcer action) + block: + - name: Check for enforcer tmux session and process + ansible.builtin.shell: + cmd: | + echo "Enforcer status on {{ inventory_hostname }}:" + if tmux has-session -t stress-enforcer 2>/dev/null; then + echo " - Tmux session 'stress-enforcer' is RUNNING." 
+ ps aux | grep -E "ytops-client.*policy-enforcer" | grep -v grep || echo " - WARNING: Tmux session exists, but no matching process found." + else + echo " - Tmux session 'stress-enforcer' is NOT RUNNING." + fi + register: enforcer_status + changed_when: false + - name: Display enforcer status + ansible.builtin.debug: + msg: "{{ enforcer_status.stdout_lines }}" + when: action == "check-enforcer" + delegate_to: "{{ groups['master'][0] }}" + run_once: true + + - name: Run ytops-client profile list on master (profile-status action) + ansible.builtin.shell: + cmd: | + cd {{ airflow_master_dir }} + if [ -f .env ]; then + set -a && . ./.env && set +a + fi + timeout 10 ./bin/ytops-client profile list \ + --auth-env sim_auth \ + --download-env sim_download 2>&1 + register: profile_list_output + changed_when: false + when: + - action == "profile-status" + - inventory_hostname in groups['master'] + delegate_to: "{{ groups['master'][0] }}" + run_once: true + + - name: Show profile list output lines + ansible.builtin.debug: + var: profile_list_output.stdout_lines + when: + - action == "profile-status" + - profile_list_output is defined + - inventory_hostname in groups['master'] + + - name: Run custom ytops-client command on master (run-command action) + ansible.builtin.shell: + cmd: | + cd {{ airflow_master_dir }} + if [ -f .env ]; then + set -a && . ./.env && set +a + fi + {{ command_to_run }} + register: command_output + changed_when: false + when: + - action == "run-command" + - inventory_hostname in groups['master'] + - command_to_run is defined + delegate_to: "{{ groups['master'][0] }}" + run_once: true + + - name: Display command output + ansible.builtin.debug: + msg: "Command output:" + when: + - action == "run-command" + - command_output is defined + + - name: Show command output lines + ansible.builtin.debug: + var: command_output.stdout_lines + when: + - action == "run-command" + - command_output is defined + + - name: "Display policy being used for profile cleanup" + ansible.builtin.debug: + msg: "Using setup policy for cleanup: {{ setup_policy }}" + when: + - action == "cleanup-profiles" + - inventory_hostname in groups['master'] + delegate_to: "{{ groups['master'][0] }}" + run_once: true + + - name: Cleanup profiles on master (cleanup-profiles action) + ansible.builtin.shell: + cmd: | + cd {{ airflow_master_dir }} + if [ -f .env ]; then + set -a && . 
./.env && set +a + fi + ./bin/ytops-client setup-profiles \ + --policy {{ setup_policy }} \ + --cleanup-all {% if profile_prefix is defined %}--profile-prefix {{ profile_prefix }}{% endif %} + register: cleanup_output + changed_when: false + when: + - action == "cleanup-profiles" + - inventory_hostname in groups['master'] + delegate_to: "{{ groups['master'][0] }}" + run_once: true + + - name: Display cleanup output + ansible.builtin.debug: + msg: "Cleanup output:" + when: + - action == "cleanup-profiles" + - cleanup_output is defined + + - name: Show cleanup output lines + ansible.builtin.debug: + var: cleanup_output.stdout_lines + when: + - action == "cleanup-profiles" + - cleanup_output is defined + + - name: Stop all stress test processes on all nodes (stop-all action) + block: + - name: Kill all tmux sessions starting with 'stress-' + ansible.builtin.shell: + cmd: | + for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do + tmux kill-session -t "$session" + done || true + # Failsafe: kill any lingering tmux server processes for stress sessions + TMUX_PIDS_TO_KILL=$(ps aux | grep '[t]mux' | grep 'stress-' | grep -v 'grep' | grep -v 'ansible' | awk '{print $2}') + if [ -n "$TMUX_PIDS_TO_KILL" ]; then + kill -9 $TMUX_PIDS_TO_KILL >/dev/null 2>&1 || true + fi + ignore_errors: yes + changed_when: false + + - name: Kill all ytops-client and related python processes + ansible.builtin.shell: + cmd: | + # Gracefully terminate processes by pattern + ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true + ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true + + sleep 1 # Wait for graceful shutdown + + # Force kill any remaining processes + ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true + ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true + ignore_errors: yes + changed_when: false + + when: action == "stop-all" + + - name: Stop processes on targeted nodes only (stop-nodes action) + block: + - name: Kill all tmux sessions starting with 'stress-' on this node + ansible.builtin.shell: + cmd: | + for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do + tmux kill-session -t "$session" + done || true + # Failsafe: kill any lingering tmux server processes for stress sessions + TMUX_PIDS_TO_KILL=$(ps aux | grep '[t]mux' | grep 'stress-' | grep -v 'grep' | grep -v 'ansible' | awk '{print $2}') + if [ -n "$TMUX_PIDS_TO_KILL" ]; then + kill -9 $TMUX_PIDS_TO_KILL >/dev/null 2>&1 || true + fi + ignore_errors: yes + changed_when: false + + - name: Kill all ytops-client and related python processes on this node + ansible.builtin.shell: + cmd: | + # Gracefully terminate processes by pattern + ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true + ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true + + sleep 1 # Wait for graceful shutdown + + # Force kill any remaining processes + ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true + ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 
>/dev/null 2>&1 || true + ignore_errors: yes + changed_when: false + + when: action == "stop-nodes" + + - name: Restart monitoring and enforcer (restart-monitoring action) + block: + - name: Stop monitor process + ansible.builtin.include_tasks: + file: manage-processes-tasks.yml + vars: + tmux_session_name: "stress-monitor" + process_grep_pattern: "ytops-client.*profile.*list" + stop_process: true + + - name: Stop enforcer process + ansible.builtin.include_tasks: + file: manage-processes-tasks.yml + vars: + tmux_session_name: "stress-enforcer" + process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}" + stop_process: true + + - name: Start monitor process + ansible.builtin.include_tasks: + file: manage-processes-tasks.yml + vars: + tmux_session_name: "stress-monitor" + working_dir: "{{ airflow_master_dir }}" + command_to_run: > + ./bin/ytops-client profile list + --auth-env sim_auth + --download-env sim_download + --live + --no-blink + --show-reasons + process_grep_pattern: "ytops-client.*profile.*list" + start_process: true + + - name: Start enforcer process + ansible.builtin.include_tasks: + file: manage-processes-tasks.yml + vars: + tmux_session_name: "stress-enforcer" + working_dir: "{{ airflow_master_dir }}" + command_to_run: > + ./bin/ytops-client policy-enforcer + --policy {{ enforcer_policy }} + --live + process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}" + start_process: true + when: + - action == "restart-monitoring" + - inventory_hostname == groups['master'][0] diff --git a/ansible/playbook-stress-download-simulation.yml b/ansible/playbook-stress-download-simulation.yml new file mode 100644 index 0000000..19134aa --- /dev/null +++ b/ansible/playbook-stress-download-simulation.yml @@ -0,0 +1,93 @@ +--- +- name: "STRESS-SETUP: Manage download simulation" + hosts: workers + gather_facts: no + vars: + tmux_session_download: "stress-download-{{ (profile_prefix | default('default')) | replace(',', '-') }}" + download_policy: "policies/11_direct_docker_download_simulation.yaml" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Define base directory for node + ansible.builtin.set_fact: + base_dir: "{{ airflow_worker_dir }}" + + - name: Validate profile_prefix is provided when starting or stopping + ansible.builtin.fail: + msg: "profile_prefix is required when start_download=true or stop_download=true" + when: + - (start_download | default(false) | bool or stop_download | default(false) | bool) + - profile_prefix is not defined + + - name: "Display policy being used for download simulation" + ansible.builtin.debug: + msg: "Using download simulation policy: {{ download_policy }}" + when: start_download | default(false) | bool + + - name: Manage download simulation process + ansible.builtin.include_tasks: + file: manage-processes-tasks.yml + vars: + tmux_session_name: "{{ tmux_session_download }}" + working_dir: "{{ base_dir }}" + command_to_run: > + ./bin/ytops-client stress-policy + --policy {{ download_policy }} + {% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %} + {% if download_min_seconds is defined %}--set 'settings.dummy_simulation_settings.download_min_seconds={{ download_min_seconds }}'{% endif %} + {% if download_max_seconds is defined %}--set 'settings.dummy_simulation_settings.download_max_seconds={{ download_max_seconds }}'{% endif %} + {% if profile_prefix is defined %}--set 'execution_control.worker_pools=[{"profile_prefix": "{{ profile_prefix }}", "workers": 1}]'{% endif %} + {% for setting in (extra_set_args | default('[]')) | from_yaml %}--set '{{ setting }}' {% endfor %} + --profile-prefix {{ profile_prefix }} + process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ download_policy }}.*--profile-prefix {{ profile_prefix }}" + start_process: "{{ start_download | default(false) | bool }}" + stop_process: "{{ stop_download | default(false) | bool }}" + check_status: "{{ vars.check_status | default(false) | bool }}" + + - name: List active tmux sessions + ansible.builtin.shell: + cmd: tmux list-sessions 2>/dev/null || true + register: tmux_sessions + changed_when: false + + - name: Display active sessions + ansible.builtin.debug: + msg: "Active tmux sessions on {{ inventory_hostname }}: {{ tmux_sessions.stdout_lines }}" + + - name: Check tmux session output for errors + block: + - name: Wait for a moment for the process to start + ansible.builtin.pause: + seconds: 2 + + - name: Capture tmux pane content + ansible.builtin.shell: + cmd: "tmux capture-pane -p -t {{ tmux_session_download }}" + register: tmux_output + changed_when: false + ignore_errors: true + + - name: Display tmux pane content if session exists + ansible.builtin.debug: + msg: "Initial output from tmux session '{{ tmux_session_download }}':" + when: tmux_output.rc == 0 + + - name: Show output lines if session exists + ansible.builtin.debug: + var: tmux_output.stdout_lines + when: tmux_output.rc == 0 + + - name: Report if session not found + ansible.builtin.debug: + msg: "Tmux session '{{ tmux_session_download }}' was not found. It may have exited immediately upon starting." 
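+        # Illustrative note (example values, not defaults): with -e "profile_prefix=user1,user2"
+        # the tmux_session_download variable above resolves to "stress-download-user1-user2",
+        # which can be inspected manually with: tmux attach -t stress-download-user1-user2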
+ when: tmux_output.rc != 0 + + when: start_download | default(false) | bool diff --git a/ansible/playbook-stress-generate-env.yml b/ansible/playbook-stress-generate-env.yml index bf69fdb..0acdee5 100644 --- a/ansible/playbook-stress-generate-env.yml +++ b/ansible/playbook-stress-generate-env.yml @@ -3,12 +3,19 @@ hosts: all gather_facts: no vars_files: - - "group_vars/all/generated_vars.stress.yml" - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" tasks: - name: Define base directory for node ansible.builtin.set_fact: - base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}" + base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" - name: Create .env file for stress test environment ansible.builtin.template: @@ -18,15 +25,3 @@ group: "{{ deploy_group }}" mode: '0644' become: yes - - - name: Ensure REDIS_PORT is set in .env file - ansible.builtin.lineinfile: - path: "{{ base_dir }}/.env" - line: "REDIS_PORT={{ redis_port }}" - regexp: "^REDIS_PORT=" - state: present - create: yes - owner: "{{ ansible_user }}" - group: "{{ deploy_group }}" - mode: '0644' - become: yes diff --git a/ansible/playbook-stress-init-redis.yml b/ansible/playbook-stress-init-redis.yml new file mode 100644 index 0000000..e4e90f3 --- /dev/null +++ b/ansible/playbook-stress-init-redis.yml @@ -0,0 +1,61 @@ +--- +- name: "STRESS-SETUP: Initialize Redis with profiles and policies" + hosts: master + gather_facts: no + vars: + setup_policy: "policies/6_profile_setup_policy.yaml" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Check if Redis is running + ansible.builtin.shell: + cmd: "redis-cli -h {{ hostvars[groups['master'][0]].ansible_host }} -p {{ redis_port }} {% if use_redis_password | default(true) | string | lower == 'true' %}-a {{ vault_redis_password }}{% endif %} ping 2>&1 | grep -q PONG" + register: redis_check + ignore_errors: yes + changed_when: false + + - name: Ensure Redis is accessible + ansible.builtin.fail: + msg: "Redis is not accessible on master node. 
Please ensure Redis service is running on {{ hostvars[groups['master'][0]].ansible_host }}:{{ redis_port }}" + when: redis_check.rc != 0 + + - name: Stop any running ytops-client processes on master + ansible.builtin.shell: + cmd: ps aux | grep "[y]tops-client" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true + changed_when: false + + - name: "Display policy being used for Redis initialization" + ansible.builtin.debug: + msg: "Using setup policy: {{ setup_policy }}" + + - name: Initialize Redis profiles and policies + ansible.builtin.shell: + cmd: | + cd {{ airflow_master_dir }} + ./bin/ytops-client setup-profiles \ + --policy {{ setup_policy }} \ + --cleanup-all + environment: + REDIS_HOST: "{{ hostvars[groups['master'][0]].ansible_host }}" + REDIS_PORT: "{{ redis_port }}" + REDIS_PASSWORD: "{{ vault_redis_password if use_redis_password | default(true) | string | lower == 'true' else '' }}" + register: init_result + changed_when: init_result.rc == 0 + + - name: Display initialization result + ansible.builtin.debug: + msg: "Redis profile initialization completed successfully" + when: init_result.rc == 0 + + - name: Handle initialization failure + ansible.builtin.fail: + msg: "Failed to initialize Redis profiles: {{ init_result.stderr }}" + when: init_result.rc != 0 diff --git a/ansible/playbook-stress-install-deps.yml b/ansible/playbook-stress-install-deps.yml index 2e24239..663a421 100644 --- a/ansible/playbook-stress-install-deps.yml +++ b/ansible/playbook-stress-install-deps.yml @@ -3,9 +3,15 @@ hosts: all gather_facts: yes vars_files: - - "group_vars/all/generated_vars.stress.yml" - "group_vars/all/vault.yml" pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" - name: Ensure python3-pip is installed block: - name: Install prerequisites for managing repositories @@ -27,17 +33,22 @@ become: yes tasks: - - name: Install required Python packages + - name: Define base directory for node + ansible.builtin.set_fact: + base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" + + - name: Install required Python packages from requirements.txt ansible.builtin.pip: - name: - - python-dotenv - - aria2p - - tabulate - - redis - - PyYAML - - aiothrift - - PySocks - state: present + requirements: "{{ base_dir }}/ytops_client/requirements.txt" + extra_args: "--ignore-installed" + become: yes + environment: + PIP_BREAK_SYSTEM_PACKAGES: "1" + + - name: Explicitly install the thrift package + ansible.builtin.pip: + name: thrift + extra_args: "--ignore-installed" become: yes environment: PIP_BREAK_SYSTEM_PACKAGES: "1" diff --git a/ansible/playbook-stress-lifecycle.yml b/ansible/playbook-stress-lifecycle.yml new file mode 100644 index 0000000..8fdd7ee --- /dev/null +++ b/ansible/playbook-stress-lifecycle.yml @@ -0,0 +1,113 @@ +--- +- name: "STRESS-SETUP: Manage full worker lifecycle based on inventory" + hosts: workers + gather_facts: no + vars: + # Default action + action: "status" # Available actions: start, stop, status + + tasks: + - name: "Start all configured generators and simulators" + when: action == "start" + block: + - name: "Set combined profile prefixes string" + ansible.builtin.set_fact: + combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}" + when: profile_prefixes is defined and profile_prefixes | length > 0 + + - name: "Start single auth generator for all profiles: {{ combined_prefixes | default('none') }}" + ansible.builtin.command: >- + ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml + -i {{ inventory_file }} + --limit {{ inventory_hostname }} + -e "start_generator=true" + -e "profile_prefix={{ combined_prefixes }}" + {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} + {% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %} + {% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %} + {% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %} + {% if create_download_tasks is defined %}-e "create_download_tasks={{ create_download_tasks }}"{% endif %} + {% if formats_to_download is defined %}-e "formats_to_download={{ formats_to_download }}"{% endif %} + delegate_to: localhost + changed_when: true + when: profile_prefixes is defined and profile_prefixes | length > 0 + + - name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}" + ansible.builtin.command: >- + ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml + -i {{ inventory_file }} + --limit {{ inventory_hostname }} + -e "start_download=true" + -e "profile_prefix={{ combined_prefixes }}" + {% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %} + {% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %} + {% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %} + {% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %} + delegate_to: localhost + changed_when: true + when: profile_prefixes is defined and profile_prefixes | length > 0 + + - 
name: "Stop all worker generators and simulators" + when: action == "stop" + block: + - name: Kill all tmux sessions starting with 'stress-' on this worker + ansible.builtin.shell: + cmd: | + for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do + tmux kill-session -t "$session" + done || true + ignore_errors: yes + changed_when: false + + - name: Kill all ytops-client processes on this worker + ansible.builtin.shell: + cmd: | + # Gracefully terminate + ps aux | grep "[y]tops-client.*stress-policy" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true + sleep 0.5 + # Force kill + ps aux | grep "[y]tops-client.*stress-policy" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true + ignore_errors: yes + changed_when: false + + - name: "Check status of all configured generators and simulators" + when: action == "status" + block: + - name: "Set combined profile prefixes string" + ansible.builtin.set_fact: + combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}" + when: profile_prefixes is defined and profile_prefixes | length > 0 + + - name: "Check single auth generator for all profiles: {{ combined_prefixes | default('none') }}" + ansible.builtin.command: >- + ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml + -i {{ inventory_file }} + --limit {{ inventory_hostname }} + -e "check_status=true" + -e "profile_prefix={{ combined_prefixes }}" + delegate_to: localhost + changed_when: false + when: profile_prefixes is defined and profile_prefixes | length > 0 + register: auth_status_check + + - name: "Display auth generator status for {{ inventory_hostname }}" + ansible.builtin.debug: + var: auth_status_check.stdout_lines + when: auth_status_check is defined + + - name: "Check single download simulator for all profiles: {{ combined_prefixes | default('none') }}" + ansible.builtin.command: >- + ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml + -i {{ inventory_file }} + --limit {{ inventory_hostname }} + -e "check_status=true" + -e "profile_prefix={{ combined_prefixes }}" + delegate_to: localhost + changed_when: false + when: profile_prefixes is defined and profile_prefixes | length > 0 + register: download_status_check + + - name: "Display download simulator status for {{ inventory_hostname }}" + ansible.builtin.debug: + var: download_status_check.stdout_lines + when: download_status_check is defined diff --git a/ansible/playbook-stress-manage-processes.yml b/ansible/playbook-stress-manage-processes.yml new file mode 100644 index 0000000..ed5d37e --- /dev/null +++ b/ansible/playbook-stress-manage-processes.yml @@ -0,0 +1,111 @@ +--- +- name: "STRESS-SETUP: Manage tmux sessions for monitoring and enforcer" + hosts: master + gather_facts: no + vars: + tmux_session_monitor: "stress-monitor" + tmux_session_enforcer: "stress-enforcer" + enforcer_policy: "policies/8_unified_simulation_enforcer.yaml" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Manage monitor process + ansible.builtin.include_tasks: + file: manage-processes-tasks.yml + vars: + tmux_session_name: "{{ tmux_session_monitor }}" + working_dir: "{{ airflow_master_dir }}" + command_to_run: > + ./bin/ytops-client profile list + --auth-env sim_auth + --download-env sim_download + --live + --no-blink + --show-reasons + process_grep_pattern: "ytops-client.*profile.*list" + start_process: "{{ start_monitor | default(false) | bool }}" + stop_process: "{{ stop_sessions | default(false) | bool or stop_monitor | default(false) | bool }}" + check_status: "{{ vars.check_status | default(false) | bool }}" + + - name: Check monitor session output for errors + block: + - name: Wait for a moment for the process to start + ansible.builtin.pause: + seconds: 2 + - name: Capture tmux pane content + ansible.builtin.shell: + cmd: "tmux capture-pane -p -t {{ tmux_session_monitor }}" + register: tmux_output + changed_when: false + ignore_errors: true + - name: Display tmux pane content if session exists + ansible.builtin.debug: + msg: "Initial output from tmux session '{{ tmux_session_monitor }}':" + when: tmux_output.rc == 0 + - name: Show output lines if session exists + ansible.builtin.debug: + var: tmux_output.stdout_lines + when: tmux_output.rc == 0 + - name: Report if session not found + ansible.builtin.debug: + msg: "Tmux session '{{ tmux_session_monitor }}' was not found." + when: tmux_output.rc != 0 + when: start_monitor | default(false) | bool + + - name: Manage enforcer process + ansible.builtin.include_tasks: + file: manage-processes-tasks.yml + vars: + tmux_session_name: "{{ tmux_session_enforcer }}" + working_dir: "{{ airflow_master_dir }}" + command_to_run: > + ./bin/ytops-client policy-enforcer + --policy {{ enforcer_policy }} + --live + process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}" + start_process: "{{ start_enforcer | default(false) | bool }}" + stop_process: "{{ stop_sessions | default(false) | bool or stop_enforcer | default(false) | bool }}" + check_status: "{{ vars.check_status | default(false) | bool }}" + + - name: Check enforcer session output for errors + block: + - name: Wait for a moment for the process to start + ansible.builtin.pause: + seconds: 2 + - name: Capture tmux pane content + ansible.builtin.shell: + cmd: "tmux capture-pane -p -t {{ tmux_session_enforcer }}" + register: tmux_output + changed_when: false + ignore_errors: true + - name: Display tmux pane content if session exists + ansible.builtin.debug: + msg: "Initial output from tmux session '{{ tmux_session_enforcer }}':" + when: tmux_output.rc == 0 + - name: Show output lines if session exists + ansible.builtin.debug: + var: tmux_output.stdout_lines + when: tmux_output.rc == 0 + - name: Report if session not found + ansible.builtin.debug: + msg: "Tmux session '{{ tmux_session_enforcer }}' was not found." 
+ when: tmux_output.rc != 0 + when: start_enforcer | default(false) | bool + + - name: List active tmux sessions + ansible.builtin.shell: + cmd: tmux list-sessions 2>/dev/null || true + register: tmux_sessions + changed_when: false + + - name: Display active sessions + ansible.builtin.debug: + msg: "Active tmux sessions: {{ tmux_sessions.stdout_lines }}" diff --git a/ansible/playbook-stress-sync-code.yml b/ansible/playbook-stress-sync-code.yml index 8ddf940..90e0b6e 100644 --- a/ansible/playbook-stress-sync-code.yml +++ b/ansible/playbook-stress-sync-code.yml @@ -2,13 +2,22 @@ - name: "STRESS-SETUP: Sync Local Code" hosts: all gather_facts: no + vars: + ytops_source_dir: "{{ playbook_dir }}/../ytops_client-source" vars_files: - - "group_vars/all/generated_vars.stress.yml" - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml" tasks: - name: Define base directory for node ansible.builtin.set_fact: - base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}" + base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" - name: Ensure base directory exists for code sync ansible.builtin.file: @@ -21,7 +30,7 @@ - name: Sync python packages and directories for stress testing ansible.posix.synchronize: - src: "../ytops_client-source/{{ item }}/" + src: "{{ ytops_source_dir }}/{{ item }}/" dest: "{{ base_dir }}/{{ item }}/" rsync_opts: - "--delete" @@ -42,7 +51,7 @@ - name: Sync client utility scripts and configs ansible.posix.synchronize: - src: "../ytops_client-source/{{ item }}" + src: "{{ ytops_source_dir }}/{{ item }}" dest: "{{ base_dir }}/{{ item }}" perms: yes loop: @@ -56,3 +65,4 @@ - "ytdlp.json" become: yes become_user: "{{ ansible_user }}" + diff --git a/ansible/playbook-stress-sync-policies.yml b/ansible/playbook-stress-sync-policies.yml new file mode 100644 index 0000000..6ffa7d5 --- /dev/null +++ b/ansible/playbook-stress-sync-policies.yml @@ -0,0 +1,58 @@ +--- +- name: "STRESS-SETUP: Sync Policies and CLI Configs" + hosts: all + gather_facts: no + vars: + ytops_source_dir: "{{ playbook_dir }}/../ytops_client-source" + vars_files: + - "group_vars/all/vault.yml" + pre_tasks: + - name: Set inventory_env fact + ansible.builtin.set_fact: + inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}" + - name: Load environment-specific variables + ansible.builtin.include_vars: "{{ item }}" + with_fileglob: + - "group_vars/all/generated_vars{{ '.' 
+ inventory_env if inventory_env else '' }}.yml" + tasks: + - name: Define base directory for node + ansible.builtin.set_fact: + base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}" + + - name: Ensure policies directory exists + ansible.builtin.file: + path: "{{ base_dir }}/policies" + state: directory + owner: "{{ ansible_user }}" + group: "{{ deploy_group }}" + mode: '0755' + become: yes + + - name: Sync policies directory only + ansible.posix.synchronize: + src: "{{ ytops_source_dir }}/policies/" + dest: "{{ base_dir }}/policies/" + rsync_opts: + - "--delete" + - "--exclude=.DS_Store" + - "--exclude=__pycache__" + - "--exclude='*.pyc'" + recursive: yes + perms: yes + become: yes + become_user: "{{ ansible_user }}" + + - name: Sync client CLI config files + ansible.posix.synchronize: + src: "{{ ytops_source_dir }}/{{ item }}" + dest: "{{ base_dir }}/{{ item }}" + perms: yes + loop: + - "cli.auth.config" + - "cli.download.config" + become: yes + become_user: "{{ ansible_user }}" + + - name: Display sync completion + ansible.builtin.debug: + msg: "Policies and CLI configs synced to {{ base_dir }}" diff --git a/ansible/templates/.env.stress.j2 b/ansible/templates/.env.stress.j2 index 39af3d8..7fa113e 100644 --- a/ansible/templates/.env.stress.j2 +++ b/ansible/templates/.env.stress.j2 @@ -1,8 +1,16 @@ # This file is managed by Ansible for the stress test environment. # --- Network Settings --- -REDIS_HOST={{ hostvars[groups['airflow_master'][0]].ansible_host }} +REDIS_HOST={{ hostvars[groups['master'][0]].ansible_host }} +REDIS_PORT={{ vault_redis_port }} REDIS_PASSWORD={{ vault_redis_password }} +# --- S3 Storage Configuration --- +S3_ACCESS_KEY={{ vault_s3_delivery_access_key_id }} +S3_SECRET_KEY={{ vault_s3_delivery_secret_access_key }} +S3_ENDPOINT={{ vault_s3_delivery_endpoint }} +S3_BUCKET={{ vault_s3_delivery_bucket }} +AWS_REGION={{ vault_s3_delivery_aws_region }} + # --- Account Manager Configuration --- ACCOUNT_ACTIVE_DURATION_MIN=7 ACCOUNT_COOLDOWN_DURATION_MIN=30 diff --git a/ansible/templates/docker-compose.stress-master.j2 b/ansible/templates/docker-compose.stress-master.j2 index 812bcc3..1d3ef32 100644 --- a/ansible/templates/docker-compose.stress-master.j2 +++ b/ansible/templates/docker-compose.stress-master.j2 @@ -1,6 +1,7 @@ # Template for stress test master services name: "stress-services" services: +{% if inventory_hostname in groups['master'] %} redis: image: redis:7-alpine container_name: stress-redis @@ -26,7 +27,7 @@ services: command: server /data --console-address ":9001" networks: - {{ docker_network_name }} - +{% endif %} bgutil-provider: image: brainicism/bgutil-ytdlp-pot-provider container_name: bgutil-provider diff --git a/ansible/templates/shadowsocks-compose.yml.j2 b/ansible/templates/shadowsocks-compose.yml.j2 index 6ae640a..f217aca 100644 --- a/ansible/templates/shadowsocks-compose.yml.j2 +++ b/ansible/templates/shadowsocks-compose.yml.j2 @@ -11,7 +11,6 @@ services: volumes: - ./config_ssp_{{ proxy_config.local_port }}/:/etc/shadowsocks-rust/:ro networks: - - default - {{ docker_network_name }} {% endfor %} diff --git a/tools/generate-inventory.py b/tools/generate-inventory.py index c1d9da1..9ad8351 100755 --- a/tools/generate-inventory.py +++ b/tools/generate-inventory.py @@ -19,7 +19,7 @@ def generate_inventory(cluster_config, inventory_path): f.write("# Edit cluster.yml and re-run the generator instead.\n\n") # Master group - f.write("[airflow_master]\n") + f.write("[master]\n") for hostname, config in 
cluster_config.get('master', {}).items(): line = f"{hostname} ansible_host={config['ip']}" if 'port' in config: @@ -29,7 +29,7 @@ def generate_inventory(cluster_config, inventory_path): f.write("\n") # Workers group (handles case where workers are not defined) - f.write("[airflow_workers]\n") + f.write("[workers]\n") for hostname, config in cluster_config.get('workers', {}).items(): line = f"{hostname} ansible_host={config['ip']}" if 'port' in config: @@ -46,6 +46,11 @@ def generate_host_vars(cluster_config, host_vars_dir): print("Error: 'master' section is missing or empty in cluster config. Cannot proceed.") sys.exit(1) master_ip = list(master_nodes.values())[0]['ip'] + + # Get global vars for aliases + global_vars = cluster_config.get('global_vars', {}) + airflow_master_dir = global_vars.get('airflow_master_dir') + airflow_worker_dir = global_vars.get('airflow_worker_dir') # Get global proxy definitions shadowsocks_proxies = cluster_config.get('shadowsocks_proxies', {}) @@ -58,6 +63,8 @@ def generate_host_vars(cluster_config, host_vars_dir): # Per-node list of proxies to USE worker_proxies = config.get('proxies', []) + profile_prefixes = config.get('profile_prefixes', []) + cleanup_settings = config.get('cleanup_settings') with open(host_vars_file, 'w') as f: f.write("---\n") @@ -65,6 +72,13 @@ def generate_host_vars(cluster_config, host_vars_dir): f.write(f"master_host_ip: {master_ip}\n") f.write("redis_port: 52909\n") + # Add node-specific directory aliases for template compatibility + # The master path is needed by all nodes for the .env template. + if airflow_master_dir: + f.write(f"airflow_master: \"{airflow_master_dir}\"\n") + if hostname in cluster_config.get('workers', {}) and airflow_worker_dir: + f.write(f"airflow_dl_worker: \"{airflow_worker_dir}\"\n") + # Write the global proxy definitions for deployment if shadowsocks_proxies: f.write("shadowsocks_proxies:\n") @@ -81,6 +95,22 @@ def generate_host_vars(cluster_config, host_vars_dir): for proxy in worker_proxies: f.write(f" - \"{proxy}\"\n") + # Write worker-specific profile prefixes + if profile_prefixes: + f.write("profile_prefixes:\n") + for prefix in profile_prefixes: + f.write(f" - \"{prefix}\"\n") + + # Write worker-specific cleanup settings (overrides global) + if cleanup_settings: + f.write("cleanup_settings:\n") + if 'enabled' in cleanup_settings: + f.write(f" enabled: {str(cleanup_settings['enabled']).lower()}\n") + if 'mode' in cleanup_settings: + f.write(f" mode: \"{cleanup_settings['mode']}\"\n") + if 'max_age_seconds' in cleanup_settings: + f.write(f" max_age_seconds: {cleanup_settings['max_age_seconds']}\n") + def generate_group_vars(cluster_config, group_vars_path): """Generate group-level variables""" # Create parent directory if it doesn't exist @@ -107,11 +137,11 @@ def generate_group_vars(cluster_config, group_vars_path): generated_data = { 'master_host_ip': master_ip, 'redis_port': 52909, - 'external_access_ips': external_ips if external_ips else [], - 'hostvars': all_nodes + 'external_access_ips': external_ips if external_ips else [] } generated_data.update(global_vars) + with open(group_vars_path, 'w') as f: f.write("---\n") f.write("# This file is auto-generated by tools/generate-inventory.py\n") diff --git a/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml b/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml index 3cd6848..642466c 100644 --- a/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml +++ 
b/ytops_client-source/policies/10_direct_docker_auth_simulation.yaml @@ -17,7 +17,8 @@ settings: urls_file: "inputfiles/urls.rt300.txt" # The save directory MUST be inside the docker_host_mount_path for the download # simulation to be able to find the files. - save_info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation" + # NOTE: This path is expected to be on an s3fs mount for cross-host communication. + save_info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation" # Settings for controlling the behavior of dummy/simulation modes. # These values can be overridden at runtime with the --set flag. @@ -88,6 +89,7 @@ direct_docker_cli_policy: docker_container_mount_path: "/config" # The mount point inside the container # Host path for persisting cache data (e.g., cookies, sigfuncs) between runs. + # NOTE: This path should be on a fast, local disk, NOT on s3fs. docker_host_cache_path: ".cache/direct_docker_simulation" # Path inside the container where the cache is mounted. Should match HOME/.cache docker_container_cache_path: "/config/.cache" diff --git a/ytops_client-source/policies/11_direct_docker_download_simulation.yaml b/ytops_client-source/policies/11_direct_docker_download_simulation.yaml index db5665e..bfc6b9b 100644 --- a/ytops_client-source/policies/11_direct_docker_download_simulation.yaml +++ b/ytops_client-source/policies/11_direct_docker_download_simulation.yaml @@ -15,7 +15,8 @@ settings: # This directory should contain info.json files generated by an auth simulation, # like `10_direct_docker_auth_simulation`. # It MUST be inside the docker_host_mount_path. - info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation" + # NOTE: This path is expected to be on an s3fs mount for cross-host communication. + info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation" #info_json_dir: "run/docker_mount/download_tasks" # Regex to extract the profile name from a task filename. The first capture # group is used. This is crucial for the task-first locking strategy. @@ -30,6 +31,8 @@ execution_control: workers: 1 - profile_prefix: "user2" workers: 1 + - profile_prefix: "user3" + workers: 1 # How long a worker should pause if it cannot find an available profile or task. worker_polling_interval_seconds: 1 @@ -86,6 +89,7 @@ direct_docker_cli_policy: docker_container_mount_path: "/config" # Path on the HOST where downloaded files will be saved. + # NOTE: This path should be on a fast, local disk, NOT on s3fs. docker_host_download_path: "downloaded_media/direct_docker_simulation" # Path inside the CONTAINER where `docker_host_download_path` is mounted. docker_container_download_path: "/downloads" diff --git a/ytops_client-source/policies/12_queue_auth_simulation.yaml b/ytops_client-source/policies/12_queue_auth_simulation.yaml index 50f2a8f..e9b4c1b 100644 --- a/ytops_client-source/policies/12_queue_auth_simulation.yaml +++ b/ytops_client-source/policies/12_queue_auth_simulation.yaml @@ -1,27 +1,82 @@ -# Policy: Queue-based Authentication Simulation via Direct Docker Exec +# Policy: Queue-based Authentication Simulation # -# This policy simulates a continuous stream of info.json fetch requests using -# the 'direct_docker_cli' mode. It pulls URLs from a Redis queue, creates a -# temporary batch file, and then calls a yt-dlp command inside a running -# Docker container. +# This policy simulates a continuous stream of info.json fetch requests. 
It pulls +# URLs from a Redis queue and processes them, acting as the first stage in a +# two-stage simulation. The second stage (downloading) can be handled in one +# of two ways, configured below: +# +# --- WORKFLOW 1: Queue-Auth -> File-Download --- +# - This policy creates info.json files in a shared directory (`save_info_json_dir`). +# - A separate download simulation (e.g., policy 11_direct_docker_download_simulation.yaml) +# watches that directory, picks up the files, and performs the downloads. +# - To enable: +# - Set `create_download_tasks: false` +# - Ensure `save_info_json_dir` points to a shared path. +# +# --- WORKFLOW 2: Queue-Auth -> Queue-Download --- +# - This policy creates download *tasks* and pushes them to another Redis queue. +# - A separate download simulation (e.g., policy 13_queue_download_simulation.yaml) +# pulls tasks from that queue and performs the downloads. +# - To enable: +# - Set `create_download_tasks: true` +# - Configure `download_task_queue` to the correct queue name. +# - Use `download_task_granularity` to control if one task is created per-URL +# or per-format. # name: 12_queue_auth_simulation settings: mode: fetch_only - orchestration_mode: direct_docker_cli + orchestration_mode: queue_auth profile_mode: from_pool_with_lock - # The save directory MUST be inside the docker_host_mount_path. - save_info_json_dir: "run/docker_mount/fetched_info_jsons/queue_simulation" + # For Queue-Auth -> File-Download workflow: Directory to save generated info.json files. + # A file-based download worker (e.g., policy 11) will watch this directory. + # This directory MUST be inside the docker_host_mount_path. + # NOTE: This path is expected to be on an s3fs mount for cross-host communication. + save_info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation" execution_control: - workers: 1 + # Define worker pools for multiple user groups + worker_pools: + - profile_prefix: "user1" + workers: 1 + - profile_prefix: "user2" + workers: 1 + - profile_prefix: "user3" + workers: 1 # How long a worker should pause if it cannot find an available profile to lock. worker_polling_interval_seconds: 1 # No sleep between tasks; throughput is controlled by yt-dlp performance and profile availability. info_json_generation_policy: - profile_prefix: "user1" + # This setting tells the auth worker how many download tasks will be generated + # per successful info.json. It is used to correctly increment the + # 'pending_downloads' counter on the auth profile. + # Can be an integer, or 'from_download_policy' to automatically count formats + # from the 'download_policy.formats' setting in this same policy file. + downloads_per_url: "from_download_policy" + # (For Queue-Download workflow) Controls how download tasks are created. + # + # "per_format": (Default) Creates one download task for EACH format specified in 'formats_to_download'. + # If `formats_to_download` is "140,299", two download tasks are created, and + # the 'pending_downloads' counter is incremented by 2. + # + # "per_url": Creates a SINGLE download task for the entire URL. The 'formats_to_download' + # string is passed to the download worker as the format selector, but 'pending_downloads' + # is only incremented by 1 for the whole URL. 
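+  #
+  # A compact illustration (hypothetical format IDs): with formats_to_download: "140,299"
+  #   per_format -> two download tasks, one per format; 'pending_downloads' += 2
+  #   per_url    -> one download task with selector "140,299"; 'pending_downloads' += 1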
+ # + # --- Current Setting --- + download_task_granularity: "per_format" + # --- Alternative Setting (commented out) --- + # download_task_granularity: "per_url" + # profile_prefix is now defined per-pool in execution_control.worker_pools + # However, for queue auth mode, we need a fallback prefix + profile_prefix: "user" + +# This section is needed for the 'downloads_per_url: from_download_policy' setting. +# It should mirror the formats being used by the download simulation. +download_policy: + formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140" direct_docker_cli_policy: # Which simulation environment's profiles to use for locking. @@ -50,6 +105,7 @@ direct_docker_cli_policy: docker_container_mount_path: "/config" # The mount point inside the container # Host path for persisting cache data (e.g., cookies, sigfuncs) between runs. + # NOTE: This path should be on a fast, local disk, NOT on s3fs. docker_host_cache_path: ".cache/queue_auth_simulation" # Path inside the container where the cache is mounted. Should match HOME/.cache docker_container_cache_path: "/config/.cache" @@ -109,18 +165,47 @@ direct_docker_cli_policy: # Template for renaming the final info.json. rename_file_template: "{video_id}-{profile_name}-{proxy}.info.json" +# Settings for controlling the behavior of dummy/simulation modes. +# These values can be overridden at runtime with the --set flag. +dummy_simulation_settings: + # Timings for dummy auth simulation (per-URL delay in a batch) + auth_min_seconds: 0.1 + auth_max_seconds: 0.5 + auth_failure_rate: 0.0 + auth_skipped_failure_rate: 0.0 + # Timings for dummy download simulation (per-format download time) + download_min_seconds: 1.0 + download_max_seconds: 3.0 + download_failure_rate: 0.0 + download_skipped_failure_rate: 0.0 + queue_policy: # Set to false to use legacy, unprefixed queue names (e.g., 'queue2_auth_inbox'). # Set to true (or omit) to use environment-prefixed names (e.g., 'sim_auth_queue2_auth_inbox'). use_env_prefix: false - # If specified, create download tasks for these formats - # Can be "all", a specific format ID, or a list of format IDs - formats_to_download: "140-dashy/140-dashy-0/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy" + # Queue to pull URLs from + input_queue: "queue2_auth_inbox" + + # --- Download Handoff Configuration --- + # Set to 'true' for Queue-Auth -> Queue-Download workflow. + # Set to 'false' for Queue-Auth -> File-Download workflow. + create_download_tasks: false + + # Queue to push download tasks to (if create_download_tasks is true) + download_task_queue: "queue2_dl_inbox" # How many tasks a worker should pull from the queue at once. # This will become the batch size for the docker run. 
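+  # Worked example for the settings in this file (illustrative, not a guarantee of runtime
+  # behaviour): download_policy.formats defines two comma-separated selector groups, so with
+  # download_task_granularity "per_format" each URL yields two download tasks; a batch of
+  # 5 URLs therefore enqueues about 10 tasks and bumps 'pending_downloads' by 2 per URL.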
- batch_size: 25 + batch_size: 5 + + # If specified, create download tasks for these formats + # Can be "all", a specific format ID, or a list of format IDs + # Defaults to the formats in download_policy.formats + # Example: formats_to_download: "140-dashy,299-dashy" + # Example: formats_to_download: "all" + # Example: formats_to_download: ["140-dashy", "299-dashy"] + formats_to_download: "from_download_policy" simulation_parameters: auth_env: "sim_auth" diff --git a/ytops_client-source/policies/13_queue_download_simulation.yaml b/ytops_client-source/policies/13_queue_download_simulation.yaml index 7397d56..8287ed2 100644 --- a/ytops_client-source/policies/13_queue_download_simulation.yaml +++ b/ytops_client-source/policies/13_queue_download_simulation.yaml @@ -1,16 +1,22 @@ -# Policy: Queue-based Download Simulation via Direct Docker Exec +# Policy: Queue-based Download Simulation # -# This policy simulates a continuous stream of downloads using the -# 'direct_docker_cli' mode with `mode: download_only`. It pulls download -# tasks from a Redis queue, each containing a path to an info.json file, -# and invokes a yt-dlp command inside a running Docker container to perform -# the download. +# This policy simulates a continuous stream of downloads. It pulls download tasks +# from a Redis queue, where each task typically contains a path to an info.json +# file and a format to download. +# +# This policy is designed to be the *second stage* of a two-stage simulation, +# consuming tasks produced by an authentication simulation like: +# - `12_queue_auth_simulation.yaml` (when configured for Queue-Download workflow) +# +# It does not matter to this policy whether the auth stage created tasks per-URL +# or per-format; this worker will simply process whatever task it receives from +# the `input_queue`. # name: 13_queue_download_simulation settings: mode: download_only - orchestration_mode: direct_docker_cli + orchestration_mode: queue_download profile_mode: from_pool_with_lock # In queue mode, info_json_dir is not used to find tasks. # However, the paths inside the download tasks must be accessible @@ -19,12 +25,19 @@ settings: # can be specified in the download task. execution_control: - workers: 4 + # Define worker pools for multiple user groups + worker_pools: + - profile_prefix: "user1" + workers: 1 + - profile_prefix: "user2" + workers: 1 + - profile_prefix: "user3" + workers: 1 # How long a worker should pause if it cannot find an available profile or task. worker_polling_interval_seconds: 1 download_policy: - profile_prefix: "user1" + # profile_prefix is now defined per-pool in execution_control.worker_pools # Default cooldown in seconds if not specified by the enforcer in Redis. # The value from Redis (set via `unlock_cooldown_seconds` in the enforcer policy) # will always take precedence. This is a fallback. @@ -65,6 +78,7 @@ direct_docker_cli_policy: docker_container_mount_path: "/config" # Path on the HOST where downloaded files will be saved. + # NOTE: This path should be on a fast, local disk, NOT on s3fs. docker_host_download_path: "downloaded_media/queue_downloads" # Path inside the CONTAINER where `docker_host_download_path` is mounted. docker_container_download_path: "/downloads" @@ -95,11 +109,36 @@ direct_docker_cli_policy: - "Invalid data found when processing input" - "Error opening input files" +# Settings for controlling the behavior of dummy/simulation modes. +# These values can be overridden at runtime with the --set flag. 
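+# A typical override, mirroring how the stress playbooks in this change pass these keys
+# (illustrative only; the exact key path depends on where this block sits in the merged
+# policy, e.g. it may need a 'settings.' prefix):
+#   ./bin/ytops-client stress-policy --policy policies/13_queue_download_simulation.yaml \
+#     --set 'dummy_simulation_settings.download_max_seconds=5.0'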
+dummy_simulation_settings: + # Timings for dummy download simulation (per-format download time) + download_min_seconds: 1.0 + download_max_seconds: 3.0 + download_failure_rate: 0.0 + download_skipped_failure_rate: 0.0 + queue_policy: # Set to false to use legacy, unprefixed queue names (e.g., 'queue2_dl_inbox'). # Set to true (or omit) to use environment-prefixed names (e.g., 'sim_download_queue2_dl_inbox'). use_env_prefix: false + # Queue to pull download tasks from + input_queue: "queue2_dl_inbox" + + # Whether to report completion back to a queue + # Can be true (report all), false (report none), or "success_only"/"failure_only" + report_completion: true + + # Queue to report completion to + completion_queue: "queue2_dl_completed" + + # Queue to report failures to (always reported regardless of report_completion) + failure_queue: "queue2_dl_fail" + + # Queue to report skipped tasks to + skipped_queue: "queue2_dl_skipped" + # How many tasks to process in a batch. For downloads, this should be 1, # as each worker locks a profile for a single download task. batch_size: 1 diff --git a/ytops_client-source/policies/6_profile_setup_policy.yaml b/ytops_client-source/policies/6_profile_setup_policy.yaml index 03320db..cdcbd6b 100644 --- a/ytops_client-source/policies/6_profile_setup_policy.yaml +++ b/ytops_client-source/policies/6_profile_setup_policy.yaml @@ -13,13 +13,13 @@ auth_profile_setup: cleanup_before_run: true pools: - prefix: "user1" - proxy: "sslocal-rust-1092:1092" + proxy: "sslocal-rust-1088:1088" count: 3 - prefix: "user2" - proxy: "sslocal-rust-1092:1092" + proxy: "sslocal-rust-1085:1085" count: 3 - prefix: "user3" - proxy: "sslocal-rust-1092:1092" + proxy: "sslocal-rust-1084:1084" count: 3 # --- Profile setup for the DOWNLOAD simulation --- @@ -28,11 +28,11 @@ download_profile_setup: cleanup_before_run: true pools: - prefix: "user1" - proxy: "sslocal-rust-1092:1092" + proxy: "sslocal-rust-1088:1088" count: 3 - prefix: "user2" - proxy: "sslocal-rust-1092:1092" + proxy: "sslocal-rust-1085:1085" count: 3 - prefix: "user3" - proxy: "sslocal-rust-1092:1092" + proxy: "sslocal-rust-1084:1084" count: 3 diff --git a/ytops_client-source/ytops_client/download_aria_tool.py b/ytops_client-source/ytops_client/download_aria_tool.py index df36037..8803854 100644 --- a/ytops_client-source/ytops_client/download_aria_tool.py +++ b/ytops_client-source/ytops_client/download_aria_tool.py @@ -16,13 +16,6 @@ import threading import time from urllib.parse import urljoin -try: - import aria2p - from aria2p.utils import human_readable_bytes - import yt_dlp -except ImportError: - print("aria2p or yt-dlp is not installed. Please install them with: pip install aria2p yt-dlp", file=sys.stderr) - sys.exit(1) logger = logging.getLogger('download_aria_tool') @@ -173,6 +166,14 @@ def parse_aria_args_to_options(args_str): def main_download_aria(args): """Main logic for the 'download-aria' command.""" + try: + import aria2p + from aria2p.utils import human_readable_bytes + import yt_dlp + except ImportError: + print("aria2p or yt-dlp is not installed. Please install them with: pip install aria2p yt-dlp", file=sys.stderr) + return 1 + log_level = logging.DEBUG if args.verbose else logging.INFO # Reconfigure root logger to ensure our settings are applied. 
for handler in logging.root.handlers[:]: @@ -405,6 +406,7 @@ def main_download_aria(args): def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=None): """Handle downloading a single URL with aria2c.""" + import aria2p if remote_dir: aria_options['dir'] = remote_dir logger.info(f"Adding download for format '{args.format}' with URL: {url[:70]}...") @@ -532,6 +534,7 @@ def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, r def download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=None): """Handle downloading fragmented formats with aria2c.""" + import aria2p logger.info(f"Format '{args.format}' is fragmented. Adding all fragments to download queue.") fragment_base_url = target_format.get('fragment_base_url') fragments = target_format['fragments'] diff --git a/ytops_client-source/ytops_client/download_native_py_tool.py b/ytops_client-source/ytops_client/download_native_py_tool.py index 3c519cd..035ca16 100644 --- a/ytops_client-source/ytops_client/download_native_py_tool.py +++ b/ytops_client-source/ytops_client/download_native_py_tool.py @@ -16,12 +16,6 @@ import sys import time from datetime import datetime -try: - import yt_dlp - from yt_dlp.utils import match_filter_func -except ImportError: - print("yt-dlp is not installed. Please install it with: pip install yt-dlp", file=sys.stderr) - sys.exit(1) logger = logging.getLogger('download_native_py_tool') @@ -110,6 +104,8 @@ def _download_single_format(format_id, info_data, base_ydl_opts, args): Returns a tuple: (success: bool, ytdlp_logger: YTDLPLogger) """ + import yt_dlp + # Deep copy info_data so we can modify it without affecting other downloads local_info_data = copy.deepcopy(info_data) @@ -178,6 +174,13 @@ def _download_single_format(format_id, info_data, base_ydl_opts, args): def main_download_native_py(args): """Main logic for the 'download-native-py' command.""" + try: + import yt_dlp + from yt_dlp.utils import match_filter_func + except ImportError: + print("yt-dlp is not installed. Please install it with: pip install yt-dlp", file=sys.stderr) + return 1 + # All logging should go to stderr to keep stdout clean for the final filename, or for binary data with --output-buffer. log_stream = sys.stderr log_level = logging.DEBUG if args.verbose else logging.INFO diff --git a/ytops_client-source/ytops_client/policy_enforcer_tool.py b/ytops_client-source/ytops_client/policy_enforcer_tool.py index e7f8f89..238a3a8 100644 --- a/ytops_client-source/ytops_client/policy_enforcer_tool.py +++ b/ytops_client-source/ytops_client/policy_enforcer_tool.py @@ -44,6 +44,7 @@ class PolicyEnforcer: self.manager = manager self.dry_run = dry_run self.actions_taken_this_cycle = 0 + self._last_wait_log_message = "" PROXY_REST_REASON = "Proxy resting" @@ -248,7 +249,19 @@ class PolicyEnforcer: # This prevents a deadlock if all groups are just 'Working' but none are in the 'waiting_downloads' state yet. if not is_any_group_idle and groups_currently_waiting: is_system_blocked_by_downloads = True - logger.info(f"System is waiting for downloads to finish in groups: {groups_currently_waiting}. No new profiles will be activated until a group is free.") + log_message = f"System is waiting for downloads to finish in groups: {groups_currently_waiting}. No new profiles will be activated until a group is free." 
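+                    # Log the full message only when it changes between enforcement cycles;
+                    # while the same groups remain blocked, emit a single '.' per cycle so the
+                    # log is not flooded. The else branch below resets _last_wait_log_message
+                    # once the system is no longer blocked by downloads.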
+ if log_message != self._last_wait_log_message: + if self._last_wait_log_message: + print(file=sys.stderr) # Newline if we were printing dots + logger.info(log_message) + self._last_wait_log_message = log_message + else: + print(".", end="", file=sys.stderr, flush=True) + else: + # If we are no longer blocked, reset the message tracker + if self._last_wait_log_message: + print(file=sys.stderr) # Newline to clean up after dots + self._last_wait_log_message = "" if is_system_blocked_by_downloads: # When blocked, we only want to consider profiles that are in the 'waiting_downloads' state, @@ -328,6 +341,7 @@ class PolicyEnforcer: # The final list to check is the sorted ready profiles, followed by the not-ready ones. not_ready_profiles.sort(key=lambda p: (p.get('rest_until', 0), natural_sort_key(p.get('name', '')))) profiles_to_check = sorted_ready_profiles + not_ready_profiles + logger.debug(f"Activation candidates for 'least_loaded' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}") else: # Default 'longest_idle' sort if strategy not in ['longest_idle']: @@ -352,6 +366,7 @@ class PolicyEnforcer: # The final list to check will process all ready profiles first, then wait for the not-ready ones. profiles_to_check = ready_profiles + not_ready_profiles + logger.debug(f"Activation candidates for 'longest_idle' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}") # --- End New Sorting Logic --- # --- New logic: Identify groups with waiting profiles --- @@ -382,7 +397,7 @@ class PolicyEnforcer: group_name = profile_to_group_map.get(profile_name) # --- New check to prevent activating profiles from a waiting group --- - if group_name in waiting_group_names and profile.get('rest_reason') != 'waiting_downloads': + if group_name in waiting_group_names: logger.debug(f"Profile '{profile_name}' activation deferred because its group '{group_name}' is waiting for downloads to complete.") continue # --- End new logic --- diff --git a/ytops_client-source/ytops_client/profile_manager_tool.py b/ytops_client-source/ytops_client/profile_manager_tool.py index 2b9eea0..96d1687 100644 --- a/ytops_client-source/ytops_client/profile_manager_tool.py +++ b/ytops_client-source/ytops_client/profile_manager_tool.py @@ -102,9 +102,22 @@ class ProfileManager: self.redis.ping() logger.info(f"Successfully connected to Redis.") logger.info(f"Using key prefix: {key_prefix}") + self._last_lock_warning = "" except redis.exceptions.ConnectionError as e: logger.error(f"Failed to connect to Redis at {redis_host}:{redis_port}: {e}") sys.exit(1) + + def _log_lock_warning(self, message: str): + """Logs a lock-related warning, printing dots for repeated messages.""" + if message == self._last_lock_warning: + # Use print to avoid logger formatting and newlines + print(".", end="", file=sys.stderr, flush=True) + else: + # If we were printing dots, start a new line before the new message + if self._last_lock_warning: + print(file=sys.stderr) + logger.warning(message) + self._last_lock_warning = message def _profile_key(self, profile_name: str) -> str: """Get Redis key for a profile.""" @@ -288,11 +301,13 @@ class ProfileManager: 'tolerated_error_count': '0', 'download_count': '0', 'download_error_count': '0', + 'predicted_download_count': '0', 'global_success_count': '0', 'global_failure_count': '0', 'global_tolerated_error_count': '0', 'global_download_count': '0', 'global_download_error_count': '0', + 'global_predicted_download_count': '0', 'lock_timestamp': '0', 'lock_owner': '', 'rest_until': '0', @@ 
-328,10 +343,10 @@ class ProfileManager: # Convert numeric fields numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count', - 'tolerated_error_count', 'download_count', 'download_error_count', + 'tolerated_error_count', 'download_count', 'download_error_count', 'predicted_download_count', 'global_success_count', 'global_failure_count', 'global_tolerated_error_count', 'global_download_count', - 'global_download_error_count', + 'global_download_error_count', 'global_predicted_download_count', 'lock_timestamp', 'rest_until', 'last_rest_timestamp', 'wait_started_at'] for field in numeric_fields: if field in data: @@ -388,10 +403,10 @@ class ProfileManager: # --- End batch fetch --- numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count', - 'tolerated_error_count', 'download_count', 'download_error_count', + 'tolerated_error_count', 'download_count', 'download_error_count', 'predicted_download_count', 'global_success_count', 'global_failure_count', 'global_tolerated_error_count', 'global_download_count', - 'global_download_error_count', + 'global_download_error_count', 'global_predicted_download_count', 'lock_timestamp', 'rest_until', 'last_rest_timestamp', 'wait_started_at'] for i, data in enumerate(all_profile_data): @@ -527,9 +542,13 @@ class ProfileManager: logger.info(f"Deleted {total_deleted} key(s).") return total_deleted - def record_activity(self, name: str, activity_type: str, - timestamp: Optional[float] = None) -> bool: - """Record activity (success/failure) for a profile.""" + def record_activity(self, name: str, activity_type: str, + timestamp: Optional[float] = None, is_dummy: bool = False) -> bool: + """ + Record activity (success/failure) for a profile. + If is_dummy is True, the activity will NOT be recorded for the associated proxy, + preventing dummy failures from triggering proxy-level enforcer actions. + """ if activity_type not in ['success', 'failure', 'tolerated_error', 'download', 'download_error']: logger.error(f"Invalid activity type: {activity_type}") return False @@ -558,20 +577,50 @@ class ProfileManager: # Keep only last 1000 activities to prevent unbounded growth self.redis.zremrangebyrank(activity_key, 0, -1001) - # Also record activity for the proxy - proxy_url = profile.get('proxy') - if proxy_url: - proxy_activity_key = self._proxy_activity_key(proxy_url, activity_type) - pipe = self.redis.pipeline() - pipe.zadd(proxy_activity_key, {str(ts): ts}) - # Keep last 5000 activities per proxy (higher limit) - pipe.zremrangebyrank(proxy_activity_key, 0, -5001) - pipe.execute() - logger.debug(f"Recorded {activity_type} for proxy '{proxy_url}'") + # Also record activity for the proxy, BUT NOT for dummy activities. 
+ if not is_dummy: + proxy_url = profile.get('proxy') + if proxy_url: + proxy_activity_key = self._proxy_activity_key(proxy_url, activity_type) + pipe = self.redis.pipeline() + pipe.zadd(proxy_activity_key, {str(ts): ts}) + # Keep last 5000 activities per proxy (higher limit) + pipe.zremrangebyrank(proxy_activity_key, 0, -5001) + pipe.execute() + logger.debug(f"Recorded {activity_type} for proxy '{proxy_url}'") - logger.debug(f"Recorded {activity_type} for profile '{name}' at {ts}") + log_msg = f"Recorded {activity_type} for profile '{name}' at {ts}" + if is_dummy: + log_msg += " (dummy, proxy activity skipped)" + logger.debug(log_msg) return True + def record_predicted_downloads(self, name: str, count: int, is_dummy: bool = False) -> bool: + """Records the number of download tasks predicted/created for a profile.""" + if count <= 0: + return False + + profile = self.get_profile(name) + if not profile: + logger.error(f"Profile '{name}' not found") + return False + + ts = time.time() + + # Update counters in profile + profile_key = self._profile_key(name) + self.redis.hincrby(profile_key, 'predicted_download_count', count) + self.redis.hincrby(profile_key, 'global_predicted_download_count', count) + + # Update last_used + self.redis.hset(profile_key, 'last_used', str(ts)) + + log_msg = f"Recorded {count} predicted downloads for profile '{name}'" + if is_dummy: + log_msg += " (dummy, proxy activity skipped)" + logger.debug(log_msg) + return True + def get_activity_rate(self, name: str, activity_type: str, window_seconds: int) -> int: """Get activity count within time window.""" @@ -612,6 +661,7 @@ class ProfileManager: 'tolerated_error_count': '0', 'download_count': '0', 'download_error_count': '0', + 'predicted_download_count': '0', } self.redis.hset(profile_key, mapping=counters_to_reset) logger.info(f"Reset session counters for profile '{name}'.") @@ -630,19 +680,21 @@ class ProfileManager: total_tolerated_error = sum(int(p.get('global_tolerated_error_count', 0)) for p in profiles) total_downloads = sum(int(p.get('global_download_count', 0)) for p in profiles) total_download_errors = sum(int(p.get('global_download_error_count', 0)) for p in profiles) + total_predicted_downloads = sum(int(p.get('global_predicted_download_count', 0)) for p in profiles) return { 'total_success': total_success, 'total_failure': total_failure, 'total_tolerated_error': total_tolerated_error, 'total_downloads': total_downloads, 'total_download_errors': total_download_errors, + 'total_predicted_downloads': total_predicted_downloads, } def get_per_proxy_stats(self) -> Dict[str, Dict[str, Any]]: """Get aggregated stats per proxy.""" profiles = self.list_profiles() proxy_stats = collections.defaultdict(lambda: { - 'success': 0, 'failure': 0, 'tolerated_error': 0, 'downloads': 0, 'download_errors': 0, 'profiles': 0 + 'success': 0, 'failure': 0, 'tolerated_error': 0, 'downloads': 0, 'download_errors': 0, 'predicted_downloads': 0, 'profiles': 0 }) for p in profiles: proxy = p.get('proxy') @@ -652,6 +704,7 @@ class ProfileManager: proxy_stats[proxy]['tolerated_error'] += int(p.get('global_tolerated_error_count', 0)) proxy_stats[proxy]['downloads'] += int(p.get('global_download_count', 0)) proxy_stats[proxy]['download_errors'] += int(p.get('global_download_error_count', 0)) + proxy_stats[proxy]['predicted_downloads'] += int(p.get('global_predicted_download_count', 0)) proxy_stats[proxy]['profiles'] += 1 return dict(proxy_stats) @@ -862,14 +915,14 @@ class ProfileManager: # Original logic: find all active profiles, 
optionally filtered by prefix. active_profiles = self.redis.zrange(self._state_key(ProfileState.ACTIVE.value), 0, -1) if not active_profiles: - logger.warning("No active profiles available to lock.") + self._log_lock_warning("No active profiles available to lock.") self.redis.incr(self._failed_lock_attempts_key()) return None if profile_prefix: profiles_to_check = [p for p in active_profiles if p.startswith(profile_prefix)] if not profiles_to_check: - logger.warning(f"No active profiles with prefix '{profile_prefix}' available to lock.") + self._log_lock_warning(f"No active profiles with prefix '{profile_prefix}' available to lock.") self.redis.incr(self._failed_lock_attempts_key()) return None else: @@ -883,9 +936,9 @@ class ProfileManager: if not full_profiles: if specific_profile_name: - logger.warning(f"Profile '{specific_profile_name}' is not eligible for locking (e.g., not ACTIVE or missing).") + self._log_lock_warning(f"Profile '{specific_profile_name}' is not eligible for locking (e.g., not ACTIVE or missing).") else: - logger.warning("No active profiles available to lock after filtering.") + self._log_lock_warning("No active profiles available to lock after filtering.") self.redis.incr(self._failed_lock_attempts_key()) return None @@ -898,7 +951,7 @@ class ProfileManager: ] if not eligible_profiles: - logger.warning("No active profiles with an active proxy available to lock.") + self._log_lock_warning("No active profiles with an active proxy available to lock.") self.redis.incr(self._failed_lock_attempts_key()) return None @@ -922,7 +975,7 @@ class ProfileManager: if not current_state or current_state.upper() != ProfileState.ACTIVE.value: # Another process (enforcer) changed the state. Release lock and try next. self.redis.hdel(locks_key, name) - logger.warning(f"Aborted lock for '{name}'; state changed from ACTIVE to '{current_state}' during lock acquisition.") + self._log_lock_warning(f"Aborted lock for '{name}'; state changed from ACTIVE to '{current_state}' during lock acquisition.") continue # State is still ACTIVE, proceed with locking. @@ -937,6 +990,7 @@ class ProfileManager: sm.lock(owner=owner) # The on_enter_locked action handles all Redis updates for the profile itself. # The logger messages are also in the action. + self._last_lock_warning = "" # Reset on successful lock return self.get_profile(name) except Exception as e: # This could be a TransitionNotAllowed error if the state changed, @@ -946,7 +1000,7 @@ class ProfileManager: self.redis.hdel(locks_key, name) continue - logger.warning("Could not lock any active profile (all may have been locked by other workers).") + self._log_lock_warning("Could not lock any active profile (all may have been locked by other workers).") self.redis.incr(self._failed_lock_attempts_key()) return None @@ -1016,45 +1070,15 @@ class ProfileManager: # the `on_enter` actions for the current state. We can suppress the initial transition # and set the state directly. - # WORKAROUND for older statemachine library: - # Instantiating the machine triggers an initial transition to ACTIVE, which wrongly updates Redis. - # We let this happen, and then immediately correct the state if it was supposed to be something else. - sm = ProfileStateMachine(manager=self, profile_name=name) - - # The sm is now in ACTIVE state, and Redis has been updated. If the original state was - # LOCKED, we must re-lock it to fix Redis and the state machine object so transitions work. 
- if current_state_str == ProfileState.LOCKED.value: - lock_owner = profile.get('lock_owner', 're-lock-owner') - try: - # This transition ensures the `on_enter_LOCKED` actions are run, making the - # state consistent in Redis and in the state machine object. - sm.lock(owner=lock_owner) - except Exception as e: - logger.error(f"Failed to re-lock profile '{name}' during state machine hydration: {e}") - # The state is now inconsistent, best to not return a broken machine. - return None - elif current_state_str != sm.current_state.value.upper(): - # For any other state, we must manually fix both the state machine object and Redis, - # as the constructor wrongly transitioned to ACTIVE. - - # 1. Force state on the machine object. This does not trigger actions. - target_state_obj = next((s for s in sm.states if s.value.upper() == current_state_str), None) - if not target_state_obj: - logger.error(f"Could not find state object for '{current_state_str}' during hydration of '{name}'.") - return None - sm.current_state = target_state_obj - - # 2. Manually revert the state in Redis to what it should be. - profile_key = self._profile_key(name) - pipe = self.redis.pipeline() - pipe.hset(profile_key, 'state', current_state_str) - # Atomically move the profile from the incorrect ACTIVE index to the correct one. - # The constructor may have added it to ACTIVE without removing it from its original state index. - pipe.zrem(self._state_key(ProfileState.ACTIVE.value), name) - pipe.zadd(self._state_key(current_state_str), {name: profile.get('last_used', time.time())}) - pipe.execute() - logger.debug(f"Corrected state for '{name}' to '{current_state_str}' in object and Redis during hydration.") - + # By passing `initial_value`, we tell the state machine to start in the + # profile's actual current state from Redis, instead of executing the + # default `initial=True` transition to ACTIVE. This prevents incorrect + # state changes and logs during "hydration" of the state machine. 
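
(Illustrative sketch, not part of the patch and not the real `python-statemachine` API.) The hydration change described above boils down to: constructing the machine in its declared initial state fires entry actions and rewrites Redis, which is wrong when we are merely re-reading a state that already exists, so the constructor now seeds the machine with the persisted state instead. A toy, library-independent model of that difference:

```python
from typing import Optional

class TinyMachine:
    """Toy model of the hydration problem; entry actions only run on a real transition."""
    STATES = {"ACTIVE", "LOCKED", "RESTING", "COOLDOWN", "BANNED", "PAUSED"}

    def __init__(self, initial_value: Optional[str] = None):
        if initial_value is None:
            # Default construction: enter ACTIVE and run its entry side effects
            # (index updates, log lines) -- undesirable when rebuilding from Redis.
            self.state = "ACTIVE"
            self.on_enter_active()
        else:
            # Hydration: trust the state persisted in Redis and skip entry actions.
            value = initial_value.upper()
            if value not in self.STATES:
                raise ValueError(f"unknown state {initial_value!r}")
            self.state = value

    def on_enter_active(self) -> None:
        print("entry action: profile indexed as ACTIVE")

# Re-building a machine for a profile that Redis says is LOCKED:
sm = TinyMachine(initial_value="locked")
assert sm.state == "LOCKED"   # no spurious transition to ACTIVE, no side effects
```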
+ sm = ProfileStateMachine( + manager=self, + profile_name=name, + initial_value=current_state_str + ) return sm def cleanup_stale_locks(self, max_lock_time_seconds: int) -> int: @@ -1475,36 +1499,77 @@ def _render_profile_group_summary_table(manager, all_profiles, profile_groups_co next_up_reason = f"least_loaded (load: {load}, {finish_str})" elif profile_selection_strategy == 'longest_idle': - # Find the single longest idle profile across all groups - ready_profiles = [] + # Find groups that don't have an active profile + groups_without_active = [] for group in profile_groups_config: + profiles_in_group = group.get('profiles_in_group', []) + has_active = False + for p_name in profiles_in_group: + p = all_profiles_by_name.get(p_name) + if p and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]: + has_active = True + break + if not has_active: + groups_without_active.append(group) + + # If all groups have active profiles, we need to find which one will be rotated next + # For now, let's find the group with the profile that has been used the longest + if not groups_without_active: + # Find the active profile with the highest request count (closest to rotation) + active_profiles_info = [] + for group in profile_groups_config: + for p_name in group.get('profiles_in_group', []): + p = all_profiles_by_name.get(p_name) + if p and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]: + # Calculate total requests + total_reqs = ( + p.get('success_count', 0) + p.get('failure_count', 0) + + p.get('tolerated_error_count', 0) + + p.get('download_count', 0) + p.get('download_error_count', 0) + ) + active_profiles_info.append({ + 'profile': p, + 'group': group, + 'total_reqs': total_reqs, + 'last_used': p.get('last_used', 0) + }) + + if active_profiles_info: + # Sort by total requests descending (highest first - closest to rotation) + # Then by last_used ascending (oldest first) + active_profiles_info.sort(key=lambda x: (-x['total_reqs'], x['last_used'])) + # The first one is most likely to be rotated next + # Find which group should be activated after it + # For simplicity, let's just indicate the current active group + # But this is not ideal + pass + + # Find the longest idle profile from groups without active profiles + ready_profiles = [] + for group in groups_without_active: for p_name in group.get('profiles_in_group', []): p = all_profiles_by_name.get(p_name) if p and p['state'] in [ProfileState.RESTING.value, ProfileState.COOLDOWN.value] and p.get('rest_until', 0) <= now and p.get('rest_reason') != 'waiting_downloads': - ready_profiles.append(p) + ready_profiles.append((p, group)) if ready_profiles: # Sort them according to the 'longest_idle' activation logic - unused_profiles = [p for p in ready_profiles if (p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)) == 0] - used_profiles = [p for p in ready_profiles if p not in unused_profiles] + unused_profiles = [(p, g) for p, g in ready_profiles if (p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)) == 0] + used_profiles = [(p, g) for p, g in ready_profiles if (p, g) not in unused_profiles] - unused_profiles.sort(key=lambda p: natural_sort_key(p.get('name', ''))) - used_profiles.sort(key=lambda p: (p.get('last_used', 0), natural_sort_key(p.get('name', '')))) + unused_profiles.sort(key=lambda x: 
natural_sort_key(x[0].get('name', ''))) + used_profiles.sort(key=lambda x: (x[0].get('last_used', 0), natural_sort_key(x[0].get('name', '')))) sorted_ready_profiles = unused_profiles + used_profiles if sorted_ready_profiles: - next_profile = sorted_ready_profiles[0] - # Find which group it belongs to - for group in profile_groups_config: - if next_profile['name'] in group.get('profiles_in_group', []): - next_up_group_name = group['name'] - next_up_reason = profile_selection_strategy - if getattr(args, 'show_reasons', False): - last_used_ts = next_profile.get('last_used', 0) - idle_time_str = f"idle for {format_duration(time.time() - last_used_ts)}" if last_used_ts > 0 else "never used" - next_up_reason = f"longest_idle (via {next_profile['name']}, {idle_time_str})" - break + next_profile, next_group = sorted_ready_profiles[0] + next_up_group_name = next_group['name'] + next_up_reason = profile_selection_strategy + if getattr(args, 'show_reasons', False): + last_used_ts = next_profile.get('last_used', 0) + idle_time_str = f"idle for {format_duration(time.time() - last_used_ts)}" if last_used_ts > 0 else "never used" + next_up_reason = f"longest_idle (via {next_profile['name']}, {idle_time_str})" # --- End new logic --- for group in profile_groups_config: @@ -1672,9 +1737,11 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups if is_auth_sim: row.extend([ p.get('success_count', 0), + p.get('predicted_download_count', 0), p.get('failure_count', 0), p.get('tolerated_error_count', 0), p.get('global_success_count', 0), + p.get('global_predicted_download_count', 0), p.get('global_failure_count', 0), ]) else: # is_download_sim or unknown @@ -1700,7 +1767,7 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups headers = ['Name', 'Proxy', 'State', 'Last Used'] if is_auth_sim: - headers.extend(['AuthOK', 'AuthFail', 'Skip.Err', 'Tot.AuthOK', 'Tot.AuthFail']) + headers.extend(['AuthOK', 'DataPred', 'AuthFail', 'Skip.Err', 'Tot.AuthOK', 'Tot.DataPred', 'Tot.AuthFail']) else: # is_download_sim or unknown headers.extend(['DataOK', 'DownFail', 'Skip.Err', 'Tot.DataOK', 'Tot.DownFail']) @@ -1870,7 +1937,7 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout): dl_success_rate = (dl_stats['total_downloads'] / total_dls * 100) if total_dls > 0 else 100 global_summary_str = ( - f"Auth: {total_reqs} reqs ({auth_stats['total_success']} OK, {auth_stats['total_failure']} Fail, {auth_stats['total_tolerated_error']} Tol.Err) | " + f"Auth: {total_reqs} reqs ({auth_stats['total_success']} OK, {auth_stats.get('total_predicted_downloads', 0)} Tasks, {auth_stats['total_failure']} Fail, {auth_stats['total_tolerated_error']} Tol.Err) | " f"OK Rate: {success_rate:.2f}% | " f"Failed Locks: {auth_failed_locks} || " f"Download: {total_dls} attempts ({dl_stats['total_downloads']} OK, {dl_stats['total_download_errors']} Fail) | " @@ -1894,6 +1961,7 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout): astats.get('profiles', 0), dstats.get('profiles', 0), astats.get('success', 0), + astats.get('predicted_downloads', 0), astats.get('failure', 0), astats.get('tolerated_error', 0), dstats.get('downloads', 0), @@ -1916,7 +1984,7 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout): if all_proxies: print("\n--- Per-Proxy Stats (Merged) ---", file=file) - proxy_headers = ['Proxy', 'State (A/D)', 'Profiles (A)', 'Profiles (D)', 'AuthOK', 'AuthFail', 'Skip.Err(A)', 'DataOK', 'DownFail', 
'Skip.Err(D)'] + proxy_headers = ['Proxy', 'State (A/D)', 'Profiles (A)', 'Profiles (D)', 'AuthOK', 'DataPred', 'AuthFail', 'Skip.Err(A)', 'DataOK', 'DownFail', 'Skip.Err(D)'] print(tabulate(proxy_table_data, headers=proxy_headers, tablefmt='grid'), file=file) print(f"\n--- Auth Simulation Profile Details ({args.auth_env}) ---", file=file) diff --git a/ytops_client-source/ytops_client/profile_statemachine.py b/ytops_client-source/ytops_client/profile_statemachine.py index e3cf7d6..0dfb786 100644 --- a/ytops_client-source/ytops_client/profile_statemachine.py +++ b/ytops_client-source/ytops_client/profile_statemachine.py @@ -51,10 +51,31 @@ class ProfileStateMachine(StateMachine): pause = active.to(paused) | locked.to(paused) | resting.to(paused) | cooldown.to(paused) - def __init__(self, manager: ProfileManager, profile_name: str, *args, **kwargs): + def __init__(self, manager: ProfileManager, profile_name: str, initial_value=None): self.manager = manager self.profile_name = profile_name - super().__init__(*args, **kwargs) + # Call parent constructor with model=self + super().__init__(model=self) + # If initial_value is provided, set the current state without triggering transitions + if initial_value is not None: + # Convert to uppercase to match state values + initial_value = initial_value.upper() + # Check if we're not already in this state + # Compare case-insensitively to handle any discrepancies + if self.current_state.value.upper() != initial_value: + # Find the corresponding state object + target_state = None + for state in self.states: + # Compare both .value and .id case-insensitively + if state.value.upper() == initial_value or (hasattr(state, 'id') and state.id.upper() == initial_value): + target_state = state + break + if target_state: + # Set current state without triggering transitions + self.current_state = target_state + else: + # If state not found, log a warning but don't crash + logger.warning(f"Could not find state '{initial_value}' (case-insensitive) for profile '{profile_name}'. Keeping current state '{self.current_state.value}'.") # --- Action Methods --- diff --git a/ytops_client-source/ytops_client/queue_manager_tool.py b/ytops_client-source/ytops_client/queue_manager_tool.py index 6bd422a..731ed26 100644 --- a/ytops_client-source/ytops_client/queue_manager_tool.py +++ b/ytops_client-source/ytops_client/queue_manager_tool.py @@ -70,12 +70,14 @@ class QueueManager: """Returns the number of items in a queue.""" return self.redis.llen(queue_name) - def populate(self, queue_name: str, file_path: str) -> int: - """Populates a queue from a file (text with one item per line, or JSON with an array of strings).""" + def push_from_file(self, queue_name: str, file_path: str, wrap_key: Optional[str] = None) -> int: + """Populates a queue from a file (text with one item per line, or JSON with an array of items).""" count = 0 if file_path.lower().endswith('.json'): - logger.info("Detected JSON file. Attempting to parse as an array of strings.") + if wrap_key: + logger.warning("--wrap-file-line-in-json is ignored for JSON files, as they are expected to contain complete items.") + logger.info("Detected JSON file. Attempting to parse as an array of items.") try: with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) @@ -83,13 +85,21 @@ class QueueManager: logger.error("JSON file must contain a list/array.") return 0 - items_to_add = [str(item).strip() for item in data if str(item).strip()] + # Items can be strings or objects. 
If objects, they should be converted to JSON strings. + items_to_add = [] + for item in data: + if isinstance(item, str): + items_to_add.append(item.strip()) + else: + items_to_add.append(json.dumps(item)) + + items_to_add = [item for item in items_to_add if item] pipe = self.redis.pipeline() for item in items_to_add: pipe.rpush(queue_name, item) count += 1 - if count % 1000 == 0: + if count > 0 and count % 1000 == 0: pipe.execute() logger.info(f"Pushed {count} items...") pipe.execute() @@ -105,9 +115,13 @@ class QueueManager: for line in f: item = line.strip() if item: - pipe.rpush(queue_name, item) + if wrap_key: + payload = json.dumps({wrap_key: item}) + else: + payload = item + pipe.rpush(queue_name, payload) count += 1 - if count % 1000 == 0: + if count > 0 and count % 1000 == 0: pipe.execute() logger.info(f"Pushed {count} items...") pipe.execute() @@ -118,6 +132,45 @@ class QueueManager: logger.info(f"Finished. Pushed a total of {count} items to '{queue_name}'.") return count + def push_generated(self, queue_name: str, prefix: str, count: int) -> int: + """Pushes generated payloads to a queue.""" + from datetime import datetime + timestamp = datetime.now().strftime('%Y%m%dt%H%M') + + pipe = self.redis.pipeline() + pushed_count = 0 + for i in range(count): + generated_value = f"{prefix}_{timestamp}_{i:04d}" + payload = json.dumps({"url": generated_value}) + pipe.rpush(queue_name, payload) + pushed_count += 1 + if pushed_count > 0 and pushed_count % 1000 == 0: + pipe.execute() + logger.info(f"Pushed {pushed_count} of {count} items...") + pipe.execute() + logger.info(f"Finished. Pushed a total of {pushed_count} items to '{queue_name}'.") + return pushed_count + + def push_static(self, queue_name: str, payload: str, count: int) -> int: + """Pushes a static payload multiple times to a queue.""" + try: + json.loads(payload) + except json.JSONDecodeError: + logger.error(f"Invalid JSON in --payload-json: {payload}") + return 0 + + pipe = self.redis.pipeline() + pushed_count = 0 + for _ in range(count): + pipe.rpush(queue_name, payload) + pushed_count += 1 + if pushed_count > 0 and pushed_count % 1000 == 0: + pipe.execute() + logger.info(f"Pushed {pushed_count} of {count} items...") + pipe.execute() + logger.info(f"Finished. Pushed a total of {pushed_count} items to '{queue_name}'.") + return pushed_count + def clear(self, queue_name: str, dump_path: Optional[str] = None) -> int: """Clears a queue, optionally dumping its contents to a file.""" size = self.redis.llen(queue_name) @@ -174,10 +227,17 @@ def add_queue_manager_parser(subparsers): peek_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '_stress_inbox'.") peek_parser.add_argument('--count', type=int, default=10, help='Number of items to show (default: 10)') - # Populate command - populate_parser = subparsers.add_parser('populate', help='Populate a queue from a file (one item per line).', parents=[common_parser]) - populate_parser.add_argument('file_path', help='Path to the file containing items to add.') - populate_parser.add_argument('--queue-name', help="Name of the queue to populate. Defaults to '_stress_inbox'.") + # Push command + push_parser = subparsers.add_parser('push', help='Push items to a queue from a file, a generator, or a static payload.', parents=[common_parser]) + push_parser.add_argument('queue_name', nargs='?', help="Name of the queue. 
Defaults to '_stress_inbox'.") + push_parser.add_argument('--count', type=int, default=1, help='Number of items to push (for --payload-json or --generate-payload-prefix).') + + source_group = push_parser.add_mutually_exclusive_group(required=True) + source_group.add_argument('--from-file', dest='file_path', help='Path to a file containing items to add (one per line, or a JSON array).') + source_group.add_argument('--payload-json', help='A static JSON payload to push. Use with --count to push multiple times.') + source_group.add_argument('--generate-payload-prefix', help='Generate JSON payloads with a timestamp and counter. Example: {"url": "PREFIX_yyyymmddthhmm_0001"}. Use with --count.') + + push_parser.add_argument('--wrap-file-line-in-json', metavar='KEY', help="For text files (--from-file), wrap each line in a JSON object with the specified key (e.g., 'url' -> {\"url\": \"line_content\"}).") # Clear command clear_parser = subparsers.add_parser('clear', help='Clear a queue, optionally dumping its contents.', parents=[common_parser]) @@ -215,9 +275,9 @@ def main_queue_manager(args): ) # For commands that operate on a single queue, set a default name based on the environment if not provided. - is_single_queue_command = args.queue_command in ['peek', 'populate', 'clear'] + is_single_queue_command = args.queue_command in ['peek', 'push', 'clear'] if is_single_queue_command: - # `populate` uses an option (--queue-name), while `peek` and `clear` use a positional argument. + # `push`, `peek` and `clear` use a positional argument for queue_name. # We check for `queue_name` attribute and if it's falsy (None or empty string). if not getattr(args, 'queue_name', None): default_queue_name = f"{args.env}_stress_inbox" @@ -244,11 +304,21 @@ def main_queue_manager(args): print(f"{i+1: >3}: {item}") return 0 - elif args.queue_command == 'populate': - if not os.path.exists(args.file_path): - print(f"Error: File not found at '{args.file_path}'", file=sys.stderr) - return 1 - manager.populate(args.queue_name, args.file_path) + elif args.queue_command == 'push': + if args.file_path: + if not os.path.exists(args.file_path): + print(f"Error: File not found at '{args.file_path}'", file=sys.stderr) + return 1 + if args.count > 1: + logger.warning("--count is ignored when using --from-file.") + manager.push_from_file(args.queue_name, args.file_path, args.wrap_file_line_in_json) + elif args.payload_json: + manager.push_static(args.queue_name, args.payload_json, args.count) + elif args.generate_payload_prefix: + if args.count <= 0: + print("Error: --count must be 1 or greater for --generate-payload-prefix.", file=sys.stderr) + return 1 + manager.push_generated(args.queue_name, args.generate_payload_prefix, args.count) return 0 elif args.queue_command == 'clear': diff --git a/ytops_client-source/ytops_client/requirements.txt b/ytops_client-source/ytops_client/requirements.txt index 9132043..2a15143 100644 --- a/ytops_client-source/ytops_client/requirements.txt +++ b/ytops_client-source/ytops_client/requirements.txt @@ -11,6 +11,9 @@ aria2p # For reading .env files for configuration python-dotenv==1.0.1 +# For 'direct_docker_cli' orchestration mode in stress-policy +docker + # For SOCKS proxy support in client tools PySocks diff --git a/ytops_client-source/ytops_client/stress_policy/arg_parser.py b/ytops_client-source/ytops_client/stress_policy/arg_parser.py index af1c654..6fa5310 100644 --- a/ytops_client-source/ytops_client/stress_policy/arg_parser.py +++ 
b/ytops_client-source/ytops_client/stress_policy/arg_parser.py @@ -167,7 +167,7 @@ Overridable Policy Parameters via --set: parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.') parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.') parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)") - parser.add_argument('--profile-prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages.") + parser.add_argument('--profile-prefix', '--user-prefix', dest='profile_prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages. Can be a comma-separated list.") parser.add_argument('--start-from-url-index', type=int, help='Start processing from this line number (1-based) in the urls_file. Overrides saved state.') parser.add_argument('--expire-time-shift-minutes', type=int, help="Consider URLs expiring in N minutes as expired. Overrides policy.") diff --git a/ytops_client-source/ytops_client/stress_policy/direct_batch_worker.py b/ytops_client-source/ytops_client/stress_policy/direct_batch_worker.py index 82a97e8..52a1bda 100644 --- a/ytops_client-source/ytops_client/stress_policy/direct_batch_worker.py +++ b/ytops_client-source/ytops_client/stress_policy/direct_batch_worker.py @@ -53,6 +53,7 @@ def run_direct_batch_worker(worker_id, policy, state_manager, args, profile_mana os.makedirs(save_dir, exist_ok=True) last_used_profile_name = None + last_no_task_log_msg = "" while not state_manager.shutdown_event.is_set(): locked_profile = None temp_batch_file = None @@ -93,13 +94,24 @@ def run_direct_batch_worker(worker_id, policy, state_manager, args, profile_mana if profiles_in_prefix: state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) - logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix}*): {states_summary}. Pausing for {polling_interval}s.") + base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix}*): {states_summary}." else: - logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix}'. Pausing for {polling_interval}s.") + base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix}'." 
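
(Illustrative sketch, not part of the patch.) The `last_no_task_log_msg` bookkeeping introduced just below replaces a repeated "no profiles available" log line with a single dot per polling cycle, and closes the row of dots with a newline as soon as a different message (or real work) arrives. The same pattern recurs in several workers; isolated, it is roughly:

```python
import sys

class RepeatSquelcher:
    """Log a message once; emit '.' for identical repeats until the message changes."""

    def __init__(self, log_fn, stream=sys.stderr):
        self._last = ""
        self._log = log_fn
        self._stream = stream

    def emit(self, msg: str) -> None:
        if msg == self._last:
            print(".", end="", file=self._stream, flush=True)
            return
        if self._last:
            print(file=self._stream)   # finish the row of dots
        self._log(msg)
        self._last = msg

    def reset(self) -> None:
        """Call when work resumes so the dot row is closed out."""
        if self._last:
            print(file=self._stream)
        self._last = ""

squelch = RepeatSquelcher(print)
for _ in range(3):
    squelch.emit("no auth profiles available to lock")   # one full line, then two dots
squelch.reset()
```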
+ + if base_log_msg == last_no_task_log_msg: + print(".", end="", file=sys.stderr, flush=True) + else: + if last_no_task_log_msg: + print(file=sys.stderr) + logger.info(f"{base_log_msg} Pausing for {polling_interval}s.") + last_no_task_log_msg = base_log_msg # --- End diagnostic logging --- time.sleep(polling_interval) continue + if last_no_task_log_msg: + print(file=sys.stderr) + last_no_task_log_msg = "" profile_name = locked_profile['name'] proxy_url = locked_profile['proxy'] @@ -403,4 +415,4 @@ def run_direct_batch_worker(worker_id, policy, state_manager, args, profile_mana os.unlink(temp_batch_file) logger.info(f"[Worker {worker_id}] Worker loop finished.") - return [] \ No newline at end of file + return [] diff --git a/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py b/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py index 908e64b..a4fd847 100644 --- a/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py +++ b/ytops_client-source/ytops_client/stress_policy/direct_docker_worker.py @@ -113,6 +113,7 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man return [] last_used_profile_name = None + last_no_task_log_msg = "" while not state_manager.shutdown_event.is_set(): locked_profile = None temp_task_dir_host = None @@ -120,11 +121,25 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man # --- Variables for robust finalization --- url_batch_len = 0 batch_started = False - downloads_per_url = 0 # Default to 0, meaning no increment unless configured + num_formats_per_url = 0 + downloads_to_increment = 0 # --- try: - # 1. Lock a profile - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) + # 1. Lock a profile, trying any of the specified prefixes + locked_profile = None + prefixes_to_try = [] + if profile_prefix: + prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()] + + if prefixes_to_try: + random.shuffle(prefixes_to_try) + for prefix in prefixes_to_try: + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix) + if locked_profile: + break + else: + # Fallback for empty/no prefix, which means lock any available profile + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None) # --- New logic to avoid immediate reuse --- avoid_reuse = direct_policy.get('avoid_immediate_profile_reuse', False) @@ -135,9 +150,21 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man wait_seconds = direct_policy.get('avoid_reuse_max_wait_seconds', 5) time.sleep(wait_seconds) - # After waiting, try to lock again. 
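
(Illustrative sketch, not part of the patch.) The locking loop above, and its retry just below, split `--profile-prefix` on commas, shuffle the prefixes, and try each one until a lock succeeds, falling back to "lock any available profile" when no prefix is configured. Condensed, with `lock_profile` standing in for `ProfileManager.lock_profile`:

```python
import random
from typing import Callable, Optional

def lock_with_prefixes(lock_profile: Callable[[str, Optional[str]], Optional[dict]],
                       owner: str, prefix_spec: Optional[str]) -> Optional[dict]:
    """Try each comma-separated prefix in random order; None means 'lock any profile'."""
    prefixes = [p.strip() for p in (prefix_spec or "").split(",") if p.strip()]
    if not prefixes:
        return lock_profile(owner, None)
    random.shuffle(prefixes)              # avoid always draining the first pool
    for prefix in prefixes:
        profile = lock_profile(owner, prefix)
        if profile:
            return profile
    return None

# e.g. lock_with_prefixes(manager_lock, "worker-7", "de_,us_,fr_")
```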
+ # After waiting, try to lock again, from any of the available prefixes logger.info(f"[Worker {worker_id}] Attempting to lock a new profile after waiting.") - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) + locked_profile = None + prefixes_to_try = [] + if profile_prefix: + prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()] + + if prefixes_to_try: + random.shuffle(prefixes_to_try) + for prefix in prefixes_to_try: + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix) + if locked_profile: + break + else: + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None) if locked_profile and locked_profile['name'] == last_used_profile_name: logger.warning(f"[Worker {worker_id}] Still locking the same profile '{locked_profile['name']}' after waiting. Proceeding to use it to avoid getting stuck.") @@ -153,13 +180,24 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man if profiles_in_prefix: state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) - logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s.") + base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix or '*'}*): {states_summary}." else: - logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s.") + base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix or '*'}'." + + if base_log_msg == last_no_task_log_msg: + print(".", end="", file=sys.stdout, flush=True) + else: + if last_no_task_log_msg: + print(file=sys.stdout) + logger.info(f"{base_log_msg} Pausing for {polling_interval}s.") + last_no_task_log_msg = base_log_msg # --- End diagnostic logging --- time.sleep(polling_interval) continue + if last_no_task_log_msg: + print(file=sys.stderr) + last_no_task_log_msg = "" profile_name = locked_profile['name'] proxy_url = locked_profile['proxy'] @@ -237,24 +275,33 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man url_batch_len = len(url_batch) batch_started = True - # --- Calculate how many download tasks will be generated --- - # The "pending downloads" counter for an auth profile tracks the number of - # info.json files it has generated that are waiting to be processed. - # Each successful URL fetch creates one info.json file (one task). - # The number of actual media files downloaded from that info.json is - # irrelevant to the auth profile's counter. - downloads_per_url = 0 # Default to 0, meaning no increment unless configured + # --- Calculate how many download tasks will be generated to set the counter --- downloads_per_url_config = gen_policy.get('downloads_per_url') + num_formats_per_url = 0 # Default to no increment + if downloads_per_url_config: - downloads_per_url = 1 # We just need a flag to enable the logic, the count is per-URL. + if isinstance(downloads_per_url_config, int): + num_formats_per_url = downloads_per_url_config + elif downloads_per_url_config == 'from_download_policy': + # Heuristic: count comma-separated groups in the format selector. 
+ # This mirrors how yt-dlp processes multiple format downloads. + d_policy = policy.get('download_policy', {}) + formats_str = d_policy.get('formats', '') + if formats_str: + # Each comma separates a group from which one format is downloaded. + num_formats_per_url = formats_str.count(',') + 1 + else: + num_formats_per_url = 1 # fallback to 1 if formats is empty + elif downloads_per_url_config: + # Fallback for non-int, non-'from_download_policy' truthy values + num_formats_per_url = 1 - if downloads_per_url > 0: - # Increment by the number of URLs in the batch. - downloads_to_increment = url_batch_len + if num_formats_per_url > 0: + downloads_to_increment = url_batch_len * num_formats_per_url profile_manager_instance.increment_pending_downloads(profile_name, downloads_to_increment) - logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for the upcoming batch of {url_batch_len} URLs.") + logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for a batch of {url_batch_len} URLs ({num_formats_per_url} format(s)/URL).") else: - logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured. Pending downloads counter will not be incremented for this batch.") + logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured or is zero. Pending downloads counter will not be incremented for this batch.") end_idx = start_idx + len(url_batch) logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs (lines {start_idx + 1}-{end_idx} from source).") @@ -480,14 +527,15 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man logger.error(f"Error during immediate post-processing from log line: {e}") with activity_lock: + is_dummy = args.dummy or args.dummy_batch if post_processed_successfully: live_success_count += 1 logger.info(f"[Worker {worker_id}] [{profile_name}] Live success #{live_success_count} detected and post-processed.") - profile_manager_instance.record_activity(profile_name, 'success') + profile_manager_instance.record_activity(profile_name, 'success', is_dummy=is_dummy) else: live_failure_count += 1 logger.error(f"[Worker {worker_id}] [{profile_name}] Post-processing failed for a successful fetch. 
Recording as failure.") - profile_manager_instance.record_activity(profile_name, 'failure') + profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=is_dummy) # --- End immediate post-processing --- return False @@ -495,9 +543,10 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man for pattern in fatal_error_patterns: if re.search(pattern, line, re.IGNORECASE): with activity_lock: + is_dummy = args.dummy or args.dummy_batch live_failure_count += 1 logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL error #{live_failure_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'failure') + profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=is_dummy) if direct_policy.get('ban_on_fatal_error_in_batch'): logger.warning(f"Banning profile '{profile_name}' immediately due to fatal error to stop container.") profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during batch') @@ -512,16 +561,18 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man for pattern in tolerated_error_patterns: if re.search(pattern, line, re.IGNORECASE): with activity_lock: + is_dummy = args.dummy or args.dummy_batch live_tolerated_count += 1 logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED error #{live_tolerated_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy) return False # If it's an ERROR: line and not tolerated, it's a failure with activity_lock: + is_dummy = args.dummy or args.dummy_batch live_failure_count += 1 logger.warning(f"[Worker {worker_id}] [{profile_name}] Live failure #{live_failure_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'failure') + profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=is_dummy) return False @@ -544,14 +595,14 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man rand_val = random.random() if rand_val < auth_skipped_rate: logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating tolerated failure for {video_id}.") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=True) elif rand_val < (auth_skipped_rate + auth_failure_rate): logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal failure for {video_id}.") - profile_manager_instance.record_activity(profile_name, 'failure') + profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=True) else: # Success files_created += 1 - profile_manager_instance.record_activity(profile_name, 'success') + profile_manager_instance.record_activity(profile_name, 'success', is_dummy=True) # Create a dummy file in the temp output dir to simulate success dummy_output_path = Path(temp_task_dir_host) / f"{video_id}.info.json" @@ -644,13 +695,14 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man finally: if locked_profile and batch_started: # --- Reconcile pending downloads counter --- - if downloads_per_url > 0: - # We incremented by url_batch_len at the start. - # Now we adjust for any URLs that failed to produce an info.json. - # The adjustment is the number of successes minus the number of attempts. 
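
(Illustrative sketch, not part of the patch.) The reconciliation just below now works in download units rather than URLs: the counter was pre-incremented by `urls × formats` (with the format count taken from the policy's comma-separated selector, as in the heuristic earlier in this hunk), and after the batch an adjustment subtracts the share of URLs that never produced an info.json. The arithmetic, isolated:

```python
def formats_per_url(formats_selector: str) -> int:
    """Heuristic from the patch: one download per comma-separated format group."""
    return formats_selector.count(",") + 1 if formats_selector else 1

def pending_downloads_adjustment(url_batch_len: int,
                                 num_formats_per_url: int,
                                 successful_info_jsons: int) -> int:
    """Signed correction applied in the finally block after a batch finishes."""
    initial_increment = url_batch_len * num_formats_per_url
    expected_from_successes = successful_info_jsons * num_formats_per_url
    return expected_from_successes - initial_increment

assert formats_per_url("bv*+ba/b,bestaudio") == 2
# 10 URLs, 2 formats each, but only 7 info.json files produced -> subtract 6:
assert pending_downloads_adjustment(10, 2, 7) == -6
```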
- initial_increment = url_batch_len - actual_successes = live_success_count - adjustment = actual_successes - initial_increment + if downloads_to_increment > 0: + # We incremented at the start. Now we adjust for any URLs that failed to produce an info.json. + initial_increment = downloads_to_increment + actual_successes = live_success_count # This is count of successful info.jsons + + expected_downloads_from_successes = actual_successes * num_formats_per_url + + adjustment = expected_downloads_from_successes - initial_increment if adjustment != 0: # The adjustment will be negative, effectively decrementing the counter for each failure. @@ -680,10 +732,21 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man if cooldown: logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") + # For auth simulation, check if profile has pending downloads + # If so, don't apply cooldown to avoid conflicting with enforcer + profile_data = profile_manager_instance.get_profile(locked_profile['name']) + should_apply_cooldown = cooldown + if profile_data: + pending_downloads = profile_manager_instance.get_pending_downloads(locked_profile['name']) + rest_reason = profile_data.get('rest_reason') + if pending_downloads > 0 or rest_reason == 'waiting_downloads': + should_apply_cooldown = None + logger.info(f"[Worker {worker_id}] Auth profile '{locked_profile['name']}' has pending downloads or is waiting for downloads. Not applying cooldown.") + unlocked_successfully = profile_manager_instance.unlock_profile( locked_profile['name'], owner=owner_id, - rest_for_seconds=cooldown + rest_for_seconds=should_apply_cooldown ) if not unlocked_successfully: logger.error(f"[Worker {worker_id}] FAILED to unlock profile '{locked_profile['name']}'. The profile may be stuck.") @@ -751,7 +814,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr return [] no_task_streak = 0 - last_used_profile_name = None + last_no_task_log_msg = "" task_counter = 0 while not state_manager.shutdown_event.is_set(): locked_profile = None @@ -760,6 +823,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr was_banned_by_parser = False task = None task_id = None + downloads_processed_in_task = 0 try: if no_task_streak > 0 and not queue_policy: # Polling only makes sense for file mode polling_interval = exec_control.get('worker_polling_interval_seconds', 1) @@ -769,9 +833,18 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr if profiles_in_prefix: state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) - logger.info(f"[Worker {worker_id}] No tasks found or profiles available. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})") + base_log_msg = f"[Worker {worker_id}] No tasks found or profiles available. Pool status ({profile_prefix or '*'}*): {states_summary}." else: - logger.info(f"[Worker {worker_id}] No tasks found or profiles available. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})") + base_log_msg = f"[Worker {worker_id}] No tasks found or profiles available. No profiles found with prefix '{profile_prefix or '*'}'." 
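
(Illustrative sketch, not part of the patch.) The cooldown change shown just above makes the auth worker skip the cooldown whenever the profile still has pending downloads, or is already resting with reason `waiting_downloads`, so it does not fight the policy enforcer over the same profile. Reduced to the decision itself (helper signatures are hypothetical):

```python
from typing import Optional

def effective_cooldown(cooldown: Optional[int],
                       pending_downloads: int,
                       rest_reason: Optional[str]) -> Optional[int]:
    """Cooldown to pass to unlock_profile, or None to leave the profile to the enforcer."""
    if cooldown and (pending_downloads > 0 or rest_reason == "waiting_downloads"):
        return None
    return cooldown

assert effective_cooldown(300, pending_downloads=4, rest_reason=None) is None
assert effective_cooldown(300, pending_downloads=0, rest_reason=None) == 300
```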
+ + if base_log_msg == last_no_task_log_msg: + print(".", end="", file=sys.stdout, flush=True) + else: + if last_no_task_log_msg: + print(file=sys.stdout) + full_log_msg = f"{base_log_msg} Pausing for {polling_interval}s. (Streak: {no_task_streak})" + logger.info(full_log_msg) + last_no_task_log_msg = base_log_msg # --- End diagnostic logging --- time.sleep(polling_interval) if state_manager.shutdown_event.is_set(): continue @@ -844,17 +917,27 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr if claimed_task_path_host: no_task_streak = 0 + if last_no_task_log_msg: + print(file=sys.stderr) + last_no_task_log_msg = "" auth_profile_name, auth_env = None, None info_data = None - # --- Read info.json content and metadata first --- + # In queue mode, the task object is the primary source of truth for auth metadata. + # This check is conditional because in file-based mode, `task` will be None. + if task: + auth_profile_name = task.get('auth_profile_name') + auth_env = task.get('auth_env') + + # --- Read info.json content and use its metadata as a fallback --- try: with open(claimed_task_path_host, 'r', encoding='utf-8') as f: info_data = json.load(f) - # This is critical for decrementing the counter in the finally block - metadata = info_data.get('_ytops_metadata', {}) - auth_profile_name = metadata.get('profile_name') - auth_env = metadata.get('auth_env') + # Fallback to info.json metadata if not present in the task object. + if not auth_profile_name or not auth_env: + metadata = info_data.get('_ytops_metadata', {}) + auth_profile_name = auth_profile_name or metadata.get('profile_name') + auth_env = auth_env or metadata.get('auth_env') except (IOError, json.JSONDecodeError) as e: logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path_host.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.") continue # Skip to finally block to unlock profile @@ -874,7 +957,12 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr source_of_format = "task file" if not format_selection: format_selection = d_policy.get('formats', '') - source_of_format = "policy" + source_of_format = "policy (download_policy.formats)" + + if not format_selection: + ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {}) + format_selection = ytdlp_config_overrides.get('format', '') + source_of_format = "policy (ytdlp_config_overrides.format)" if not format_selection: logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. 
Simulating a single download.") @@ -907,17 +995,18 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr details = f"Dummy skipped failure for format {format_id}" error_type = "DummySkippedFailure" is_tolerated_error = True - profile_manager_instance.record_activity(profile_name, 'tolerated_error') + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=True) elif should_fail_fatal: logger.warning(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure for format '{format_id}'.") details = f"Dummy fatal failure for format {format_id}" error_type = "DummyFailure" - profile_manager_instance.record_activity(profile_name, 'download_error') + profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=True) else: logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success for format '{format_id}'.") success = True details = f"Dummy success for format {format_id}" - profile_manager_instance.record_activity(profile_name, 'download') + profile_manager_instance.record_activity(profile_name, 'download', is_dummy=True) + downloads_processed_in_task += 1 event = { 'type': 'direct_docker_download', @@ -963,7 +1052,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr else: logger.warning(f"[Worker {worker_id}] [{profile_name}] Skipping task '{claimed_task_path_host.name}' because its URL is expired.") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch)) event = { 'type': 'direct_docker_download', 'profile': profile_name, @@ -1066,23 +1155,23 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr def log_parser_callback(line): nonlocal live_success_count, live_failure_count, live_tolerated_count, was_banned_by_parser - # Success is a high-priority check. Only record one success per task. + # Success is a high-priority check. 
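
(Illustrative sketch, not part of the patch.) The log callback just below now records every "100% of" / "has already been downloaded" line as a success (the old one-per-task cap is removed, since the per-task count is later used to decrement pending downloads) and otherwise classifies the line as fatal, tolerated, or a generic failure. A simplified classifier in the same spirit; the pattern lists here are placeholders, not values shipped in any policy file:

```python
import re
from typing import Sequence

FATAL_PATTERNS: Sequence[str] = [r"HTTP Error 403", r"Sign in to confirm"]    # placeholders
TOLERATED_PATTERNS: Sequence[str] = [r"Video unavailable", r"Premieres in"]   # placeholders

def classify_download_log_line(line: str) -> str:
    """Return 'download', 'download_error_fatal', 'tolerated_error', 'download_error', or 'ignore'."""
    if "[download] 100% of" in line or "has already been downloaded" in line:
        return "download"
    for pattern in FATAL_PATTERNS:
        if re.search(pattern, line, re.IGNORECASE):
            return "download_error_fatal"      # may also trigger an immediate ban
    if line.lstrip().startswith("ERROR:"):
        for pattern in TOLERATED_PATTERNS:
            if re.search(pattern, line, re.IGNORECASE):
                return "tolerated_error"
        return "download_error"
    return "ignore"

assert classify_download_log_line("[download] 100% of 3.1MiB in 00:02") == "download"
assert classify_download_log_line("ERROR: something unexpected") == "download_error"
```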
if '[download] 100% of' in line or 'has already been downloaded' in line: with activity_lock: - # Only count one success per task - if live_success_count == 0: - live_success_count += 1 - logger.info(f"[Worker {worker_id}] [{profile_name}] Live download success detected from log.") - profile_manager_instance.record_activity(profile_name, 'download') + is_dummy = args.dummy or args.dummy_batch + live_success_count += 1 + logger.info(f"[Worker {worker_id}] [{profile_name}] Live download success #{live_success_count} detected from log.") + profile_manager_instance.record_activity(profile_name, 'download', is_dummy=is_dummy) return False # Check for fatal patterns for pattern in fatal_error_patterns: if re.search(pattern, line, re.IGNORECASE): with activity_lock: + is_dummy = args.dummy or args.dummy_batch live_failure_count += 1 logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL download error #{live_failure_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'download_error') + profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=is_dummy) if direct_policy.get('ban_on_fatal_error_in_batch'): logger.warning(f"Banning profile '{profile_name}' immediately due to fatal download error to stop container.") profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during download') @@ -1098,16 +1187,18 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr for pattern in tolerated_error_patterns: if re.search(pattern, line, re.IGNORECASE): with activity_lock: + is_dummy = args.dummy or args.dummy_batch live_tolerated_count += 1 logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED download error #{live_tolerated_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy) return False # If it's an ERROR: line and not tolerated, it's a failure with activity_lock: + is_dummy = args.dummy or args.dummy_batch live_failure_count += 1 logger.warning(f"[Worker {worker_id}] [{profile_name}] Live download failure #{live_failure_count} detected from log: {line}") - profile_manager_instance.record_activity(profile_name, 'download_error') + profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=is_dummy) return False @@ -1149,11 +1240,11 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr final_outcome = "download" logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific success/error log line matched, but exit code is 0. Assuming success, but this may indicate a parsing issue.") # We record a success here as a fallback, in case the log parser missed it. - profile_manager_instance.record_activity(profile_name, 'download') + profile_manager_instance.record_activity(profile_name, 'download', is_dummy=(args.dummy or args.dummy_batch)) else: final_outcome = "download_error" logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific error log line matched, but exit code was {retcode}. 
Recording a generic download_error.") - profile_manager_instance.record_activity(profile_name, 'download_error') + profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=(args.dummy or args.dummy_batch)) # --- Airflow Directory Logic --- if success and d_policy.get('output_to_airflow_ready_dir'): @@ -1231,6 +1322,9 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr event = { 'type': 'direct_docker_download', 'profile': profile_name, 'proxy_url': locked_profile['proxy'], 'success': success, 'details': event_details } state_manager.log_event(event) + # Store the number of detected successful downloads to use for decrementing the counter. + downloads_processed_in_task = live_success_count + logger.info(f"[Worker {worker_id}] [{profile_name}] Task processing complete. Worker will now unlock profile and attempt next task.") # 6. Clean up task file @@ -1274,25 +1368,34 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr auth_manager = get_auth_manager(profile_manager_instance, auth_env) if auth_manager: try: - auth_manager.decrement_pending_downloads(auth_profile_name) - logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}'.") + # Fallback: if no downloads were counted (e.g., due to an early exit or parsing issue), + # but the task is being finalized, we must assume all potential downloads for this task + # are "processed" to prevent the auth profile from getting stuck. + if downloads_processed_in_task == 0: + logger.warning(f"[Worker {worker_id}] No downloads were counted for this task. Using policy to determine decrement count to avoid stuck profile.") + ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {}) + formats_str = ytdlp_config_overrides.get('format', d_policy.get('formats', '')) + num_formats = formats_str.count(',') + 1 if formats_str else 1 + downloads_processed_in_task = num_formats + logger.warning(f"[Worker {worker_id}] Decrementing by fallback count: {downloads_processed_in_task}") + + if downloads_processed_in_task > 0: + new_count = auth_manager.increment_pending_downloads(auth_profile_name, -downloads_processed_in_task) + logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' by {downloads_processed_in_task} in env '{auth_env}'. New count: {new_count}") except Exception as e: logger.error(f"[Worker {worker_id}] Failed to decrement pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}': {e}", exc_info=True) else: logger.error(f"[Worker {worker_id}] Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") else: - logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented. (Profile: {auth_profile_name}, Env: {auth_env})") + logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env. Pending downloads counter will not be decremented. (Profile: {auth_profile_name}, Env: {auth_env}) Task had auth_profile_name: {task.get('auth_profile_name')}, auth_env: {task.get('auth_env')}") if was_banned_by_parser: logger.info(f"[Worker {worker_id}] Profile '{locked_profile['name']}' was already banned by the log parser. 
Skipping unlock/cooldown.") else: - last_used_profile_name = locked_profile['name'] cooldown = None + # Only apply cooldown if a task was actually claimed and processed. if claimed_task_path_host: - # Enforcer is the only point where we configure to apply different policies, - # since we might restart enforcer, but won't restart stress-policy working on auth and downloads simultaneously. - # This is like applying a policy across multiple workers/machines without needing to restart each of them. # DESIGN: The cooldown duration is not configured in the worker's policy. # Instead, it is read from a central Redis key. This key is set by the # policy-enforcer, making the enforcer the single source of truth for @@ -1325,7 +1428,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr if isinstance(cooldown_source_value, str) and cooldown_source_value.isdigit(): cooldown = int(cooldown_source_value) logger.debug(f"Determined cooldown from {source_description}: {cooldown_source_value}") - + if cooldown: logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") diff --git a/ytops_client-source/ytops_client/stress_policy/direct_download_worker.py b/ytops_client-source/ytops_client/stress_policy/direct_download_worker.py index 80944d4..d76ce14 100644 --- a/ytops_client-source/ytops_client/stress_policy/direct_download_worker.py +++ b/ytops_client-source/ytops_client/stress_policy/direct_download_worker.py @@ -46,11 +46,14 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m os.makedirs(output_dir, exist_ok=True) no_task_streak = 0 + last_no_task_log_msg = "" while not state_manager.shutdown_event.is_set(): locked_profile = None claimed_task_path = None auth_profile_name, auth_env = None, None # For finally block + downloads_to_decrement = 0 + formats_from_metadata, granularity_from_metadata = [], 'per_format' try: # 0. If no tasks were found, pause briefly. if no_task_streak > 0: @@ -61,9 +64,19 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m if profiles_in_prefix: state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) - logger.info(f"[Worker {worker_id}] No tasks found for available profiles. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})") + base_log_msg = f"[Worker {worker_id}] No tasks found for available profiles. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s." else: - logger.info(f"[Worker {worker_id}] No tasks found for available profiles. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})") + base_log_msg = f"[Worker {worker_id}] No tasks found for available profiles. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s." 
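
(Illustrative sketch, not part of the patch.) In the download worker hunk below, the amount subtracted from the auth profile's pending-downloads counter is now derived from the info.json's `_ytops_metadata`: a `download_task_granularity` of `per_url` always counts as one, while `per_format` trusts the length of `formats_requested` when that key is present (even if it is empty) and falls back to one for older files that lack it. Condensed:

```python
from typing import Optional, Sequence, Union

def downloads_to_decrement(granularity: str,
                           formats_requested: Optional[Union[str, Sequence[str]]]) -> int:
    """How many pending downloads one finished task accounts for."""
    if granularity == "per_url":
        return 1
    if formats_requested is None:            # key absent: old info.json, assume one download
        return 1
    if isinstance(formats_requested, str):   # comma-separated selector string
        formats = [f.strip() for f in formats_requested.split(",") if f.strip()]
    else:
        formats = list(formats_requested)
    return len(formats)                      # may legitimately be 0

assert downloads_to_decrement("per_format", "137,140") == 2
assert downloads_to_decrement("per_url", ["137", "140"]) == 1
assert downloads_to_decrement("per_format", None) == 1
```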
+ + if base_log_msg == last_no_task_log_msg: + print(".", end="", file=sys.stderr, flush=True) + else: + if last_no_task_log_msg: + print(file=sys.stderr) # Newline to clean up after dots + + full_log_msg = f"{base_log_msg} (Streak: {no_task_streak})" + logger.info(full_log_msg) + last_no_task_log_msg = base_log_msg # --- End diagnostic logging --- time.sleep(polling_interval) if state_manager.shutdown_event.is_set(): continue @@ -83,6 +96,9 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m if claimed_task_path: no_task_streak = 0 # Reset streak + if last_no_task_log_msg: + print(file=sys.stderr) # Newline to clean up after dots + last_no_task_log_msg = "" # --- Read metadata before processing/deleting file --- try: @@ -91,6 +107,18 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m metadata = info_data.get('_ytops_metadata', {}) auth_profile_name = metadata.get('profile_name') auth_env = metadata.get('auth_env') + # Use .get() without a default to distinguish "not present" (None) from "present and empty" ([]). + raw_formats_from_metadata = metadata.get('formats_requested') + granularity_from_metadata = metadata.get('download_task_granularity', 'per_format') + + # Normalize to a list for simulation logic, while preserving the original raw state for decrement logic. + formats_from_metadata = raw_formats_from_metadata + if formats_from_metadata is None: + formats_from_metadata = [] + elif isinstance(formats_from_metadata, str): + formats_from_metadata = [f.strip() for f in formats_from_metadata.split(',')] + elif not isinstance(formats_from_metadata, list): + formats_from_metadata = [] except (IOError, json.JSONDecodeError) as e: logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.") continue # Skip to finally block to unlock profile @@ -152,57 +180,83 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m logger.info(f"[Worker {worker_id}] [{profile_name}] Processing task '{claimed_task_path.name}'...") if args.dummy or args.dummy_batch: - logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DIRECT DOWNLOAD ==========") + logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DOCKER DOWNLOAD PER-FORMAT SIMULATION ==========") logger.info(f"[Worker {worker_id}] Profile: {profile_name} | Task: {claimed_task_path.name}") - logger.info(f"[Worker {worker_id}] Would run command: {' '.join(shlex.quote(s) for s in cmd)}") - logger.info(f"[Worker {worker_id}] With environment: {custom_env}") + + formats_to_simulate = [] + # Prioritize formats from the info.json metadata if the key was present + if raw_formats_from_metadata is not None: + formats_to_simulate = formats_from_metadata + logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for {len(formats_to_simulate)} formats from info.json: {formats_to_simulate}") + else: + # Fallback to policy file if metadata key is absent + formats_config = d_policy.get('formats') + if isinstance(formats_config, str): + formats_to_simulate = [f.strip() for f in formats_config.split(',')] + elif isinstance(formats_config, list): + formats_to_simulate = formats_config + if not formats_to_simulate and raw_formats_from_metadata is None: + logger.info(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. 
Simulating a single download for backward compatibility.")
+                        formats_to_simulate = ['dummy_format']
+
+                    # If granularity was 'per_url', we only decrement by 1, even if multiple
+                    # format selectors were passed (e.g., as one string).
+                    if granularity_from_metadata == 'per_url':
+                        downloads_to_decrement = 1
+                    else:  # per_format or not specified
+                        if raw_formats_from_metadata is not None:
+                            # If key was present, trust the count from metadata. Can be 0.
+                            downloads_to_decrement = len(formats_from_metadata)
+                        else:
+                            # Key was absent. Decrement by the number of formats we are simulating
+                            # from policy, or 1 as a final fallback for old files.
+                            downloads_to_decrement = len(formats_to_simulate) if formats_to_simulate else 1
+
                     dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {})
                     min_seconds = dummy_settings.get('download_min_seconds', 0.5)
                     max_seconds = dummy_settings.get('download_max_seconds', 1.5)
                     failure_rate = dummy_settings.get('download_failure_rate', 0.0)
                     skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0)

-                    time.sleep(random.uniform(min_seconds, max_seconds))
+                    for i, format_id in enumerate(formats_to_simulate):
+                        logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for format '{format_id}'...")
+                        sim_duration = random.uniform(min_seconds, max_seconds)
+                        logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for {sim_duration:.2f}s (from policy range {min_seconds}-{max_seconds}s).")
+                        time.sleep(sim_duration)

-                    rand_val = random.random()
-                    should_fail_skipped = rand_val < skipped_rate
-                    should_fail_fatal = not should_fail_skipped and rand_val < (skipped_rate + failure_rate)
+                        rand_val = random.random()
+                        is_skipped = rand_val < skipped_rate
+                        is_fatal = not is_skipped and rand_val < (skipped_rate + failure_rate)
+
+                        if is_skipped:
+                            logger.warning(f"[Worker {worker_id}] DUMMY: Simulating skipped download for format '{format_id}'.")
+                            profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=True)
+                        elif is_fatal:
+                            logger.error(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure for format '{format_id}'.")
+                            profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=True)
+                        else:  # success
+                            logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success for format '{format_id}'.")
+                            profile_manager_instance.record_activity(profile_name, 'download', is_dummy=True)

-                    success = False
-                    details = ""
-                    error_type = None
-                    is_tolerated_error = False
-
-                    if should_fail_skipped:
-                        logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating skipped download failure for task '{claimed_task_path.name}'.")
-                        details = "Dummy skipped failure"
-                        error_type = "DummySkippedFailure"
-                        is_tolerated_error = True
-                        profile_manager_instance.record_activity(profile_name, 'tolerated_error')
-                    elif should_fail_fatal:
-                        logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal download failure for task '{claimed_task_path.name}'.")
-                        details = "Dummy fatal failure"
-                        error_type = "DummyFailure"
-                        profile_manager_instance.record_activity(profile_name, 'download_error')
-                    else:
-                        logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating download success for task '{claimed_task_path.name}'.")
-                        success = True
-                        details = "Dummy success"
-
profile_manager_instance.record_activity(profile_name, 'download') - - event = { - 'type': 'direct_download', - 'profile': profile_name, - 'proxy_url': locked_profile['proxy'], - 'success': success, - 'details': details, - 'error_type': error_type, - 'is_tolerated_error': is_tolerated_error - } - state_manager.log_event(event) - logger.info(f"========== [Worker {worker_id}] END DUMMY DIRECT DOWNLOAD ==========") + logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========") else: + # Determine downloads_to_decrement based on metadata + if granularity_from_metadata == 'per_url': + downloads_to_decrement = 1 + else: # per_format or not specified + if raw_formats_from_metadata is not None: + # If key was present, trust the count. It can be 0. + downloads_to_decrement = len(formats_from_metadata) + else: + # Key was absent, fall back to 1 for old info.jsons + downloads_to_decrement = 1 + # --- Real execution --- logger.info(f"[Worker {worker_id}] [{profile_name}] Running command: {' '.join(shlex.quote(s) for s in cmd)}") retcode, stdout, stderr = run_command( @@ -268,7 +322,13 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m if claimed_task_path and auth_profile_name and auth_env: auth_manager = get_auth_manager(profile_manager_instance, auth_env) if auth_manager: - auth_manager.decrement_pending_downloads(auth_profile_name) + if downloads_to_decrement > 0: + auth_manager.increment_pending_downloads(auth_profile_name, count=-downloads_to_decrement) + elif downloads_to_decrement == 0: + logger.warning(f"[Worker {worker_id}] `downloads_to_decrement` was zero. Pending downloads counter for '{auth_profile_name}' will not be changed.") + else: + # This shouldn't happen, but log it + logger.error(f"[Worker {worker_id}] `downloads_to_decrement` was negative: {downloads_to_decrement}") else: logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") elif claimed_task_path: diff --git a/ytops_client-source/ytops_client/stress_policy/queue_workers.py b/ytops_client-source/ytops_client/stress_policy/queue_workers.py index ee91930..8fd40de 100644 --- a/ytops_client-source/ytops_client/stress_policy/queue_workers.py +++ b/ytops_client-source/ytops_client/stress_policy/queue_workers.py @@ -20,6 +20,7 @@ from copy import deepcopy from datetime import datetime, timezone from pathlib import Path from typing import Dict, List, Optional, Any, Tuple, Union +from urllib.parse import urlparse, parse_qs from . import utils as sp_utils from .process_runners import run_command, run_docker_container, get_worker_id @@ -36,6 +37,7 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage exec_control = policy.get('execution_control', {}) gen_policy = policy.get('info_json_generation_policy', {}) queue_policy = policy.get('queue_policy', {}) + task_granularity = gen_policy.get('download_task_granularity', 'per_format') profile_prefix = gen_policy.get('profile_prefix') if not profile_prefix: @@ -54,35 +56,58 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage batch_size = queue_policy.get('batch_size', 1) logger.info(f"[Worker {worker_id}] Auth worker configured to process tasks in batches of {batch_size}.") task_counter = 0 + last_no_task_log_msg = "" while not state_manager.shutdown_event.is_set(): locked_profile = None tasks = [] try: - # 1. 
Lock a profile FIRST
-            locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix)
+            # 1. Get a batch of tasks from the queue FIRST
+            tasks = state_manager.get_auth_tasks_batch(batch_size)
+            if not tasks:
+                polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
+                base_log_msg = f"[Worker {worker_id}] No tasks available in queue, polling."
+                if base_log_msg == last_no_task_log_msg:
+                    print(".", end="", file=sys.stderr, flush=True)
+                else:
+                    if last_no_task_log_msg:
+                        print(file=sys.stderr)
+                    logger.info(base_log_msg)
+                    last_no_task_log_msg = base_log_msg
+                time.sleep(polling_interval)
+                continue
+
+            if last_no_task_log_msg:
+                print(file=sys.stderr)
+                last_no_task_log_msg = ""
+
+            # 2. Lock a profile, trying any of the specified prefixes
+            locked_profile = None
+            prefixes_to_try = []
+            if profile_prefix:
+                prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
+
+            if prefixes_to_try:
+                random.shuffle(prefixes_to_try)
+                for prefix in prefixes_to_try:
+                    locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
+                    if locked_profile:
+                        break
+            else:
+                # Fallback for empty/no prefix, which means lock any available profile
+                locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
+
             if not locked_profile:
                 polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
-                logger.debug(f"[Worker {worker_id}] No profiles available to lock. Sleeping for {polling_interval}s.")
+                logger.warning(f"[Worker {worker_id}] No profiles available for {len(tasks)} task(s). Re-queueing and sleeping for {polling_interval}s.")
+                state_manager.add_auth_tasks_batch(tasks)
                 time.sleep(polling_interval)
                 continue

             profile_name = locked_profile['name']
             proxy_url = locked_profile['proxy']

-            # 2. Get a batch of tasks from the queue
-            tasks = state_manager.get_auth_tasks_batch(batch_size)
-            if not tasks:
-                polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
-                logger.debug(f"[Worker {worker_id}] No tasks available for profile '{profile_name}'. Unlocking and sleeping for {polling_interval}s.")
-                # Unlock immediately since we have no work to do.
-                # No cooldown is applied here to make the profile available again quickly.
-                profile_manager_instance.unlock_profile(profile_name, owner=owner_id)
-                locked_profile = None  # To prevent double-unlock in finally
-                time.sleep(polling_interval)
-                continue
-
             logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' to process a batch of {len(tasks)} tasks.")

             # 3.
Process each task in the batch @@ -129,13 +154,34 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage stderr = "Dummy fatal failure" else: success = True - video_id = sp_utils.get_video_id(url) or f"dummy_{random.randint(1000, 9999)}" + # In dummy mode, robustly extract ID from URL to avoid 'unknown_video_id' + try: + # First, try the robust library function + video_id = sp_utils.get_video_id(url) + if not video_id and url: + # Fallback parsing for dummy URLs like "...?v=dummy_0099" + parsed_url = urlparse(url) + video_id = parse_qs(parsed_url.query).get('v', [None])[0] + + # Additional fallback for URLs like "youtube.com/watch?v=dummy_0028" + if not video_id and 'dummy_' in url: + dummy_match = re.search(r'dummy_\d+', url) + if dummy_match: + video_id = dummy_match.group(0) + except Exception: + video_id = None # Ensure video_id is None on parsing failure + + # If all extraction methods fail, use the task_id as a reliable fallback. + if not video_id: + logger.debug(f"Could not extract video ID from URL '{url}', using task ID '{task_id}' for dummy data.") + video_id = task_id + info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]} else: client, req_params = state_manager.get_client_for_request(profile_name, gen_policy) cmd = [ sys.executable, '-m', 'ytops_client.cli', 'get-info', - '--client', client, '--profile', profile_name + '--client', client or '', '--profile', profile_name ] if proxy_url: cmd.extend(['--proxy', proxy_url]) @@ -173,61 +219,143 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage if success and info_data: try: auth_env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') - info_data['_ytops_metadata'] = { - 'profile_name': profile_name, 'proxy_url': proxy_url, - 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), - 'task_id': task_id, 'url': url, 'auth_env': auth_env_name - } - - final_path = None - if save_dir: - video_id = info_data.get('id', 'unknown') - sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy' - new_name = f"{video_id}-{profile_name}-{sanitized_proxy}.json" - final_path = os.path.join(save_dir, new_name) - with open(final_path, 'w', encoding='utf-8') as f: - json.dump(info_data, f, indent=2) - logger.info(f"[Worker {worker_id}] [{profile_name}] Saved info.json to '{final_path}'") - else: - # This case means auth-only (no downloads) and no save_dir specified. - # The info.json is not persisted. - logger.debug(f"[Worker {worker_id}] [{profile_name}] Auth-only task succeeded. No save_dir, so info.json is not saved.") - - profile_manager_instance.record_activity(profile_name, 'success') + is_dummy_run = args.dummy or args.dummy_batch + # --- 1. 
Determine which formats will be requested --- formats_to_download = queue_policy.get('formats_to_download') - download_tasks = [] + requested_formats_list = [] if formats_to_download: - task_formats = [] - if formats_to_download == 'all': - task_formats = [f['format_id'] for f in info_data.get('formats', [])] - elif isinstance(formats_to_download, list): - task_formats = formats_to_download - else: - task_formats = [str(formats_to_download)] - - for format_id in task_formats: - download_tasks.append({ + if task_granularity == 'per_url': + format_selector = formats_to_download + if isinstance(formats_to_download, list): + format_selector = ",".join(formats_to_download) + requested_formats_list = [format_selector] + else: # per_format + formats_source = formats_to_download + if formats_source == 'from_download_policy': + formats_source = policy.get('download_policy', {}).get('formats') + + if formats_source == 'all': + requested_formats_list = [f['format_id'] for f in info_data.get('formats', [])] + elif isinstance(formats_source, list): + requested_formats_list = formats_source + elif isinstance(formats_source, str): + # A comma-separated string of format groups + requested_formats_list = [f.strip() for f in formats_source.split(',')] + elif formats_source: + # Handles integer format IDs or other non-string/list values + requested_formats_list = [str(formats_source)] + else: + # formats_source is None or empty + requested_formats_list = [] + + # --- 2. Create download tasks and save info.json(s) based on granularity --- + download_tasks_to_create = [] + created_file_paths = [] + + if task_granularity == 'per_format' and requested_formats_list: + for i, format_id in enumerate(requested_formats_list): + # Deepcopy to avoid modifying the original info_data in the loop + task_info_data = deepcopy(info_data) + + metadata = { + 'profile_name': profile_name, 'proxy_url': proxy_url, + 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), + 'task_id': task_id, 'url': url, 'auth_env': auth_env_name, + 'formats_requested': [format_id], # This task is for ONE format + 'download_task_granularity': task_granularity + } + if is_dummy_run: metadata['_dummy'] = True + task_info_data['_ytops_metadata'] = metadata + + final_path = None + if save_dir: + video_id = task_info_data.get('id') or task_id or 'unknown_video_id' + sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy' + # Sanitize format_id for use in filename + sanitized_format = re.sub(r'[^a-zA-Z0-9_-]', '_', format_id) + # Add index `i` to guarantee uniqueness even if sanitized format IDs clash + new_name = f"{video_id}-{profile_name}-{sanitized_proxy}-fmt{i}_{sanitized_format}.info.json" + final_path = os.path.join(save_dir, new_name) + with open(final_path, 'w', encoding='utf-8') as f: + json.dump(task_info_data, f, indent=2) + logger.info(f"[Worker {worker_id}] [{profile_name}] Saved format-specific info.json to '{final_path}'") + created_file_paths.append(final_path) + + download_tasks_to_create.append({ 'info_json_path': final_path, 'format_id': format_id, - 'video_id': info_data.get('id'), 'url': url, + 'video_id': task_info_data.get('id'), 'url': url, 'auth_profile_name': profile_name, 'proxy_url': proxy_url, 'auth_env': auth_env_name, 'original_task': task }) + else: # per_url or default (including per_format with no formats) + metadata = { + 'profile_name': profile_name, 'proxy_url': proxy_url, + 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), + 'task_id': task_id, 'url': url, 
'auth_env': auth_env_name, + 'formats_requested': requested_formats_list, + 'download_task_granularity': task_granularity + } + if is_dummy_run: metadata['_dummy'] = True + info_data['_ytops_metadata'] = metadata + + final_path = None + if save_dir: + video_id = info_data.get('id') or task_id or 'unknown_video_id' + sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy' + new_name = f"{video_id}-{profile_name}-{sanitized_proxy}.info.json" + final_path = os.path.join(save_dir, new_name) + with open(final_path, 'w', encoding='utf-8') as f: + json.dump(info_data, f, indent=2) + logger.info(f"[Worker {worker_id}] [{profile_name}] Saved info.json to '{final_path}'") + created_file_paths.append(final_path) + + # For per_url, requested_formats_list should contain a single format selector string. + # The loop will run once, creating one task. + if requested_formats_list: + for format_selector in requested_formats_list: + download_tasks_to_create.append({ + 'info_json_path': final_path, 'format_id': format_selector, + 'video_id': info_data.get('id'), 'url': url, + 'auth_profile_name': profile_name, 'proxy_url': proxy_url, + 'auth_env': auth_env_name, + 'original_task': task + }) + else: # Handle the case where requested_formats_list is empty + logger.debug(f"[Worker {worker_id}] [{profile_name}] No formats requested, no download tasks created.") + + + profile_manager_instance.record_activity(profile_name, 'success', is_dummy=(args.dummy or args.dummy_batch)) - if download_tasks: - added_count = state_manager.add_download_tasks_batch(download_tasks) - logger.info(f"[Worker {worker_id}] [{profile_name}] Added {added_count} download tasks to queue") - profile_manager_instance.increment_pending_downloads(profile_name, count=added_count) + # --- 3. 
Dispatch download tasks --- + create_download_tasks = queue_policy.get('create_download_tasks', True) + if download_tasks_to_create: + num_tasks_to_process = 0 + if create_download_tasks: + added_count = state_manager.add_download_tasks_batch(download_tasks_to_create) + logger.info(f"[Worker {worker_id}] [{profile_name}] Added {added_count} download tasks to queue") + num_tasks_to_process = added_count + else: + num_tasks = len(download_tasks_to_create) + logger.info(f"[Worker {worker_id}] [{profile_name}] File-based workflow: created {num_tasks} tasks (no queue tasks created)") + num_tasks_to_process = num_tasks + + if num_tasks_to_process > 0: + profile_manager_instance.increment_pending_downloads(profile_name, count=num_tasks_to_process) + profile_manager_instance.record_predicted_downloads(profile_name, count=num_tasks_to_process, is_dummy=(args.dummy or args.dummy_batch)) - state_manager.report_auth_success(task_id, { + report_payload = { "url": url, "video_id": info_data.get('id'), "profile_name": profile_name, - "proxy_url": proxy_url, "info_json_path": final_path, - "download_tasks_created": len(download_tasks) - }) + "proxy_url": proxy_url, "info_json_path": created_file_paths[0] if len(created_file_paths) == 1 else created_file_paths, + "download_tasks_created": len(download_tasks_to_create) + } + if is_dummy_run: + report_payload['_dummy'] = True + state_manager.report_auth_success(task_id, report_payload) except Exception as e: logger.error(f"[Worker {worker_id}] [{profile_name}] Error processing successful auth result: {e}", exc_info=True) - profile_manager_instance.record_activity(profile_name, 'failure') + profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch)) state_manager.report_auth_failure(task_id, {"error": f"Error processing info.json: {str(e)}", "url": url}) else: is_bot_error = "Sign in to confirm you're not a bot" in stderr @@ -240,19 +368,19 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage if is_unavailable or is_private or is_deleted or is_dummy_skipped: reason = "Video unavailable" if is_unavailable else "Private video" if is_private else "Video removed" if is_deleted else "Dummy skipped" logger.warning(f"[Worker {worker_id}] [{profile_name}] Auth skipped for {url}: {reason}") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch)) state_manager.report_auth_skipped(task_id, {"url": url, "reason": reason, "stderr": stderr}) else: error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else "Dummy fatal failure" if "Dummy fatal failure" in stderr else f"Exit code {retcode}" logger.error(f"[Worker {worker_id}] [{profile_name}] Authentication failed ({error_type}): {url}") - profile_manager_instance.record_activity(profile_name, 'failure') + profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch)) state_manager.report_auth_failure(task_id, {"url": url, "error_type": error_type, "stderr": stderr, "exit_code": retcode}) except Exception as e: logger.error(f"[Worker {worker_id}] [{profile_name}] Unexpected error processing task {task_id}: {e}", exc_info=True) if task_id: state_manager.report_auth_failure(task_id, {"error": f"Unexpected error: {str(e)}", "url": url or "unknown"}) - profile_manager_instance.record_activity(profile_name, 'failure') + 
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch)) finally: if temp_task_dir and os.path.exists(temp_task_dir): shutil.rmtree(temp_task_dir) @@ -262,7 +390,7 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage except Exception as e: logger.error(f"[Worker {worker_id}] Unexpected error in outer worker loop: {e}", exc_info=True) if locked_profile: - profile_manager_instance.record_activity(locked_profile['name'], 'failure') + profile_manager_instance.record_activity(locked_profile['name'], 'failure', is_dummy=(args.dummy or args.dummy_batch)) finally: if locked_profile: @@ -276,16 +404,27 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage elif isinstance(val, int): cooldown = val except (json.JSONDecodeError, TypeError): - if cooldown_config.isdigit(): + if isinstance(cooldown_config, str) and cooldown_config.isdigit(): cooldown = int(cooldown_config) - if cooldown: - logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") + # For auth simulation, check if profile has pending downloads + # If so, don't apply cooldown to avoid conflicting with enforcer + profile_data = profile_manager_instance.get_profile(locked_profile['name']) + should_apply_cooldown = cooldown + if profile_data: + pending_downloads = profile_manager_instance.get_pending_downloads(locked_profile['name']) + rest_reason = profile_data.get('rest_reason') + if pending_downloads > 0 or rest_reason == 'waiting_downloads': + should_apply_cooldown = None + logger.info(f"[Worker {worker_id}] Auth profile '{locked_profile['name']}' has pending downloads or is waiting for downloads. Not applying cooldown.") + + if should_apply_cooldown: + logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {should_apply_cooldown}s.") profile_manager_instance.unlock_profile( locked_profile['name'], owner=owner_id, - rest_for_seconds=cooldown + rest_for_seconds=should_apply_cooldown ) logger.info(f"[Worker {worker_id}] Queue auth worker exiting.") @@ -311,6 +450,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma logger.info(f"[Worker {worker_id}] Will save downloads to '{output_dir}'") task_counter = 0 + last_no_task_log_msg = "" while not state_manager.shutdown_event.is_set(): locked_profile = None @@ -322,7 +462,14 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma task = state_manager.get_download_task() if not task: polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - logger.debug(f"[Worker {worker_id}] No download tasks available in queue. Sleeping for {polling_interval}s.") + base_log_msg = f"[Worker {worker_id}] No download tasks available in queue." + if base_log_msg == last_no_task_log_msg: + print(".", end="", file=sys.stderr, flush=True) + else: + if last_no_task_log_msg: + print(file=sys.stderr) + logger.debug(f"{base_log_msg} Sleeping for {polling_interval}s.") + last_no_task_log_msg = base_log_msg time.sleep(polling_interval) continue @@ -377,20 +524,45 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma if specific_profile: locked_profile = profile_manager_instance.lock_profile(owner=owner_id, specific_profile_name=specific_profile) if not locked_profile: - logger.warning(f"[Worker {worker_id}] Could not lock specific profile '{specific_profile}'. 
Trying any profile with prefix.") - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) - else: - locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) + logger.warning(f"[Worker {worker_id}] Could not lock specific profile '{specific_profile}'. Trying any profile from configured prefixes.") + + if not locked_profile: + # Lock from any of the specified prefixes + prefixes_to_try = [] + if profile_prefix: + prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()] + + if prefixes_to_try: + random.shuffle(prefixes_to_try) + for prefix in prefixes_to_try: + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix) + if locked_profile: + break + + if not locked_profile: + # Fallback for empty/no prefix, or if no profile from list was lockable + locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None) if not locked_profile: polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - logger.warning(f"[Worker {worker_id}] No profiles available for task {task_id}. Re-queueing and sleeping for {polling_interval}s.") + base_log_msg = f"[Worker {worker_id}] No profiles available, polling." + if base_log_msg == last_no_task_log_msg: + print(".", end="", file=sys.stderr, flush=True) + else: + if last_no_task_log_msg: + print(file=sys.stderr) + logger.info(f"{base_log_msg} Re-queueing task {task_id} and sleeping for {polling_interval}s.") + last_no_task_log_msg = base_log_msg + # Re-queue the task by adding it back to the inbox. state_manager.add_download_tasks_batch([task]) # The 'in_progress' marker for this attempt will be cleaned up by the finally block. time.sleep(polling_interval) continue + if last_no_task_log_msg: + print(file=sys.stderr) + last_no_task_log_msg = "" profile_name = locked_profile['name'] proxy_url = locked_profile['proxy'] logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' with proxy '{proxy_url}'") @@ -480,7 +652,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma except Exception as e: logger.error(f"[Worker {worker_id}] Failed to move files to Airflow-ready directory: {e}", exc_info=True) - profile_manager_instance.record_activity(profile_name, 'download_success') + profile_manager_instance.record_activity(profile_name, 'download', is_dummy=(args.dummy or args.dummy_batch)) state_manager.report_download_success(task_id, { "video_id": video_id, "url": url, "format_id": format_id, "profile_name": profile_name, "proxy_url": proxy_url, @@ -508,7 +680,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma if is_unavailable or is_format_error: logger.warning(f"[Worker {worker_id}] Download skipped: {video_id or url} format {format_id}") - profile_manager_instance.record_activity(profile_name, 'tolerated_error') + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch)) state_manager.report_download_skipped(task_id, { "video_id": video_id, "url": url, "format_id": format_id, "reason": "Video unavailable" if is_unavailable else "Format not available", "stderr": stderr @@ -516,23 +688,13 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma else: error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else f"Exit code {retcode}" logger.error(f"[Worker {worker_id}] Download failed 
({error_type}): {video_id or url} format {format_id}") - profile_manager_instance.record_activity(profile_name, 'download_error') + profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=(args.dummy or args.dummy_batch)) state_manager.report_download_failure(task_id, { "video_id": video_id, "url": url, "format_id": format_id, "error_type": error_type, "stderr": stderr, "exit_code": retcode, "original_task": task }) - # Decrement pending downloads counter on the original auth profile, regardless of outcome - if auth_profile_name and auth_env: - auth_manager = get_auth_manager(profile_manager_instance, auth_env) - if auth_manager: - auth_manager.decrement_pending_downloads(auth_profile_name) - else: - logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") - elif task: # Only warn if we had a task but couldn't get metadata - logger.warning(f"Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented.") - except Exception as e: logger.error(f"[Worker {worker_id}] Unexpected error: {e}", exc_info=True) if task and task_id: @@ -543,9 +705,22 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma "error": f"Unexpected error: {str(e)}", "original_task": task }) if locked_profile: - profile_manager_instance.record_activity(locked_profile['name'], 'download_error') + profile_manager_instance.record_activity(locked_profile['name'], 'download_error', is_dummy=(args.dummy or args.dummy_batch)) finally: + # Decrement pending downloads counter on the original auth profile, regardless of outcome. + # This is critical for releasing the auth profile from the 'waiting_downloads' state. + if task: # Only attempt to decrement if a task was actually pulled. + if auth_profile_name and auth_env: + auth_manager = get_auth_manager(profile_manager_instance, auth_env) + if auth_manager: + new_count = auth_manager.decrement_pending_downloads(auth_profile_name) + logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}'. New count: {new_count}") + else: + logger.error(f"[Worker {worker_id}] Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") + else: + logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in task metadata. 
Pending downloads counter will not be decremented.") + if locked_profile: cooldown = None cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds') diff --git a/ytops_client-source/ytops_client/stress_policy/state_manager.py b/ytops_client-source/ytops_client/stress_policy/state_manager.py index af139c7..21b1bf4 100644 --- a/ytops_client-source/ytops_client/stress_policy/state_manager.py +++ b/ytops_client-source/ytops_client/stress_policy/state_manager.py @@ -904,6 +904,22 @@ class StateManager: return self.queue_provider.remove_in_progress(self.queue_provider.AUTH_PROGRESS, task_id) + def add_auth_task(self, task: Dict) -> bool: + """Add an authentication task to the queue.""" + if not self.queue_provider: + logger.error("Queue provider not initialized") + return False + + return self.queue_provider.add_task(self.queue_provider.AUTH_INBOX, task) + + def add_auth_tasks_batch(self, tasks: List[Dict]) -> int: + """Add a batch of authentication tasks to the queue.""" + if not self.queue_provider: + logger.error("Queue provider not initialized") + return 0 + + return self.queue_provider.add_tasks_batch(self.queue_provider.AUTH_INBOX, tasks) + def add_download_task(self, task: Dict) -> bool: """Add a download task to the queue.""" if not self.queue_provider: diff --git a/ytops_client-source/ytops_client/stress_policy/throughput_worker.py b/ytops_client-source/ytops_client/stress_policy/throughput_worker.py index 25bcb95..74edd2d 100644 --- a/ytops_client-source/ytops_client/stress_policy/throughput_worker.py +++ b/ytops_client-source/ytops_client/stress_policy/throughput_worker.py @@ -40,6 +40,7 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage return [] no_task_streak = 0 + last_no_task_log_msg = "" while not state_manager.shutdown_event.is_set(): locked_profile = None @@ -48,7 +49,15 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage # 0. If no tasks were found previously, pause briefly. if no_task_streak > 0: polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - logger.info(f"[Worker {worker_id}] No tasks found in previous attempt(s). Pausing for {polling_interval}s. (Streak: {no_task_streak})") + base_log_msg = f"[Worker {worker_id}] No available tasks found for any active profiles." + if base_log_msg == last_no_task_log_msg: + print(".", end="", file=sys.stderr, flush=True) + else: + if last_no_task_log_msg: + print(file=sys.stderr) + full_log_msg = f"{base_log_msg} Pausing for {polling_interval}s. (Streak: {no_task_streak})" + logger.info(full_log_msg) + last_no_task_log_msg = base_log_msg time.sleep(polling_interval) if state_manager.shutdown_event.is_set(): continue @@ -60,9 +69,6 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage if not locked_profile: # No task/profile combo was available. no_task_streak += 1 - polling_interval = exec_control.get('worker_polling_interval_seconds', 1) - logger.info(f"[Worker {worker_id}] No available tasks found for any active profiles. Pausing for {polling_interval}s.") - time.sleep(polling_interval) continue profile_name = locked_profile['name'] @@ -70,6 +76,9 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage # We have a task and a lock. if claimed_task_path: no_task_streak = 0 # Reset streak + if last_no_task_log_msg: + print(file=sys.stderr) + last_no_task_log_msg = "" # 3. 
Process the task try: with open(claimed_task_path, 'r', encoding='utf-8') as f: @@ -155,4 +164,4 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage # immediately try to lock a new profile to maximize throughput. logger.info(f"[Worker {worker_id}] Worker loop finished.") - return [] # This function doesn't return results directly \ No newline at end of file + return [] # This function doesn't return results directly diff --git a/ytops_client-source/ytops_client/stress_policy/utils.py b/ytops_client-source/ytops_client/stress_policy/utils.py index a9f1932..1c9d521 100644 --- a/ytops_client-source/ytops_client/stress_policy/utils.py +++ b/ytops_client-source/ytops_client/stress_policy/utils.py @@ -319,15 +319,17 @@ def apply_overrides(policy, overrides): return policy -def display_effective_policy(policy, name, sources=None, profile_names=None, original_workers_setting=None): +def display_effective_policy(policy, name, args, sources=None, profile_names=None, original_workers_setting=None): """Prints a human-readable summary of the effective policy.""" logger = logging.getLogger(__name__) logger.info(f"--- Effective Policy: {name} ---") settings = policy.get('settings', {}) exec_control = policy.get('execution_control', {}) orchestration_mode = settings.get('orchestration_mode') - + logger.info(f"Mode: {settings.get('mode', 'full_stack')}") + if args and args.profile_prefix: + logger.info(f"Profile Prefix (from CLI): {args.profile_prefix}") if profile_names: num_profiles = len(profile_names) logger.info(f"Profiles found: {num_profiles}") diff --git a/ytops_client-source/ytops_client/stress_policy/worker_utils.py b/ytops_client-source/ytops_client/stress_policy/worker_utils.py index ec1e63a..182eeef 100644 --- a/ytops_client-source/ytops_client/stress_policy/worker_utils.py +++ b/ytops_client-source/ytops_client/stress_policy/worker_utils.py @@ -188,7 +188,7 @@ def find_task_and_lock_profile(profile_manager, owner_id, profile_prefix, policy if not info_json_dir: return None, None - logger.info(f"[Worker {worker_id}] Scanning for tasks in '{info_json_dir}'...") + logger.debug(f"[Worker {worker_id}] Scanning for tasks in '{info_json_dir}'...") # 1. Get all available task files and group by profile try: @@ -223,7 +223,21 @@ def find_task_and_lock_profile(profile_manager, owner_id, profile_prefix, policy # 2. Get ACTIVE profiles from Redis. active_profiles = profile_manager.list_profiles(state_filter='ACTIVE') - active_profile_names = {p['name'] for p in active_profiles if p['name'].startswith(profile_prefix or '')} + + prefixes_to_check = [] + if profile_prefix: + prefixes_to_check = [p.strip() for p in profile_prefix.split(',') if p.strip()] + + active_profile_names = set() + if prefixes_to_check: + for p in active_profiles: + for prefix in prefixes_to_check: + if p['name'].startswith(prefix): + active_profile_names.add(p['name']) + break + else: + # If no prefixes are specified, consider all active profiles. + active_profile_names = {p['name'] for p in active_profiles} # 3. Find profiles that are both ACTIVE and have tasks. 
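+    # Note: profile_prefix may be a comma-separated list; a profile is considered in-scope if its
+    # name starts with ANY of the listed prefixes, and an empty/missing prefix matches every
+    # ACTIVE profile. Only profiles that also have at least one claimable task file survive the
+    # intersection below.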
candidate_profiles = list(active_profile_names.intersection(tasks_by_profile.keys())) @@ -918,7 +932,8 @@ def process_info_json_cycle(path, content, policy, state_manager, args, running_ result['proxy_url'] = proxy_url if profile_manager_instance and profile_name: - profile_manager_instance.record_activity(profile_name, 'tolerated_error') + is_dummy = args.dummy or args.dummy_batch + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy) state_manager.log_event(result) results.append(result) @@ -960,14 +975,15 @@ def process_info_json_cycle(path, content, policy, state_manager, args, running_ # --- Record activity on the DOWNLOAD profile that performed the work --- if profile_manager_instance and profile_name: + is_dummy = args.dummy or args.dummy_batch if result.get('success'): - profile_manager_instance.record_activity(profile_name, 'download') + profile_manager_instance.record_activity(profile_name, 'download', is_dummy=is_dummy) elif result.get('error_type') == 'Cancelled': pass # Do not record cancellations elif result.get('is_tolerated_error'): - profile_manager_instance.record_activity(profile_name, 'tolerated_error') + profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy) else: - profile_manager_instance.record_activity(profile_name, 'download_error') + profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=is_dummy) state_manager.log_event(result) results.append(result) diff --git a/ytops_client-source/ytops_client/stress_policy_tool.py b/ytops_client-source/ytops_client/stress_policy_tool.py index afed41c..cfc9463 100644 --- a/ytops_client-source/ytops_client/stress_policy_tool.py +++ b/ytops_client-source/ytops_client/stress_policy_tool.py @@ -495,7 +495,7 @@ def main_stress_policy(args): exec_control['workers'] = calculated_workers logger.info(f"Calculated 'auto' workers for throughput mode: {calculated_workers} (based on {len(matching_profiles)} profiles with prefix '{profile_prefix}').") - sp_utils.display_effective_policy(policy, policy_name, sources=[], original_workers_setting=original_workers_setting) + sp_utils.display_effective_policy(policy, policy_name, args, sources=[], original_workers_setting=original_workers_setting) if args.dry_run: return 0 workers = exec_control.get('workers', 1) @@ -578,15 +578,45 @@ def main_stress_policy(args): logger.info(f"Starting/resuming from URL index {start_index + 1}.") # The worker's get_next_url_batch will respect this starting index. 
- sp_utils.display_effective_policy(policy, policy_name, sources=urls_list) + sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list) if args.dry_run: return 0 - workers = exec_control.get('workers', 1) - with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = [ - executor.submit(run_direct_batch_worker, i, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) - for i in range(workers) - ] + worker_pools = exec_control.get('worker_pools', []) + if not worker_pools and exec_control.get('workers'): + prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix') + worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'}) + + if not worker_pools: + logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).") + return 1 + + if args.profile_prefix: + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + for pool in worker_pools: + pool['profile_prefix'] = args.profile_prefix + + worker_specs = [] + worker_id_counter = 0 + for pool in worker_pools: + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] + if not prefixes: + logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") + continue + + for i in range(pool_workers): + assigned_prefix = prefixes[i % len(prefixes)] + worker_policy = deepcopy(policy) + worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = assigned_prefix + worker_specs.append({ + 'func': run_direct_batch_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) + }) + worker_id_counter += 1 + + with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: + futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] # Wait for all workers to complete. They will exit their loops when no URLs are left. 
concurrent.futures.wait(futures) if shutdown_event.is_set(): @@ -662,14 +692,46 @@ def main_stress_policy(args): if start_index > 0: logger.info(f"Starting/resuming from URL index {start_index + 1}.") - sp_utils.display_effective_policy(policy, policy_name, sources=urls_list) + sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list) if args.dry_run: return 0 - with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = [ - executor.submit(run_direct_docker_worker, i, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) - for i in range(workers) - ] + worker_pools = exec_control.get('worker_pools', []) + if not worker_pools and exec_control.get('workers'): + prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix') + worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'}) + + if not worker_pools: + logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).") + return 1 + + if args.profile_prefix: + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + for pool in worker_pools: + pool['profile_prefix'] = args.profile_prefix + + worker_specs = [] + worker_id_counter = 0 + for pool in worker_pools: + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] + if not prefixes: + logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") + continue + + # Each worker in the pool gets the full list of prefixes from the pool configuration. + for i in range(pool_workers): + worker_policy = deepcopy(policy) + # The worker functions will now handle a comma-separated list of prefixes. 
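+            # Illustrative example (prefix names are placeholders): a pool entry such as
+            #   {'workers': 2, 'profile_prefix': 'user_a,user_b'}
+            # yields two workers that each receive the full string 'user_a,user_b' and may lock a
+            # profile from either prefix at runtime, unlike the batch mode above, which assigns
+            # exactly one prefix per worker in round-robin order.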
+ worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str + worker_specs.append({ + 'func': run_direct_docker_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock) + }) + worker_id_counter += 1 + + with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: + futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] # This worker runs until shutdown, like the download worker shutdown_event.wait() logger.info("Shutdown signal received, waiting for direct docker workers to finish...") @@ -686,14 +748,46 @@ def main_stress_policy(args): logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}") return 1 - sp_utils.display_effective_policy(policy, policy_name, sources=[]) + sp_utils.display_effective_policy(policy, policy_name, args, sources=[]) if args.dry_run: return 0 - with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = [ - executor.submit(run_direct_docker_download_worker, i, policy, state_manager, args, profile_manager_instance, running_processes, process_lock) - for i in range(workers) - ] + worker_pools = exec_control.get('worker_pools', []) + if not worker_pools and exec_control.get('workers'): + prefix = policy.get('download_policy', {}).get('profile_prefix') + worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'}) + + if not worker_pools: + logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).") + return 1 + + if args.profile_prefix: + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + for pool in worker_pools: + pool['profile_prefix'] = args.profile_prefix + + worker_specs = [] + worker_id_counter = 0 + for pool in worker_pools: + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] + if not prefixes: + logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") + continue + + # Each worker in the pool gets the full list of prefixes from the pool configuration. + for i in range(pool_workers): + worker_policy = deepcopy(policy) + # The worker functions will now handle a comma-separated list of prefixes. 
+ worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str + worker_specs.append({ + 'func': run_direct_docker_download_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, running_processes, process_lock) + }) + worker_id_counter += 1 + + with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: + futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] # This worker runs until shutdown shutdown_event.wait() logger.info("Shutdown signal received, waiting for direct docker download workers to finish...") @@ -726,15 +820,46 @@ def main_stress_policy(args): logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}") return 1 - sp_utils.display_effective_policy(policy, policy_name, sources=[]) + sp_utils.display_effective_policy(policy, policy_name, args, sources=[]) if args.dry_run: return 0 - workers = exec_control.get('workers', 1) - with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = [ - executor.submit(run_direct_download_worker, i, policy, state_manager, args, download_manager, running_processes, process_lock) - for i in range(workers) - ] + worker_pools = exec_control.get('worker_pools', []) + if not worker_pools and exec_control.get('workers'): + prefix = policy.get('download_policy', {}).get('profile_prefix') + worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'}) + + if not worker_pools: + logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).") + return 1 + + if args.profile_prefix: + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + for pool in worker_pools: + pool['profile_prefix'] = args.profile_prefix + + worker_specs = [] + worker_id_counter = 0 + for pool in worker_pools: + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] + if not prefixes: + logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") + continue + + # Each worker in the pool gets the full list of prefixes from the pool configuration. + for i in range(pool_workers): + worker_policy = deepcopy(policy) + # The worker functions will now handle a comma-separated list of prefixes. + worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str + worker_specs.append({ + 'func': run_direct_download_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, download_manager, running_processes, process_lock) + }) + worker_id_counter += 1 + + with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: + futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] shutdown_event.wait() logger.info("Shutdown signal received, waiting for direct download workers to finish...") concurrent.futures.wait(futures) @@ -786,6 +911,15 @@ def main_stress_policy(args): logger.error(f"Failed to create save directory '{save_dir}': {e}") return 1 + # In dummy mode, do not populate the queue. Warn the user instead. + if args.dummy or args.dummy_batch: + input_queue = queue_policy.get('input_queue', 'queue2_auth_inbox') + logger.warning("--- Dummy Mode Notice ---") + logger.warning(f"Dummy mode is enabled. 
The tool will NOT automatically populate the queue.") + logger.warning(f"Please ensure the input queue '{input_queue}' contains tasks for the workers to process.") + logger.warning(f"You can populate it using another tool, for example: ./bin/ytops-client queue push {input_queue} --payload-json '{{\"url\":\"https://youtu.be/dQw4w9WgXcQ\"}}' --count 100") + logger.warning("-------------------------") + # Requeue failed tasks if requested if args.requeue_failed: requeued = state_manager.requeue_failed_auth_tasks( @@ -793,15 +927,46 @@ def main_stress_policy(args): ) logger.info(f"Requeued {requeued} failed authentication tasks.") - sp_utils.display_effective_policy(policy, policy_name, sources=[]) + sp_utils.display_effective_policy(policy, policy_name, args, sources=[]) if args.dry_run: return 0 - workers = exec_control.get('workers', 1) - with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = [ - executor.submit(run_queue_auth_worker, i, policy, state_manager, args, auth_manager, running_processes, process_lock) - for i in range(workers) - ] + worker_pools = exec_control.get('worker_pools', []) + if not worker_pools and exec_control.get('workers'): + prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix') + worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'}) + + if not worker_pools: + logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).") + return 1 + + if args.profile_prefix: + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + for pool in worker_pools: + pool['profile_prefix'] = args.profile_prefix + + worker_specs = [] + worker_id_counter = 0 + for pool in worker_pools: + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] + if not prefixes: + logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") + continue + + # Each worker in the pool gets the full list of prefixes from the pool configuration. + for i in range(pool_workers): + worker_policy = deepcopy(policy) + # The worker functions will now handle a comma-separated list of prefixes. + worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str + worker_specs.append({ + 'func': run_queue_auth_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, auth_manager, running_processes, process_lock) + }) + worker_id_counter += 1 + + with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: + futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] shutdown_event.wait() logger.info("Shutdown signal received, waiting for queue auth workers to finish...") concurrent.futures.wait(futures) @@ -843,6 +1008,17 @@ def main_stress_policy(args): env_prefix=env_prefix ) + # In dummy mode, do not populate the queue. Warn the user instead. + if args.dummy or args.dummy_batch: + input_queue = queue_policy.get('input_queue', 'queue2_dl_inbox') + logger.warning("--- Dummy Mode Notice ---") + logger.warning(f"Dummy mode is enabled. 
The tool will NOT automatically populate the queue.") + logger.warning(f"Please ensure the input queue '{input_queue}' contains tasks for the workers to process.") + logger.warning("-------------------------") + + # Get download policy + d_policy = policy.get('download_policy', {}) + # Create output directory if specified output_dir = d_policy.get('output_dir') if output_dir: @@ -860,15 +1036,46 @@ def main_stress_policy(args): ) logger.info(f"Requeued {requeued} failed download tasks.") - sp_utils.display_effective_policy(policy, policy_name, sources=[]) + sp_utils.display_effective_policy(policy, policy_name, args, sources=[]) if args.dry_run: return 0 - workers = exec_control.get('workers', 1) - with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - futures = [ - executor.submit(run_queue_download_worker, i, policy, state_manager, args, download_manager, running_processes, process_lock) - for i in range(workers) - ] + worker_pools = exec_control.get('worker_pools', []) + if not worker_pools and exec_control.get('workers'): + prefix = policy.get('download_policy', {}).get('profile_prefix') + worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'}) + + if not worker_pools: + logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).") + return 1 + + if args.profile_prefix: + logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.") + for pool in worker_pools: + pool['profile_prefix'] = args.profile_prefix + + worker_specs = [] + worker_id_counter = 0 + for pool in worker_pools: + pool_workers = pool.get('workers', 1) + prefix_str = pool.get('profile_prefix', '') + prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()] + if not prefixes: + logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}") + continue + + # Each worker in the pool gets the full list of prefixes from the pool configuration. + for i in range(pool_workers): + worker_policy = deepcopy(policy) + # The worker functions will now handle a comma-separated list of prefixes. + worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str + worker_specs.append({ + 'func': run_queue_download_worker, + 'args': (worker_id_counter, worker_policy, state_manager, args, download_manager, running_processes, process_lock) + }) + worker_id_counter += 1 + + with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor: + futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs] shutdown_event.wait() logger.info("Shutdown signal received, waiting for queue download workers to finish...") concurrent.futures.wait(futures) @@ -949,7 +1156,7 @@ def main_stress_policy(args): ) logger.info(f"Requeued {requeued_auth} failed authentication tasks and {requeued_dl} failed download tasks.") - sp_utils.display_effective_policy(policy, policy_name, sources=[]) + sp_utils.display_effective_policy(policy, policy_name, args, sources=[]) if args.dry_run: return 0 # Start both auth and download workers @@ -1127,6 +1334,7 @@ def main_stress_policy(args): sp_utils.display_effective_policy( policy, policy_name, + args, sources=sources, profile_names=None, # Profile grouping is removed original_workers_setting=original_workers_setting @@ -1254,12 +1462,22 @@ def main_stress_policy(args): # If marking by rename is on, do that. 
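+            # Illustrative settings snippet (path is a placeholder): with `processed_files_dir` set,
+            # processed source files are moved there instead of being renamed in place, e.g.
+            #   settings:
+            #     mark_processed_files: true
+            #     processed_files_dir: /data/stress/processed_urls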
if settings.get('mark_processed_files'): try: + processed_dir = settings.get('processed_files_dir') timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - new_path = source.parent / f"{source.name}.{timestamp}.processed" - source.rename(new_path) - logger.info(f"Marked '{source.name}' as processed by renaming to '{new_path.name}'") + new_filename = f"{source.name}.{timestamp}.processed" + + if processed_dir: + dest_path = Path(processed_dir) / new_filename + dest_path.parent.mkdir(parents=True, exist_ok=True) + shutil.move(str(source), str(dest_path)) + logger.info(f"Marked '{source.name}' as processed by moving to '{dest_path}'") + else: + # Fallback to old behavior: rename in place + new_path = source.parent / new_filename + source.rename(new_path) + logger.info(f"Marked '{source.name}' as processed by renaming to '{new_path.name}'") except (IOError, OSError) as e: - logger.error(f"Failed to rename processed file '{source.name}': {e}") + logger.error(f"Failed to mark processed file '{source.name}': {e}") # When using profile-aware mode, the file processing (including marking as # processed) is handled inside process_profile_task.