Update Ansible: auth and downloads run on the dl machines, while the enforcer runs on the master

This commit is contained in:
aperez 2026-01-11 01:21:25 +03:00
parent db78171281
commit 7a514ab8ce
45 changed files with 3377 additions and 471 deletions

View File

@ -1,7 +1,5 @@
vault_redis_password: "rOhTAIlTFFylXsjhqwxnYxDChFc"
vault_postgres_password: "pgdb_pwd_A7bC2xY9zE1wV5uP"
vault_airflow_admin_password: "2r234sdfrt3q454arq45q355"
vault_flower_password: "dO4eXm7UkF81OdMvT8E2tIKFtPYPCzyzwlcZ4RyOmCsmG4qzrNFqM5sNTOT9"
vault_redis_port: 52909
vault_vnc_password: "vnc_pwd_Z5xW8cV2bN4mP7lK"
vault_ss_password_1: "UCUAR7vRO/u9Zo71nfA13c+/b1MCiJpfZJo+EmEBCfA="
vault_ss_password_2: "tgtQcfjJp/A3F01g4woO0bEQoxij3CAOK/iR1OTPuF4="

View File

@ -0,0 +1,102 @@
---
# This file is meant to be included via include_tasks; it is not a standalone playbook.
# It provides generic tasks for managing a process within a tmux session.
# Required variables:
# - tmux_session_name: The name of the tmux session.
# - working_dir: The directory where the command should be executed.
# - command_to_run: The command to execute inside the tmux session.
# - process_grep_pattern: A regex pattern to identify the process for pkill/status check.
#
# Optional variables (control flags):
# - start_process: (bool) Set to true to start the process. Default: false.
# - stop_process: (bool) Set to true to stop the process. Default: false.
# - check_status: (bool) Set to true to check and display the process status. Default: false.
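#
# Example include (illustrative; mirrors how playbook-stress-control.yml uses this file):
# - name: Start monitor process
#   ansible.builtin.include_tasks:
#     file: manage-processes-tasks.yml
#   vars:
#     tmux_session_name: "stress-monitor"
#     working_dir: "{{ airflow_master_dir }}"
#     command_to_run: "./bin/ytops-client profile list --auth-env sim_auth --download-env sim_download --live"
#     process_grep_pattern: "ytops-client.*profile.*list"
#     start_process: true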
- name: Ensure tmux is installed
ansible.builtin.apt:
name: tmux
state: present
become: yes
run_once: true # No need to run this on every include
- name: Stop existing tmux session
ansible.builtin.shell:
cmd: "tmux kill-session -t {{ tmux_session_name }} 2>/dev/null || true"
when: stop_process | default(false) | bool
- name: Kill any orphaned processes
ansible.builtin.shell:
cmd: |
PIDS=$(ps aux | grep -E "{{ process_grep_pattern }}" | grep -v "grep" | awk '{print $2}')
if [ -n "$PIDS" ]; then
kill $PIDS >/dev/null 2>&1 || true
sleep 0.5
kill -9 $PIDS >/dev/null 2>&1 || true
fi
changed_when: false
when: stop_process | default(false) | bool
- name: Stop existing process before starting (makes start idempotent)
block:
- name: Stop existing tmux session
ansible.builtin.shell:
cmd: "tmux kill-session -t {{ tmux_session_name }} 2>/dev/null || true"
- name: Kill any orphaned processes
ansible.builtin.shell:
cmd: |
PIDS=$(ps aux | grep -E "{{ process_grep_pattern }}" | grep -v "grep" | awk '{print $2}')
if [ -n "$PIDS" ]; then
kill $PIDS >/dev/null 2>&1 || true
sleep 0.5
kill -9 $PIDS >/dev/null 2>&1 || true
fi
changed_when: false
when: start_process | default(false) | bool
- name: Ensure client script is executable
ansible.builtin.file:
path: "{{ working_dir }}/bin/ytops-client"
mode: "a+x"
when: start_process | default(false) | bool
- name: Display command for tmux session
ansible.builtin.debug:
msg: "Command for tmux session '{{ tmux_session_name }}': {{ command_to_run }}"
when: start_process | default(false) | bool
- name: Start process in tmux session
ansible.builtin.shell:
cmd: |
cd {{ working_dir }}
# The command is wrapped with a final sleep to keep the tmux session alive for debugging even if the process fails.
# The actual command is run in a subshell (...) so that an 'exec' inside it cannot replace the wrapper shell.
tmux new-session -d -s {{ tmux_session_name }} \
"if [ -f .env ]; then set -a; . ./.env; set +a; fi; \
COMMAND_TO_RUN='{{ command_to_run | replace("'", "'\\''") }}'; \
echo '>>> Running command:'; \
echo "\$COMMAND_TO_RUN"; \
echo '---'; \
(eval "\$COMMAND_TO_RUN") ; \
echo; echo '---'; echo 'Process exited. This tmux session will remain open for debugging.'; echo 'You can attach with: tmux attach -t {{ tmux_session_name }}'; \
sleep 3600"
when: start_process | default(false) | bool
- name: Check process status
ansible.builtin.shell:
cmd: |
echo "Process status on {{ inventory_hostname }} for session '{{ tmux_session_name }}':"
if tmux has-session -t {{ tmux_session_name }} 2>/dev/null; then
echo " - Tmux session '{{ tmux_session_name }}' is running"
ps aux | grep -E "{{ process_grep_pattern }}" | grep -v grep || echo " - No matching process found"
else
echo " - Tmux session '{{ tmux_session_name }}' is NOT running"
fi
register: status_check
changed_when: false
when: check_status | default(false) | bool
- name: Display status
ansible.builtin.debug:
msg: "{{ status_check.stdout_lines }}"
when: check_status | default(false) | bool

219
ansible/playbook-README.md Normal file
View File

@ -0,0 +1,219 @@
# Ansible Playbooks Documentation
This document provides an overview of all available playbooks, their purpose, and how to use them operationally.
## Table of Contents
1. [Deployment Workflow](#deployment-workflow)
2. [Initial Setup Playbooks](#initial-setup-playbooks)
3. [Operational Playbooks](#operational-playbooks)
4. [Monitoring and Inspection](#monitoring-and-inspection)
5. [Common Operations](#common-operations)
## Deployment Workflow
The typical deployment workflow follows these steps:
1. **Initial Setup**: Configure machines, deploy proxies, install dependencies
2. **Master Setup**: Configure Redis, MinIO, and other infrastructure services
3. **Worker Setup**: Configure auth generators and download simulators
4. **Operational Management**: Start/stop processes, monitor status, cleanup profiles
## Initial Setup Playbooks
For a complete, automated installation on fresh nodes, you can use the main `full-install` playbook:
```bash
# Run the complete installation process on all nodes
ansible-playbook ansible/playbook-full-install.yml -i ansible/inventory.green.ini
```
The steps below are for running each part of the installation manually.
### Base Installation
```bash
# Deploy base system requirements to all nodes
ansible-playbook ansible/playbook-base-system.yml -i ansible/inventory.green.ini
```
### Proxy Deployment
```bash
# Deploy shadowsocks proxies to the worker nodes (as defined in the cluster config)
ansible-playbook ansible/playbook-proxies.yml -i ansible/inventory.green.ini
```
### Code, Environment, and Dependencies
```bash
# Sync code to all nodes
ansible-playbook ansible/playbook-stress-sync-code.yml -i ansible/inventory.green.ini
# Generate .env files for all nodes
ansible-playbook ansible/playbook-stress-generate-env.yml -i ansible/inventory.green.ini
# Install dependencies on all nodes
ansible-playbook ansible/playbook-stress-install-deps.yml -i ansible/inventory.green.ini
```
## Operational Playbooks
### Master Node Services
Redis and MinIO are deployed as Docker containers during the initial setup (`playbook-full-install.yml`).
To start the policy enforcer and monitoring tmux sessions on the master node:
```bash
# Start policy enforcer and monitoring on master
ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "start_enforcer=true start_monitor=true"
```
To stop processes on the master node:
```bash
# Stop ONLY the policy enforcer
ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "stop_enforcer=true"
# Stop ONLY the monitor
ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "stop_monitor=true"
# Stop BOTH the enforcer and monitor
ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "stop_sessions=true"
```
### Worker Node Processes
```bash
# Start auth generators and download simulators on all workers
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=start"
# Stop all processes on workers
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=stop"
# Check status of all worker processes
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=status"
```
### Profile Management
```bash
# Clean up all profiles
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles"
# Clean up specific profile prefix
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "profile_prefix=user1"
```
## Monitoring and Inspection
### Status Checks
```bash
# Check status of all processes on all nodes
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=status"
# Check enforcer status on master
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=check-enforcer"
# Check profile status
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=profile-status"
```
### Log Inspection
```bash
# View tmux session output on a specific node
ssh user@hostname
tmux attach -t stress-auth-user1-user2 # For auth generator (commas in profile_prefix become dashes in the session name)
tmux attach -t stress-download-user1-user2 # For download simulator
tmux attach -t stress-enforcer # For policy enforcer on master
```
## Common Operations
### Code and Policy Updates
First, sync your local changes to the jump host from your development machine:
```bash
# Sync project to jump host
./tools/sync-to-jump.sh
```
Then, from the jump host, you can sync code or policies to the cluster nodes:
```bash
# Sync all application code (Python sources, scripts, etc.)
ansible-playbook ansible/playbook-stress-sync-code.yml -i ansible/inventory.green.ini
# Sync only policies and CLI configs
ansible-playbook ansible/playbook-stress-sync-configs.yml -i ansible/inventory.green.ini
```
### Adding a New Worker
1. Update `cluster.green.yml` with the new worker definition:
```yaml
workers:
new-worker:
ip: x.x.x.x
port: 22
profile_prefixes:
- "user4"
proxies:
- "sslocal-rust-1090"
```
2. Regenerate inventory:
```bash
./tools/generate-inventory.py cluster.green.yml
```
3. Run the full installation playbook, limiting it to the new worker:
```bash
ansible-playbook ansible/playbook-full-install.yml -i ansible/inventory.green.ini --limit new-worker
```
4. Start processes on the new worker:
```bash
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=start" --limit new-worker
```
### Removing a Worker
1. Stop all processes on the worker:
```bash
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=stop" --limit worker-to-remove
```
2. Remove the worker from `cluster.green.yml`
3. Regenerate inventory:
```bash
./tools/generate-inventory.py cluster.green.yml
```
### Emergency Stop All
```bash
# Stop all processes on all nodes
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=stop-all"
```
### Stopping Specific Nodes
To stop processes on a specific worker or group of workers, you can use the `stop-nodes` action and limit the playbook run.
```bash
# Stop all processes on a single worker (e.g., dl003)
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=stop-nodes" --limit dl003
# Stop all processes on all workers
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=stop-nodes" --limit workers
```
### Restart Enforcer and Monitoring
```bash
# Restart monitoring and enforcer on master
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=restart-monitoring"
```

View File

@ -3,9 +3,15 @@
hosts: all
gather_facts: yes
vars_files:
- "group_vars/all/generated_vars.stress.yml" # Assumes generate-inventory.py was run with cluster.stress.yml
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
- name: Announce base system setup
ansible.builtin.debug:
msg: "Starting base system setup on {{ inventory_hostname }}"

View File

@ -20,12 +20,19 @@
hosts: all
gather_facts: no
vars_files:
- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}"
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Ensure base directories and subdirectories exist
ansible.builtin.file:
@ -41,12 +48,12 @@
- "run/docker_mount/fetched_info_jsons"
become: yes
- name: "PHASE 2.2: Import playbook to install Python dependencies"
import_playbook: playbook-stress-install-deps.yml
- name: "PHASE 2.3: Import playbook to sync local code"
- name: "PHASE 2.2: Import playbook to sync local code"
import_playbook: playbook-stress-sync-code.yml
- name: "PHASE 2.3: Import playbook to install Python dependencies"
import_playbook: playbook-stress-install-deps.yml
# -------------------------------------------------------------------------------------------------
# PHASE 3: Environment and Service Configuration
# Generates the .env file and starts the role-specific services on master and workers.
@ -55,11 +62,17 @@
import_playbook: playbook-stress-generate-env.yml
- name: "PHASE 3.2: Master Node Services Setup"
hosts: airflow_master
hosts: master
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Configure system performance and kernel settings
ansible.builtin.copy:
@ -94,6 +107,16 @@
mode: '0644'
become: yes
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
- name: Start master services (Redis, MinIO)
community.docker.docker_compose_v2:
project_src: "{{ airflow_master_dir }}"
@ -148,16 +171,63 @@
environment:
HOME: "/home/{{ ansible_user }}"
- name: "PHASE 3.3: Shared Storage Setup (s3fs)"
hosts: airflow_master:airflow_workers
- name: "PHASE 3.2a: Worker Node Services Setup"
hosts: workers
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Template Docker Compose file for worker services
ansible.builtin.template:
src: templates/docker-compose.stress-master.j2
dest: "{{ airflow_worker_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0644'
become: yes
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
- name: Start worker services
community.docker.docker_compose_v2:
project_src: "{{ airflow_worker_dir }}"
files:
- docker-compose.stress.yml
state: present
remove_orphans: true
become: yes
- name: "PHASE 3.3: Shared Storage Setup (s3fs)"
hosts: master:workers
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}"
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Mount S3 buckets via s3fs
block:
@ -176,14 +246,124 @@
mode: '0600'
become: yes
- name: Check if mount points are already mounted
ansible.builtin.shell:
cmd: "mount | grep -q '{{ item.path }}'"
loop:
- { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' }
- { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' }
register: mount_check
changed_when: false
failed_when: false
- name: Ensure mount point directories exist (only if not mounted)
ansible.builtin.file:
path: "{{ item.item.path }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
loop: "{{ mount_check.results }}"
when: item.rc != 0
become: yes
- name: Mount S3 buckets for stress testing
ansible.posix.mount:
src: "s3fs#{{ item.bucket }}"
path: "{{ item.path }}"
fstype: fuse
opts: "_netdev,allow_other,use_path_request_style,nonempty,url=http://{{ hostvars[groups['airflow_master'][0]].ansible_host }}:9000,passwd_file=/home/{{ ansible_user }}/.passwd-s3fs"
opts: "_netdev,allow_other,use_path_request_style,nonempty,url=http://{{ hostvars[groups['master'][0]].ansible_host }}:9000,passwd_file=/home/{{ ansible_user }}/.passwd-s3fs"
state: mounted
loop:
- { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' }
- { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' }
become: yes
- name: "PHASE 3.4: Import playbook to initialize Redis profiles"
import_playbook: playbook-stress-init-redis.yml
# -------------------------------------------------------------------------------------------------
# PHASE 4: Monitoring and Management Services Setup
# Starts monitoring, enforcer, and simulation processes.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 4.1: Import playbook to manage monitoring and enforcer processes"
import_playbook: playbook-stress-manage-processes.yml
- name: "PHASE 4.2: Start monitoring and enforcer services"
hosts: master
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
start_monitor: true
start_enforcer: true
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Ensure tmux is installed
ansible.builtin.apt:
name: tmux
state: present
become: yes
- name: Start profile monitoring in tmux session
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
tmux new-session -d -s stress-monitor \
"set -a && . ./.env && set +a && \
./bin/ytops-client profile list \
--auth-env sim_auth \
--download-env sim_download \
--live \
--no-blink \
--show-reasons"
when: start_monitor | default(false) | bool
- name: Start policy enforcer in tmux session
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
tmux new-session -d -s stress-enforcer \
"set -a && . ./.env && set +a && \
./bin/ytops-client policy-enforcer \
--policy policies/8_unified_simulation_enforcer.yaml \
--live"
when: start_enforcer | default(false) | bool
- name: List active tmux sessions
ansible.builtin.shell:
cmd: tmux list-sessions
register: tmux_sessions
changed_when: false
- name: Display active sessions
ansible.builtin.debug:
msg: "Active tmux sessions: {{ tmux_sessions.stdout_lines }}"
# -------------------------------------------------------------------------------------------------
# PHASE 5: Simulation Workload Generation (Optional - can be run manually)
# These playbooks are available but not automatically started by default.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 5.1: Note about simulation workload generation"
hosts: localhost
gather_facts: no
tasks:
- name: Display note about simulation playbooks
ansible.builtin.debug:
msg: |
Simulation workload generation playbooks are available:
- ansible/playbook-stress-auth-generator.yml
- ansible/playbook-stress-download-simulation.yml
To start simulations manually, run:
ansible-playbook ansible/playbook-stress-auth-generator.yml \
-e "start_generator=true dummy_batch=true auth_min_seconds=2 auth_max_seconds=3"
ansible-playbook ansible/playbook-stress-download-simulation.yml \
-e "start_download=true profile_prefix=user1 download_min_seconds=2 download_max_seconds=5" \
--limit workers[0]

View File

@ -0,0 +1,404 @@
---
# This playbook provides a complete installation for fresh nodes.
# It can install either master or worker roles, or both on the same machine.
#
# Usage examples:
# # Install everything on all nodes
# ansible-playbook ansible/playbook-full-install.yml
#
# # Install only on workers
# ansible-playbook ansible/playbook-full-install.yml --limit workers
#
# # Install only on master
# ansible-playbook ansible/playbook-full-install.yml --limit master
#
# # Install both roles on a single machine
# ansible-playbook ansible/playbook-full-install.yml --limit specific-host -e "install_master=true install_worker=true"
# -------------------------------------------------------------------------------------------------
# PHASE 1: Base System Configuration
# Ensures all nodes have the necessary base packages, user configurations, and Docker installed.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 1: Import base system setup playbook"
import_playbook: playbook-base-system.yml
# -------------------------------------------------------------------------------------------------
# PHASE 2: Generate Environment Configuration
# Creates .env files needed by all subsequent steps
# -------------------------------------------------------------------------------------------------
- name: "PHASE 2: Generate .env configuration"
import_playbook: playbook-stress-generate-env.yml
# -------------------------------------------------------------------------------------------------
# PHASE 3: Docker Network Setup
# Ensures the shared Docker network exists before building containers
# -------------------------------------------------------------------------------------------------
- name: "PHASE 3: Ensure Docker network exists"
hosts: all
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Create shared Docker network
community.docker.docker_network:
name: "{{ docker_network_name }}"
driver: bridge
become: yes
# -------------------------------------------------------------------------------------------------
# PHASE 4: Build yt-dlp Docker Image
# Builds the yt-dlp container from bin/ directory
# -------------------------------------------------------------------------------------------------
- name: "PHASE 4: Build yt-dlp Docker image"
hosts: all
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if (inventory_hostname in groups['master'] and not (install_worker | default(false) | bool)) else airflow_worker_dir }}"
- name: Ensure bin directory exists
ansible.builtin.file:
path: "{{ base_dir }}/bin"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Check if Dockerfile exists in bin directory
ansible.builtin.stat:
path: "{{ base_dir }}/bin/Dockerfile"
register: dockerfile_stat
- name: Build yt-dlp Docker image if Dockerfile exists
community.docker.docker_image:
name: yt-dlp-custom
tag: latest
source: build
build:
path: "{{ base_dir }}/bin"
pull: yes
state: present
force_source: yes
become: yes
when: dockerfile_stat.stat.exists
- name: Display message if Dockerfile not found
ansible.builtin.debug:
msg: "Dockerfile not found at {{ base_dir }}/bin/Dockerfile - skipping yt-dlp image build"
when: not dockerfile_stat.stat.exists
# -------------------------------------------------------------------------------------------------
# PHASE 5: Sync Code and Install Dependencies
# Copies application code and installs Python dependencies
# -------------------------------------------------------------------------------------------------
- name: "PHASE 5.1: Sync application code"
import_playbook: playbook-stress-sync-code.yml
- name: "PHASE 5.2: Install Python dependencies"
import_playbook: playbook-stress-install-deps.yml
# -------------------------------------------------------------------------------------------------
# PHASE 6: Deploy Shadowsocks Proxies
# Configures and starts proxy services
# -------------------------------------------------------------------------------------------------
- name: "PHASE 6: Deploy proxy services"
import_playbook: playbook-proxies.yml
# -------------------------------------------------------------------------------------------------
# PHASE 7: Install bgutils
# Note: Currently bgutils is deployed on master via docker-compose
# -------------------------------------------------------------------------------------------------
- name: "PHASE 7: Install bgutils"
import_playbook: playbook-install-bgutils.yml
# -------------------------------------------------------------------------------------------------
# PHASE 8: Master-Specific Services Setup
# Starts Redis, MinIO, and other master-only services
# -------------------------------------------------------------------------------------------------
- name: "PHASE 8: Master Node Services Setup"
hosts: master
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Configure system performance and kernel settings
ansible.builtin.copy:
src: "configs/etc/sysctl.d/99-system-limits.conf"
dest: "/etc/sysctl.d/99-system-limits.conf"
owner: root
group: root
mode: '0644'
become: yes
register: sysctl_config_copy
- name: Apply sysctl settings
ansible.builtin.command: sysctl --system
become: yes
when: sysctl_config_copy.changed
- name: Ensure MinIO data directory exists
ansible.builtin.file:
path: "{{ airflow_master_dir }}/minio-data"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Template Docker Compose file for master services
ansible.builtin.template:
src: templates/docker-compose.stress-master.j2
dest: "{{ airflow_master_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0644'
become: yes
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
- name: Start master services (Redis, MinIO)
community.docker.docker_compose_v2:
project_src: "{{ airflow_master_dir }}"
files:
- docker-compose.stress.yml
state: present
remove_orphans: true
become: yes
- name: Wait for MinIO service to be ready
ansible.builtin.wait_for:
host: "{{ hostvars[inventory_hostname].ansible_host }}"
port: 9000
delay: 5
timeout: 60
delegate_to: localhost
- name: Download MinIO Client (mc) if not present
ansible.builtin.command:
cmd: wget https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc
creates: /usr/local/bin/mc
become: yes
- name: Ensure MinIO Client (mc) is executable
ansible.builtin.file:
path: /usr/local/bin/mc
mode: '0755'
become: yes
- name: Configure mc alias for local MinIO
ansible.builtin.command: >
mc alias set local http://localhost:9000 {{ vault_s3_access_key_id }} {{ vault_s3_secret_access_key }}
become: yes
become_user: "{{ ansible_user }}"
changed_when: false
environment:
HOME: "/home/{{ ansible_user }}"
- name: Ensure S3 buckets exist in MinIO using mc
ansible.builtin.command: >
mc mb local/{{ item }}
loop:
- "stress-inputs"
- "stress-jsons"
become: yes
become_user: "{{ ansible_user }}"
register: mc_mb_result
failed_when: >
mc_mb_result.rc != 0 and
"already exists" not in mc_mb_result.stderr
changed_when: mc_mb_result.rc == 0
environment:
HOME: "/home/{{ ansible_user }}"
# -------------------------------------------------------------------------------------------------
# PHASE 9: Worker-Specific Services Setup
# Starts worker-only services if needed
# -------------------------------------------------------------------------------------------------
- name: "PHASE 9: Worker Node Services Setup"
hosts: workers
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Template Docker Compose file for worker services
ansible.builtin.template:
src: templates/docker-compose.stress-master.j2
dest: "{{ airflow_worker_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0644'
become: yes
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -a --filter "name=bgutil-provider" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=redis-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
docker ps -a --filter "name=minio-stress" --format "{{{{.ID}}}}" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
- name: Start worker services
community.docker.docker_compose_v2:
project_src: "{{ airflow_worker_dir }}"
files:
- docker-compose.stress.yml
state: present
remove_orphans: true
become: yes
# -------------------------------------------------------------------------------------------------
# PHASE 10: Shared Storage Setup (s3fs)
# Mounts S3 buckets on all nodes
# -------------------------------------------------------------------------------------------------
- name: "PHASE 10: Shared Storage Setup (s3fs)"
hosts: master:workers
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Mount S3 buckets via s3fs
block:
- name: Install s3fs for mounting S3 buckets
ansible.builtin.apt:
name: s3fs
state: present
become: yes
- name: Configure s3fs credentials
ansible.builtin.copy:
content: "{{ vault_s3_access_key_id }}:{{ vault_s3_secret_access_key }}"
dest: "/home/{{ ansible_user }}/.passwd-s3fs"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0600'
become: yes
- name: Check if mount points are already mounted
ansible.builtin.shell:
cmd: "mount | grep -q '{{ item.path }}'"
loop:
- { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' }
- { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' }
register: mount_check
changed_when: false
failed_when: false
- name: Ensure mount point directories exist (only if not mounted)
ansible.builtin.file:
path: "{{ item.item.path }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
loop: "{{ mount_check.results }}"
when: item.rc != 0
become: yes
- name: Mount S3 buckets for stress testing
ansible.posix.mount:
src: "s3fs#{{ item.bucket }}"
path: "{{ item.path }}"
fstype: fuse
opts: "_netdev,allow_other,use_path_request_style,nonempty,url=http://{{ hostvars[groups['master'][0]].ansible_host }}:9000,passwd_file=/home/{{ ansible_user }}/.passwd-s3fs"
state: mounted
loop:
- { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' }
- { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' }
become: yes
# -------------------------------------------------------------------------------------------------
# PHASE 11: Initialize Redis (Master Only)
# Sets up profiles and policies in Redis
# -------------------------------------------------------------------------------------------------
- name: "PHASE 11: Initialize Redis profiles"
import_playbook: playbook-stress-init-redis.yml
# -------------------------------------------------------------------------------------------------
# PHASE 12: Final Status and Next Steps
# -------------------------------------------------------------------------------------------------
- name: "PHASE 12: Installation Complete"
hosts: localhost
gather_facts: no
tasks:
- name: Display installation completion message
ansible.builtin.debug:
msg: |
========================================
Full installation complete!
========================================
Next steps:
1. Start monitoring and enforcer (on master):
ansible-playbook ansible/playbook-stress-manage-processes.yml \
-e "start_monitor=true start_enforcer=true"
2. Start auth generator (on master):
ansible-playbook ansible/playbook-stress-auth-generator.yml \
-e "start_generator=true dummy_batch=true auth_min_seconds=2 auth_max_seconds=3"
3. Start download simulation (on workers):
ansible-playbook ansible/playbook-stress-download-simulation.yml \
-e "start_download=true profile_prefix=user1 download_min_seconds=2 download_max_seconds=5" \
--limit workers
4. Check status:
ansible-playbook ansible/playbook-stress-control.yml -e "action=status"
5. Monitor profiles:
ansible-playbook ansible/playbook-stress-control.yml -e "action=profile-status"

View File

@ -0,0 +1,18 @@
---
- name: "UTIL-SETUP: Install and configure bgutils container on workers"
hosts: workers
gather_facts: yes
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Note that bgutil-provider is now on master
ansible.builtin.debug:
msg: "The bgutil-provider service is now deployed on the master node via docker-compose and is no longer deployed on workers."

View File

@ -0,0 +1,165 @@
---
- name: "UTIL-SETUP: Install media cleanup script and configure cron"
hosts: all
gather_facts: yes
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Ensure cleanup script directory exists
ansible.builtin.file:
path: "{{ base_dir }}/bin"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Copy cleanup_media.py script
ansible.builtin.copy:
src: "{{ playbook_dir }}/../yt-ops-services-debug/cleanup_media.py"
dest: "{{ base_dir }}/bin/cleanup_media.py"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Install s5cmd for S3 uploads
block:
- name: Check if s5cmd is already installed
ansible.builtin.stat:
path: /usr/local/bin/s5cmd
register: s5cmd_binary
- name: Download and install s5cmd
block:
- name: Create temporary directory for s5cmd download
ansible.builtin.tempfile:
state: directory
suffix: s5cmd
register: s5cmd_temp_dir
- name: Download s5cmd
ansible.builtin.get_url:
url: "https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_Linux-64bit.tar.gz"
dest: "{{ s5cmd_temp_dir.path }}/s5cmd.tar.gz"
mode: '0644'
- name: Extract s5cmd
ansible.builtin.unarchive:
src: "{{ s5cmd_temp_dir.path }}/s5cmd.tar.gz"
dest: "{{ s5cmd_temp_dir.path }}"
remote_src: yes
- name: Install s5cmd to /usr/local/bin
ansible.builtin.copy:
src: "{{ s5cmd_temp_dir.path }}/s5cmd"
dest: /usr/local/bin/s5cmd
mode: '0755'
remote_src: yes
- name: Clean up temporary directory
ansible.builtin.file:
path: "{{ s5cmd_temp_dir.path }}"
state: absent
when: not s5cmd_binary.stat.exists
become: yes
- name: Ensure log directory exists
ansible.builtin.file:
path: "/var/log"
state: directory
owner: root
group: root
mode: '0755'
become: yes
- name: Create wrapper script to source .env before running cleanup
ansible.builtin.copy:
content: |
#!/bin/bash
# Wrapper script to run cleanup_media.py with environment variables from .env
set -e
BASE_DIR="{{ base_dir }}"
# Source .env file if it exists
if [ -f "${BASE_DIR}/.env" ]; then
set -a
source "${BASE_DIR}/.env"
set +a
fi
# Determine cleanup mode based on environment variable or default
CLEANUP_MODE="${CLEANUP_MODE:-{{ cleanup_settings.mode | default('s3-upload') }}}"
# Run cleanup script
cd "${BASE_DIR}"
if [ "$CLEANUP_MODE" = "s3-upload" ]; then
# S3 upload mode - uploads to S3 then deletes
exec python3 "${BASE_DIR}/bin/cleanup_media.py" \
--target-dir "${BASE_DIR}/run" \
--target-dir "${BASE_DIR}/downloadfiles" \
--max-age {{ cleanup_settings.max_age_seconds | default(3600) }} \
--log-file /var/log/cleanup_media.log \
--s3-upload \
--s3-bucket "${S3_BUCKET:-stress-media-archive}" \
--s3-prefix "archived-media/$(hostname)" \
--s5cmd-path /usr/local/bin/s5cmd
else
# Simple cleanup mode - just truncate and rename
exec python3 "${BASE_DIR}/bin/cleanup_media.py" \
--target-dir "${BASE_DIR}/run" \
--target-dir "${BASE_DIR}/downloadfiles" \
--max-age {{ cleanup_settings.max_age_seconds | default(3600) }} \
--log-file /var/log/cleanup_media.log
fi
dest: "{{ base_dir }}/bin/cleanup_media_wrapper.sh"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Configure cron job for media cleanup
ansible.builtin.cron:
name: "Media cleanup - {{ base_dir }}"
minute: "0"
hour: "*"
job: "{{ base_dir }}/bin/cleanup_media_wrapper.sh 2>&1 | logger -t cleanup_media"
user: "{{ ansible_user }}"
state: "{{ 'present' if cleanup_settings.enabled | default(true) | bool else 'absent' }}"
become: yes
- name: Display installation summary
ansible.builtin.debug:
msg: |
Media cleanup script installed successfully on {{ inventory_hostname }}
Configuration from cluster.green.yml:
- Base directory: {{ base_dir }}
- Enabled: {{ cleanup_settings.enabled | default(true) }}
- Cleanup mode: {{ cleanup_settings.mode | default('s3-upload') }}
- Max age: {{ cleanup_settings.max_age_seconds | default(3600) }} seconds
- Cron schedule: Every hour (0 * * * *)
- Cron job state: {{ 'present' if cleanup_settings.enabled | default(true) | bool else 'absent' }}
- Log file: /var/log/cleanup_media.log
Note: You can override the cleanup mode for a single run by setting the
CLEANUP_MODE environment variable before executing the wrapper script.
e.g., CLEANUP_MODE=cleanup {{ base_dir }}/bin/cleanup_media_wrapper.sh
To view logs:
tail -f /var/log/cleanup_media.log

View File

@ -1,7 +1,17 @@
---
- name: Deploy Shadowsocks-Rust Proxy Configurations
hosts: all
hosts: workers
gather_facts: yes
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Deploy Shadowsocks-Rust proxy services
block:
@ -55,11 +65,18 @@
path: /srv/shadowsocks-rust/docker-compose.yaml
state: absent
- name: Force stop and remove known proxy containers to prevent conflicts
community.docker.docker_container:
name: "{{ item.key }}"
state: absent
- name: Find and stop any container using the target proxy ports
ansible.builtin.shell:
cmd: |
container_id=$(docker ps -aq --filter "publish={{ item.value.local_port }}")
if [ -n "$container_id" ]; then
echo "Found container ${container_id} using port {{ item.value.local_port }}. Stopping and removing it."
docker stop "${container_id}" >/dev/null 2>&1 || true
docker rm -f "${container_id}" >/dev/null 2>&1 || true
fi
loop: "{{ shadowsocks_proxies | dict2items }}"
register: stop_conflicting_containers
changed_when: "'Stopping and removing it' in stop_conflicting_containers.stdout"
loop_control:
label: "{{ item.key }}"

View File

@ -0,0 +1,95 @@
---
- name: "STRESS-SETUP: Manage auth simulation generator"
hosts: workers
gather_facts: no
vars:
tmux_session_auth_gen: "stress-auth-{{ (profile_prefix | default('default')) | replace(',', '-') }}"
auth_policy: "policies/12_queue_auth_simulation.yaml"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_worker_dir }}"
- name: Validate profile_prefix is provided when starting or stopping
ansible.builtin.fail:
msg: "profile_prefix is required when start_generator=true or stop_generator=true"
when:
- (start_generator | default(false) | bool or stop_generator | default(false) | bool)
- profile_prefix is not defined
- name: "Display policy being used for auth generator"
ansible.builtin.debug:
msg: "Using auth generator policy: {{ auth_policy }}"
when: start_generator | default(false) | bool
- name: Manage auth generator process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "{{ tmux_session_auth_gen }}"
working_dir: "{{ base_dir }}"
command_to_run: >
./bin/ytops-client stress-policy
--policy {{ auth_policy }}
{% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %}
{% if auth_min_seconds is defined %}--set 'settings.dummy_simulation_settings.auth_min_seconds={{ auth_min_seconds }}'{% endif %}
{% if auth_max_seconds is defined %}--set 'settings.dummy_simulation_settings.auth_max_seconds={{ auth_max_seconds }}'{% endif %}
{% if batch_size is defined %}--set 'queue_policy.batch_size={{ batch_size }}'{% endif %}
{% if create_download_tasks is defined %}--set 'queue_policy.create_download_tasks={{ create_download_tasks }}'{% endif %}
{% if formats_to_download is defined %}--set 'queue_policy.formats_to_download={{ formats_to_download }}'{% endif %}
{% if profile_prefix is defined %}--set 'execution_control.worker_pools=[{"profile_prefix": "{{ profile_prefix }}", "workers": 1}]'{% endif %}
--profile-prefix {{ profile_prefix }}
process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ auth_policy }}.*--profile-prefix {{ profile_prefix }}"
start_process: "{{ start_generator | default(false) | bool }}"
stop_process: "{{ stop_generator | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
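# Illustrative rendered command_to_run for the README example
#   -e "start_generator=true dummy_batch=true auth_min_seconds=2 auth_max_seconds=3 profile_prefix=user1"
# (quoting is handled by the template above):
#   ./bin/ytops-client stress-policy --policy policies/12_queue_auth_simulation.yaml --dummy-batch \
#     --set 'settings.dummy_simulation_settings.auth_min_seconds=2' \
#     --set 'settings.dummy_simulation_settings.auth_max_seconds=3' \
#     --set 'execution_control.worker_pools=[{"profile_prefix": "user1", "workers": 1}]' \
#     --profile-prefix user1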
- name: List active tmux sessions
ansible.builtin.shell:
cmd: tmux list-sessions 2>/dev/null || true
register: tmux_sessions
changed_when: false
- name: Display active sessions
ansible.builtin.debug:
msg: "Active tmux sessions: {{ tmux_sessions.stdout_lines }}"
- name: Check tmux session output for errors
block:
- name: Wait for a moment for the process to start
ansible.builtin.pause:
seconds: 2
- name: Capture tmux pane content
ansible.builtin.shell:
cmd: "tmux capture-pane -p -t {{ tmux_session_auth_gen }}"
register: tmux_output
changed_when: false
ignore_errors: true
- name: Display tmux pane content if session exists
ansible.builtin.debug:
msg: "Initial output from tmux session '{{ tmux_session_auth_gen }}':"
when: tmux_output.rc == 0
- name: Show output lines if session exists
ansible.builtin.debug:
var: tmux_output.stdout_lines
when: tmux_output.rc == 0
- name: Report if session not found
ansible.builtin.debug:
msg: "Tmux session '{{ tmux_session_auth_gen }}' was not found. It may have exited immediately upon starting."
when: tmux_output.rc != 0
when: start_generator | default(false) | bool

View File

@ -0,0 +1,288 @@
---
- name: "STRESS-SETUP: Unified control for stress test processes"
hosts: all
gather_facts: no
vars:
# Default action is status check
action: "status"
setup_policy: "policies/6_profile_setup_policy.yaml"
enforcer_policy: "policies/8_unified_simulation_enforcer.yaml"
master_only_actions:
- "check-enforcer"
- "profile-status"
- "run-command"
- "cleanup-profiles"
- "restart-monitoring"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
when: action not in master_only_actions or inventory_hostname in groups['master']
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
when: action not in master_only_actions or inventory_hostname in groups['master']
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
when: action not in master_only_actions or inventory_hostname in groups['master']
- name: Check running processes (status action)
ansible.builtin.shell:
cmd: |
echo "=== Process status on {{ inventory_hostname }} ==="
echo "1. Tmux sessions:"
tmux list-sessions 2>/dev/null || echo "No tmux sessions"
echo ""
echo "2. ytops-client processes:"
ps aux | grep -E "ytops-client.*(profile|policy-enforcer|stress-policy)" | grep -v grep || echo "No ytops-client processes"
echo ""
echo "3. Python processes related to stress test:"
ps aux | grep -E "python.*(ytops|stress)" | grep -v grep || echo "No related python processes"
register: status_output
changed_when: false
when: action == "status"
- name: Display status
ansible.builtin.debug:
msg: "{{ status_output.stdout_lines }}"
when: action == "status"
- name: Check enforcer status (check-enforcer action)
block:
- name: Check for enforcer tmux session and process
ansible.builtin.shell:
cmd: |
echo "Enforcer status on {{ inventory_hostname }}:"
if tmux has-session -t stress-enforcer 2>/dev/null; then
echo " - Tmux session 'stress-enforcer' is RUNNING."
ps aux | grep -E "ytops-client.*policy-enforcer" | grep -v grep || echo " - WARNING: Tmux session exists, but no matching process found."
else
echo " - Tmux session 'stress-enforcer' is NOT RUNNING."
fi
register: enforcer_status
changed_when: false
- name: Display enforcer status
ansible.builtin.debug:
msg: "{{ enforcer_status.stdout_lines }}"
when: action == "check-enforcer"
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Run ytops-client profile list on master (profile-status action)
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
timeout 10 ./bin/ytops-client profile list \
--auth-env sim_auth \
--download-env sim_download 2>&1
register: profile_list_output
changed_when: false
when:
- action == "profile-status"
- inventory_hostname in groups['master']
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Show profile list output lines
ansible.builtin.debug:
var: profile_list_output.stdout_lines
when:
- action == "profile-status"
- profile_list_output is defined
- inventory_hostname in groups['master']
- name: Run custom ytops-client command on master (run-command action)
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
{{ command_to_run }}
register: command_output
changed_when: false
when:
- action == "run-command"
- inventory_hostname in groups['master']
- command_to_run is defined
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Display command output
ansible.builtin.debug:
msg: "Command output:"
when:
- action == "run-command"
- command_output is defined
- name: Show command output lines
ansible.builtin.debug:
var: command_output.stdout_lines
when:
- action == "run-command"
- command_output is defined
- name: "Display policy being used for profile cleanup"
ansible.builtin.debug:
msg: "Using setup policy for cleanup: {{ setup_policy }}"
when:
- action == "cleanup-profiles"
- inventory_hostname in groups['master']
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Cleanup profiles on master (cleanup-profiles action)
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
./bin/ytops-client setup-profiles \
--policy {{ setup_policy }} \
--cleanup-all {% if profile_prefix is defined %}--profile-prefix {{ profile_prefix }}{% endif %}
register: cleanup_output
changed_when: false
when:
- action == "cleanup-profiles"
- inventory_hostname in groups['master']
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Display cleanup output
ansible.builtin.debug:
msg: "Cleanup output:"
when:
- action == "cleanup-profiles"
- cleanup_output is defined
- name: Show cleanup output lines
ansible.builtin.debug:
var: cleanup_output.stdout_lines
when:
- action == "cleanup-profiles"
- cleanup_output is defined
- name: Stop all stress test processes on all nodes (stop-all action)
block:
- name: Kill all tmux sessions starting with 'stress-'
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
tmux kill-session -t "$session"
done || true
# Failsafe: kill any lingering tmux server processes for stress sessions
TMUX_PIDS_TO_KILL=$(ps aux | grep '[t]mux' | grep 'stress-' | grep -v 'grep' | grep -v 'ansible' | awk '{print $2}')
if [ -n "$TMUX_PIDS_TO_KILL" ]; then
kill -9 $TMUX_PIDS_TO_KILL >/dev/null 2>&1 || true
fi
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client and related python processes
ansible.builtin.shell:
cmd: |
# Gracefully terminate processes by pattern
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 1 # Wait for graceful shutdown
# Force kill any remaining processes
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
when: action == "stop-all"
- name: Stop processes on targeted nodes only (stop-nodes action)
block:
- name: Kill all tmux sessions starting with 'stress-' on this node
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
tmux kill-session -t "$session"
done || true
# Failsafe: kill any lingering tmux server processes for stress sessions
TMUX_PIDS_TO_KILL=$(ps aux | grep '[t]mux' | grep 'stress-' | grep -v 'grep' | grep -v 'ansible' | awk '{print $2}')
if [ -n "$TMUX_PIDS_TO_KILL" ]; then
kill -9 $TMUX_PIDS_TO_KILL >/dev/null 2>&1 || true
fi
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client and related python processes on this node
ansible.builtin.shell:
cmd: |
# Gracefully terminate processes by pattern
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 1 # Wait for graceful shutdown
# Force kill any remaining processes
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
when: action == "stop-nodes"
- name: Restart monitoring and enforcer (restart-monitoring action)
block:
- name: Stop monitor process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "stress-monitor"
process_grep_pattern: "ytops-client.*profile.*list"
stop_process: true
- name: Stop enforcer process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "stress-enforcer"
process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
stop_process: true
- name: Start monitor process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "stress-monitor"
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client profile list
--auth-env sim_auth
--download-env sim_download
--live
--no-blink
--show-reasons
process_grep_pattern: "ytops-client.*profile.*list"
start_process: true
- name: Start enforcer process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "stress-enforcer"
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client policy-enforcer
--policy {{ enforcer_policy }}
--live
process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
start_process: true
when:
- action == "restart-monitoring"
- inventory_hostname == groups['master'][0]

View File

@ -0,0 +1,93 @@
---
- name: "STRESS-SETUP: Manage download simulation"
hosts: workers
gather_facts: no
vars:
tmux_session_download: "stress-download-{{ (profile_prefix | default('default')) | replace(',', '-') }}"
download_policy: "policies/11_direct_docker_download_simulation.yaml"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_worker_dir }}"
- name: Validate profile_prefix is provided when starting or stopping
ansible.builtin.fail:
msg: "profile_prefix is required when start_download=true or stop_download=true"
when:
- (start_download | default(false) | bool or stop_download | default(false) | bool)
- profile_prefix is not defined
- name: "Display policy being used for download simulation"
ansible.builtin.debug:
msg: "Using download simulation policy: {{ download_policy }}"
when: start_download | default(false) | bool
- name: Manage download simulation process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "{{ tmux_session_download }}"
working_dir: "{{ base_dir }}"
command_to_run: >
./bin/ytops-client stress-policy
--policy {{ download_policy }}
{% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %}
{% if download_min_seconds is defined %}--set 'settings.dummy_simulation_settings.download_min_seconds={{ download_min_seconds }}'{% endif %}
{% if download_max_seconds is defined %}--set 'settings.dummy_simulation_settings.download_max_seconds={{ download_max_seconds }}'{% endif %}
{% if profile_prefix is defined %}--set 'execution_control.worker_pools=[{"profile_prefix": "{{ profile_prefix }}", "workers": 1}]'{% endif %}
{% for setting in (extra_set_args | default('[]')) | from_yaml %}--set '{{ setting }}' {% endfor %}
--profile-prefix {{ profile_prefix }}
process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ download_policy }}.*--profile-prefix {{ profile_prefix }}"
start_process: "{{ start_download | default(false) | bool }}"
stop_process: "{{ stop_download | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
- name: List active tmux sessions
ansible.builtin.shell:
cmd: tmux list-sessions 2>/dev/null || true
register: tmux_sessions
changed_when: false
- name: Display active sessions
ansible.builtin.debug:
msg: "Active tmux sessions on {{ inventory_hostname }}: {{ tmux_sessions.stdout_lines }}"
- name: Check tmux session output for errors
block:
- name: Wait briefly for the process to start
ansible.builtin.pause:
seconds: 2
- name: Capture tmux pane content
ansible.builtin.shell:
cmd: "tmux capture-pane -p -t {{ tmux_session_download }}"
register: tmux_output
changed_when: false
ignore_errors: true
- name: Display tmux pane content if session exists
ansible.builtin.debug:
msg: "Initial output from tmux session '{{ tmux_session_download }}':"
when: tmux_output.rc == 0
- name: Show output lines if session exists
ansible.builtin.debug:
var: tmux_output.stdout_lines
when: tmux_output.rc == 0
- name: Report if session not found
ansible.builtin.debug:
msg: "Tmux session '{{ tmux_session_download }}' was not found. It may have exited immediately upon starting."
when: tmux_output.rc != 0
when: start_download | default(false) | bool
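
Several of these playbooks derive `inventory_env` from the inventory filename and use it to pick the matching generated_vars file. A rough Python equivalent of the Jinja chain `basename | splitext | first | replace('inventory.', '')`, shown only to make the mapping explicit; the example filename is hypothetical:

import os

def inventory_env(inventory_file: str) -> str:
    """Mirror of: inventory_file | basename | splitext | first | replace('inventory.', '')"""
    stem = os.path.splitext(os.path.basename(inventory_file))[0]
    return stem.replace("inventory.", "")

def generated_vars_path(inventory_file: str) -> str:
    env = inventory_env(inventory_file)
    suffix = f".{env}" if env else ""
    return f"group_vars/all/generated_vars{suffix}.yml"

print(generated_vars_path("inventory.stress.yml"))  # -> group_vars/all/generated_vars.stress.yml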

View File

@ -3,12 +3,19 @@
hosts: all
gather_facts: no
vars_files:
- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}"
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Create .env file for stress test environment
ansible.builtin.template:
@ -18,15 +25,3 @@
group: "{{ deploy_group }}"
mode: '0644'
become: yes
- name: Ensure REDIS_PORT is set in .env file
ansible.builtin.lineinfile:
path: "{{ base_dir }}/.env"
line: "REDIS_PORT={{ redis_port }}"
regexp: "^REDIS_PORT="
state: present
create: yes
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0644'
become: yes

View File

@ -0,0 +1,61 @@
---
- name: "STRESS-SETUP: Initialize Redis with profiles and policies"
hosts: master
gather_facts: no
vars:
setup_policy: "policies/6_profile_setup_policy.yaml"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Check if Redis is running
ansible.builtin.shell:
cmd: "redis-cli -h {{ hostvars[groups['master'][0]].ansible_host }} -p {{ redis_port }} {% if use_redis_password | default(true) | string | lower == 'true' %}-a {{ vault_redis_password }}{% endif %} ping 2>&1 | grep -q PONG"
register: redis_check
ignore_errors: yes
changed_when: false
- name: Ensure Redis is accessible
ansible.builtin.fail:
msg: "Redis is not accessible on master node. Please ensure Redis service is running on {{ hostvars[groups['master'][0]].ansible_host }}:{{ redis_port }}"
when: redis_check.rc != 0
- name: Stop any running ytops-client processes on master
ansible.builtin.shell:
cmd: ps aux | grep "[y]tops-client" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
changed_when: false
- name: "Display policy being used for Redis initialization"
ansible.builtin.debug:
msg: "Using setup policy: {{ setup_policy }}"
- name: Initialize Redis profiles and policies
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
./bin/ytops-client setup-profiles \
--policy {{ setup_policy }} \
--cleanup-all
environment:
REDIS_HOST: "{{ hostvars[groups['master'][0]].ansible_host }}"
REDIS_PORT: "{{ redis_port }}"
REDIS_PASSWORD: "{{ vault_redis_password if use_redis_password | default(true) | string | lower == 'true' else '' }}"
register: init_result
changed_when: init_result.rc == 0
- name: Display initialization result
ansible.builtin.debug:
msg: "Redis profile initialization completed successfully"
when: init_result.rc == 0
- name: Handle initialization failure
ansible.builtin.fail:
msg: "Failed to initialize Redis profiles: {{ init_result.stderr }}"
when: init_result.rc != 0

View File

@ -3,9 +3,15 @@
hosts: all
gather_facts: yes
vars_files:
- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
- name: Ensure python3-pip is installed
block:
- name: Install prerequisites for managing repositories
@ -27,17 +33,22 @@
become: yes
tasks:
- name: Install required Python packages
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Install required Python packages from requirements.txt
ansible.builtin.pip:
name:
- python-dotenv
- aria2p
- tabulate
- redis
- PyYAML
- aiothrift
- PySocks
state: present
requirements: "{{ base_dir }}/ytops_client/requirements.txt"
extra_args: "--ignore-installed"
become: yes
environment:
PIP_BREAK_SYSTEM_PACKAGES: "1"
- name: Explicitly install the thrift package
ansible.builtin.pip:
name: thrift
extra_args: "--ignore-installed"
become: yes
environment:
PIP_BREAK_SYSTEM_PACKAGES: "1"

View File

@ -0,0 +1,113 @@
---
- name: "STRESS-SETUP: Manage full worker lifecycle based on inventory"
hosts: workers
gather_facts: no
vars:
# Default action
action: "status" # Available actions: start, stop, status
tasks:
- name: "Start all configured generators and simulators"
when: action == "start"
block:
- name: "Set combined profile prefixes string"
ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Start single auth generator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ combined_prefixes }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
{% if create_download_tasks is defined %}-e "create_download_tasks={{ create_download_tasks }}"{% endif %}
{% if formats_to_download is defined %}-e "formats_to_download={{ formats_to_download }}"{% endif %}
delegate_to: localhost
changed_when: true
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ combined_prefixes }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Stop all worker generators and simulators"
when: action == "stop"
block:
- name: Kill all tmux sessions starting with 'stress-' on this worker
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
tmux kill-session -t "$session"
done || true
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client processes on this worker
ansible.builtin.shell:
cmd: |
# Gracefully terminate
ps aux | grep "[y]tops-client.*stress-policy" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 0.5
# Force kill
ps aux | grep "[y]tops-client.*stress-policy" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
- name: "Check status of all configured generators and simulators"
when: action == "status"
block:
- name: "Set combined profile prefixes string"
ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Check single auth generator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "check_status=true"
-e "profile_prefix={{ combined_prefixes }}"
delegate_to: localhost
changed_when: false
when: profile_prefixes is defined and profile_prefixes | length > 0
register: auth_status_check
- name: "Display auth generator status for {{ inventory_hostname }}"
ansible.builtin.debug:
var: auth_status_check.stdout_lines
when: auth_status_check is defined
- name: "Check single download simulator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "check_status=true"
-e "profile_prefix={{ combined_prefixes }}"
delegate_to: localhost
changed_when: false
when: profile_prefixes is defined and profile_prefixes | length > 0
register: download_status_check
- name: "Display download simulator status for {{ inventory_hostname }}"
ansible.builtin.debug:
var: download_status_check.stdout_lines
when: download_status_check is defined

View File

@ -0,0 +1,111 @@
---
- name: "STRESS-SETUP: Manage tmux sessions for monitoring and enforcer"
hosts: master
gather_facts: no
vars:
tmux_session_monitor: "stress-monitor"
tmux_session_enforcer: "stress-enforcer"
enforcer_policy: "policies/8_unified_simulation_enforcer.yaml"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Manage monitor process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "{{ tmux_session_monitor }}"
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client profile list
--auth-env sim_auth
--download-env sim_download
--live
--no-blink
--show-reasons
process_grep_pattern: "ytops-client.*profile.*list"
start_process: "{{ start_monitor | default(false) | bool }}"
stop_process: "{{ stop_sessions | default(false) | bool or stop_monitor | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
- name: Check monitor session output for errors
block:
- name: Wait briefly for the process to start
ansible.builtin.pause:
seconds: 2
- name: Capture tmux pane content
ansible.builtin.shell:
cmd: "tmux capture-pane -p -t {{ tmux_session_monitor }}"
register: tmux_output
changed_when: false
ignore_errors: true
- name: Display tmux pane content if session exists
ansible.builtin.debug:
msg: "Initial output from tmux session '{{ tmux_session_monitor }}':"
when: tmux_output.rc == 0
- name: Show output lines if session exists
ansible.builtin.debug:
var: tmux_output.stdout_lines
when: tmux_output.rc == 0
- name: Report if session not found
ansible.builtin.debug:
msg: "Tmux session '{{ tmux_session_monitor }}' was not found."
when: tmux_output.rc != 0
when: start_monitor | default(false) | bool
- name: Manage enforcer process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "{{ tmux_session_enforcer }}"
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client policy-enforcer
--policy {{ enforcer_policy }}
--live
process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
start_process: "{{ start_enforcer | default(false) | bool }}"
stop_process: "{{ stop_sessions | default(false) | bool or stop_enforcer | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
- name: Check enforcer session output for errors
block:
- name: Wait briefly for the process to start
ansible.builtin.pause:
seconds: 2
- name: Capture tmux pane content
ansible.builtin.shell:
cmd: "tmux capture-pane -p -t {{ tmux_session_enforcer }}"
register: tmux_output
changed_when: false
ignore_errors: true
- name: Display tmux pane content if session exists
ansible.builtin.debug:
msg: "Initial output from tmux session '{{ tmux_session_enforcer }}':"
when: tmux_output.rc == 0
- name: Show output lines if session exists
ansible.builtin.debug:
var: tmux_output.stdout_lines
when: tmux_output.rc == 0
- name: Report if session not found
ansible.builtin.debug:
msg: "Tmux session '{{ tmux_session_enforcer }}' was not found."
when: tmux_output.rc != 0
when: start_enforcer | default(false) | bool
- name: List active tmux sessions
ansible.builtin.shell:
cmd: tmux list-sessions 2>/dev/null || true
register: tmux_sessions
changed_when: false
- name: Display active sessions
ansible.builtin.debug:
msg: "Active tmux sessions: {{ tmux_sessions.stdout_lines }}"

View File

@ -2,13 +2,22 @@
- name: "STRESS-SETUP: Sync Local Code"
hosts: all
gather_facts: no
vars:
ytops_source_dir: "{{ playbook_dir }}/../ytops_client-source"
vars_files:
- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}"
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Ensure base directory exists for code sync
ansible.builtin.file:
@ -21,7 +30,7 @@
- name: Sync python packages and directories for stress testing
ansible.posix.synchronize:
src: "../ytops_client-source/{{ item }}/"
src: "{{ ytops_source_dir }}/{{ item }}/"
dest: "{{ base_dir }}/{{ item }}/"
rsync_opts:
- "--delete"
@ -42,7 +51,7 @@
- name: Sync client utility scripts and configs
ansible.posix.synchronize:
src: "../ytops_client-source/{{ item }}"
src: "{{ ytops_source_dir }}/{{ item }}"
dest: "{{ base_dir }}/{{ item }}"
perms: yes
loop:
@ -56,3 +65,4 @@
- "ytdlp.json"
become: yes
become_user: "{{ ansible_user }}"

View File

@ -0,0 +1,58 @@
---
- name: "STRESS-SETUP: Sync Policies and CLI Configs"
hosts: all
gather_facts: no
vars:
ytops_source_dir: "{{ playbook_dir }}/../ytops_client-source"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Ensure policies directory exists
ansible.builtin.file:
path: "{{ base_dir }}/policies"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Sync policies directory only
ansible.posix.synchronize:
src: "{{ ytops_source_dir }}/policies/"
dest: "{{ base_dir }}/policies/"
rsync_opts:
- "--delete"
- "--exclude=.DS_Store"
- "--exclude=__pycache__"
- "--exclude='*.pyc'"
recursive: yes
perms: yes
become: yes
become_user: "{{ ansible_user }}"
- name: Sync client CLI config files
ansible.posix.synchronize:
src: "{{ ytops_source_dir }}/{{ item }}"
dest: "{{ base_dir }}/{{ item }}"
perms: yes
loop:
- "cli.auth.config"
- "cli.download.config"
become: yes
become_user: "{{ ansible_user }}"
- name: Display sync completion
ansible.builtin.debug:
msg: "Policies and CLI configs synced to {{ base_dir }}"

View File

@ -1,8 +1,16 @@
# This file is managed by Ansible for the stress test environment.
# --- Network Settings ---
REDIS_HOST={{ hostvars[groups['airflow_master'][0]].ansible_host }}
REDIS_HOST={{ hostvars[groups['master'][0]].ansible_host }}
REDIS_PORT={{ vault_redis_port }}
REDIS_PASSWORD={{ vault_redis_password }}
# --- S3 Storage Configuration ---
S3_ACCESS_KEY={{ vault_s3_delivery_access_key_id }}
S3_SECRET_KEY={{ vault_s3_delivery_secret_access_key }}
S3_ENDPOINT={{ vault_s3_delivery_endpoint }}
S3_BUCKET={{ vault_s3_delivery_bucket }}
AWS_REGION={{ vault_s3_delivery_aws_region }}
# --- Account Manager Configuration ---
ACCOUNT_ACTIVE_DURATION_MIN=7
ACCOUNT_COOLDOWN_DURATION_MIN=30

View File

@ -1,6 +1,7 @@
# Template for stress test master services
name: "stress-services"
services:
{% if inventory_hostname in groups['master'] %}
redis:
image: redis:7-alpine
container_name: stress-redis
@ -26,7 +27,7 @@ services:
command: server /data --console-address ":9001"
networks:
- {{ docker_network_name }}
{% endif %}
bgutil-provider:
image: brainicism/bgutil-ytdlp-pot-provider
container_name: bgutil-provider

View File

@ -11,7 +11,6 @@ services:
volumes:
- ./config_ssp_{{ proxy_config.local_port }}/:/etc/shadowsocks-rust/:ro
networks:
- default
- {{ docker_network_name }}
{% endfor %}

View File

@ -19,7 +19,7 @@ def generate_inventory(cluster_config, inventory_path):
f.write("# Edit cluster.yml and re-run the generator instead.\n\n")
# Master group
f.write("[airflow_master]\n")
f.write("[master]\n")
for hostname, config in cluster_config.get('master', {}).items():
line = f"{hostname} ansible_host={config['ip']}"
if 'port' in config:
@ -29,7 +29,7 @@ def generate_inventory(cluster_config, inventory_path):
f.write("\n")
# Workers group (handles case where workers are not defined)
f.write("[airflow_workers]\n")
f.write("[workers]\n")
for hostname, config in cluster_config.get('workers', {}).items():
line = f"{hostname} ansible_host={config['ip']}"
if 'port' in config:
@ -47,6 +47,11 @@ def generate_host_vars(cluster_config, host_vars_dir):
sys.exit(1)
master_ip = list(master_nodes.values())[0]['ip']
# Get global vars for aliases
global_vars = cluster_config.get('global_vars', {})
airflow_master_dir = global_vars.get('airflow_master_dir')
airflow_worker_dir = global_vars.get('airflow_worker_dir')
# Get global proxy definitions
shadowsocks_proxies = cluster_config.get('shadowsocks_proxies', {})
@ -58,6 +63,8 @@ def generate_host_vars(cluster_config, host_vars_dir):
# Per-node list of proxies to USE
worker_proxies = config.get('proxies', [])
profile_prefixes = config.get('profile_prefixes', [])
cleanup_settings = config.get('cleanup_settings')
with open(host_vars_file, 'w') as f:
f.write("---\n")
@ -65,6 +72,13 @@ def generate_host_vars(cluster_config, host_vars_dir):
f.write(f"master_host_ip: {master_ip}\n")
f.write("redis_port: 52909\n")
# Add node-specific directory aliases for template compatibility
# The master path is needed by all nodes for the .env template.
if airflow_master_dir:
f.write(f"airflow_master: \"{airflow_master_dir}\"\n")
if hostname in cluster_config.get('workers', {}) and airflow_worker_dir:
f.write(f"airflow_dl_worker: \"{airflow_worker_dir}\"\n")
# Write the global proxy definitions for deployment
if shadowsocks_proxies:
f.write("shadowsocks_proxies:\n")
@ -81,6 +95,22 @@ def generate_host_vars(cluster_config, host_vars_dir):
for proxy in worker_proxies:
f.write(f" - \"{proxy}\"\n")
# Write worker-specific profile prefixes
if profile_prefixes:
f.write("profile_prefixes:\n")
for prefix in profile_prefixes:
f.write(f" - \"{prefix}\"\n")
# Write worker-specific cleanup settings (overrides global)
if cleanup_settings:
f.write("cleanup_settings:\n")
if 'enabled' in cleanup_settings:
f.write(f" enabled: {str(cleanup_settings['enabled']).lower()}\n")
if 'mode' in cleanup_settings:
f.write(f" mode: \"{cleanup_settings['mode']}\"\n")
if 'max_age_seconds' in cleanup_settings:
f.write(f" max_age_seconds: {cleanup_settings['max_age_seconds']}\n")
def generate_group_vars(cluster_config, group_vars_path):
"""Generate group-level variables"""
# Create parent directory if it doesn't exist
@ -107,11 +137,11 @@ def generate_group_vars(cluster_config, group_vars_path):
generated_data = {
'master_host_ip': master_ip,
'redis_port': 52909,
'external_access_ips': external_ips if external_ips else [],
'hostvars': all_nodes
'external_access_ips': external_ips if external_ips else []
}
generated_data.update(global_vars)
with open(group_vars_path, 'w') as f:
f.write("---\n")
f.write("# This file is auto-generated by tools/generate-inventory.py\n")

View File

@ -17,7 +17,8 @@ settings:
urls_file: "inputfiles/urls.rt300.txt"
# The save directory MUST be inside the docker_host_mount_path for the download
# simulation to be able to find the files.
save_info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation"
# NOTE: This path is expected to be on an s3fs mount for cross-host communication.
save_info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation"
# Settings for controlling the behavior of dummy/simulation modes.
# These values can be overridden at runtime with the --set flag.
@ -88,6 +89,7 @@ direct_docker_cli_policy:
docker_container_mount_path: "/config" # The mount point inside the container
# Host path for persisting cache data (e.g., cookies, sigfuncs) between runs.
# NOTE: This path should be on a fast, local disk, NOT on s3fs.
docker_host_cache_path: ".cache/direct_docker_simulation"
# Path inside the container where the cache is mounted. Should match HOME/.cache
docker_container_cache_path: "/config/.cache"

View File

@ -15,7 +15,8 @@ settings:
# This directory should contain info.json files generated by an auth simulation,
# like `10_direct_docker_auth_simulation`.
# It MUST be inside the docker_host_mount_path.
info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation"
# NOTE: This path is expected to be on an s3fs mount for cross-host communication.
info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation"
#info_json_dir: "run/docker_mount/download_tasks"
# Regex to extract the profile name from a task filename. The first capture
# group is used. This is crucial for the task-first locking strategy.
@ -30,6 +31,8 @@ execution_control:
workers: 1
- profile_prefix: "user2"
workers: 1
- profile_prefix: "user3"
workers: 1
# How long a worker should pause if it cannot find an available profile or task.
worker_polling_interval_seconds: 1
@ -86,6 +89,7 @@ direct_docker_cli_policy:
docker_container_mount_path: "/config"
# Path on the HOST where downloaded files will be saved.
# NOTE: This path should be on a fast, local disk, NOT on s3fs.
docker_host_download_path: "downloaded_media/direct_docker_simulation"
# Path inside the CONTAINER where `docker_host_download_path` is mounted.
docker_container_download_path: "/downloads"

View File

@ -1,27 +1,82 @@
# Policy: Queue-based Authentication Simulation via Direct Docker Exec
# Policy: Queue-based Authentication Simulation
#
# This policy simulates a continuous stream of info.json fetch requests using
# the 'direct_docker_cli' mode. It pulls URLs from a Redis queue, creates a
# temporary batch file, and then calls a yt-dlp command inside a running
# Docker container.
# This policy simulates a continuous stream of info.json fetch requests. It pulls
# URLs from a Redis queue and processes them, acting as the first stage in a
# two-stage simulation. The second stage (downloading) can be handled in one
# of two ways, configured below:
#
# --- WORKFLOW 1: Queue-Auth -> File-Download ---
# - This policy creates info.json files in a shared directory (`save_info_json_dir`).
# - A separate download simulation (e.g., policy 11_direct_docker_download_simulation.yaml)
# watches that directory, picks up the files, and performs the downloads.
# - To enable:
# - Set `create_download_tasks: false`
# - Ensure `save_info_json_dir` points to a shared path.
#
# --- WORKFLOW 2: Queue-Auth -> Queue-Download ---
# - This policy creates download *tasks* and pushes them to another Redis queue.
# - A separate download simulation (e.g., policy 13_queue_download_simulation.yaml)
# pulls tasks from that queue and performs the downloads.
# - To enable:
# - Set `create_download_tasks: true`
# - Configure `download_task_queue` to the correct queue name.
# - Use `download_task_granularity` to control if one task is created per-URL
# or per-format.
#
name: 12_queue_auth_simulation
settings:
mode: fetch_only
orchestration_mode: direct_docker_cli
orchestration_mode: queue_auth
profile_mode: from_pool_with_lock
# The save directory MUST be inside the docker_host_mount_path.
save_info_json_dir: "run/docker_mount/fetched_info_jsons/queue_simulation"
# For Queue-Auth -> File-Download workflow: Directory to save generated info.json files.
# A file-based download worker (e.g., policy 11) will watch this directory.
# This directory MUST be inside the docker_host_mount_path.
# NOTE: This path is expected to be on an s3fs mount for cross-host communication.
save_info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation"
execution_control:
# Define worker pools for multiple user groups
worker_pools:
- profile_prefix: "user1"
workers: 1
- profile_prefix: "user2"
workers: 1
- profile_prefix: "user3"
workers: 1
# How long a worker should pause if it cannot find an available profile to lock.
worker_polling_interval_seconds: 1
# No sleep between tasks; throughput is controlled by yt-dlp performance and profile availability.
info_json_generation_policy:
profile_prefix: "user1"
# This setting tells the auth worker how many download tasks will be generated
# per successful info.json. It is used to correctly increment the
# 'pending_downloads' counter on the auth profile.
# Can be an integer, or 'from_download_policy' to automatically count formats
# from the 'download_policy.formats' setting in this same policy file.
downloads_per_url: "from_download_policy"
# (For Queue-Download workflow) Controls how download tasks are created.
#
# "per_format": (Default) Creates one download task for EACH format specified in 'formats_to_download'.
# If `formats_to_download` is "140,299", two download tasks are created, and
# the 'pending_downloads' counter is incremented by 2.
#
# "per_url": Creates a SINGLE download task for the entire URL. The 'formats_to_download'
# string is passed to the download worker as the format selector, but 'pending_downloads'
# is only incremented by 1 for the whole URL.
#
# --- Current Setting ---
download_task_granularity: "per_format"
# --- Alternative Setting (commented out) ---
# download_task_granularity: "per_url"
# profile_prefix is now defined per-pool in execution_control.worker_pools
# However, for queue auth mode, we need a fallback prefix
profile_prefix: "user"
# This section is needed for the 'downloads_per_url: from_download_policy' setting.
# It should mirror the formats being used by the download simulation.
download_policy:
formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140"
direct_docker_cli_policy:
# Which simulation environment's profiles to use for locking.
@ -50,6 +105,7 @@ direct_docker_cli_policy:
docker_container_mount_path: "/config" # The mount point inside the container
# Host path for persisting cache data (e.g., cookies, sigfuncs) between runs.
# NOTE: This path should be on a fast, local disk, NOT on s3fs.
docker_host_cache_path: ".cache/queue_auth_simulation"
# Path inside the container where the cache is mounted. Should match HOME/.cache
docker_container_cache_path: "/config/.cache"
@ -109,18 +165,47 @@ direct_docker_cli_policy:
# Template for renaming the final info.json.
rename_file_template: "{video_id}-{profile_name}-{proxy}.info.json"
# Settings for controlling the behavior of dummy/simulation modes.
# These values can be overridden at runtime with the --set flag.
dummy_simulation_settings:
# Timings for dummy auth simulation (per-URL delay in a batch)
auth_min_seconds: 0.1
auth_max_seconds: 0.5
auth_failure_rate: 0.0
auth_skipped_failure_rate: 0.0
# Timings for dummy download simulation (per-format download time)
download_min_seconds: 1.0
download_max_seconds: 3.0
download_failure_rate: 0.0
download_skipped_failure_rate: 0.0
queue_policy:
# Set to false to use legacy, unprefixed queue names (e.g., 'queue2_auth_inbox').
# Set to true (or omit) to use environment-prefixed names (e.g., 'sim_auth_queue2_auth_inbox').
use_env_prefix: false
# If specified, create download tasks for these formats
# Can be "all", a specific format ID, or a list of format IDs
formats_to_download: "140-dashy/140-dashy-0/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
# Queue to pull URLs from
input_queue: "queue2_auth_inbox"
# --- Download Handoff Configuration ---
# Set to 'true' for Queue-Auth -> Queue-Download workflow.
# Set to 'false' for Queue-Auth -> File-Download workflow.
create_download_tasks: false
# Queue to push download tasks to (if create_download_tasks is true)
download_task_queue: "queue2_dl_inbox"
# How many tasks a worker should pull from the queue at once.
# This will become the batch size for the docker run.
batch_size: 25
batch_size: 5
# If specified, create download tasks for these formats
# Can be "all", a specific format ID, or a list of format IDs
# Defaults to the formats in download_policy.formats
# Example: formats_to_download: "140-dashy,299-dashy"
# Example: formats_to_download: "all"
# Example: formats_to_download: ["140-dashy", "299-dashy"]
formats_to_download: "from_download_policy"
simulation_parameters:
auth_env: "sim_auth"

View File

@ -1,16 +1,22 @@
# Policy: Queue-based Download Simulation via Direct Docker Exec
# Policy: Queue-based Download Simulation
#
# This policy simulates a continuous stream of downloads using the
# 'direct_docker_cli' mode with `mode: download_only`. It pulls download
# tasks from a Redis queue, each containing a path to an info.json file,
# and invokes a yt-dlp command inside a running Docker container to perform
# the download.
# This policy simulates a continuous stream of downloads. It pulls download tasks
# from a Redis queue, where each task typically contains a path to an info.json
# file and a format to download.
#
# This policy is designed to be the *second stage* of a two-stage simulation,
# consuming tasks produced by an authentication simulation like:
# - `12_queue_auth_simulation.yaml` (when configured for Queue-Download workflow)
#
# It does not matter to this policy whether the auth stage created tasks per-URL
# or per-format; this worker will simply process whatever task it receives from
# the `input_queue`.
#
name: 13_queue_download_simulation
settings:
mode: download_only
orchestration_mode: direct_docker_cli
orchestration_mode: queue_download
profile_mode: from_pool_with_lock
# In queue mode, info_json_dir is not used to find tasks.
# However, the paths inside the download tasks must be accessible
@ -19,12 +25,19 @@ settings:
# can be specified in the download task.
execution_control:
workers: 4
# Define worker pools for multiple user groups
worker_pools:
- profile_prefix: "user1"
workers: 1
- profile_prefix: "user2"
workers: 1
- profile_prefix: "user3"
workers: 1
# How long a worker should pause if it cannot find an available profile or task.
worker_polling_interval_seconds: 1
download_policy:
profile_prefix: "user1"
# profile_prefix is now defined per-pool in execution_control.worker_pools
# Default cooldown in seconds if not specified by the enforcer in Redis.
# The value from Redis (set via `unlock_cooldown_seconds` in the enforcer policy)
# will always take precedence. This is a fallback.
@ -65,6 +78,7 @@ direct_docker_cli_policy:
docker_container_mount_path: "/config"
# Path on the HOST where downloaded files will be saved.
# NOTE: This path should be on a fast, local disk, NOT on s3fs.
docker_host_download_path: "downloaded_media/queue_downloads"
# Path inside the CONTAINER where `docker_host_download_path` is mounted.
docker_container_download_path: "/downloads"
@ -95,11 +109,36 @@ direct_docker_cli_policy:
- "Invalid data found when processing input"
- "Error opening input files"
# Settings for controlling the behavior of dummy/simulation modes.
# These values can be overridden at runtime with the --set flag.
dummy_simulation_settings:
# Timings for dummy download simulation (per-format download time)
download_min_seconds: 1.0
download_max_seconds: 3.0
download_failure_rate: 0.0
download_skipped_failure_rate: 0.0
queue_policy:
# Set to false to use legacy, unprefixed queue names (e.g., 'queue2_dl_inbox').
# Set to true (or omit) to use environment-prefixed names (e.g., 'sim_download_queue2_dl_inbox').
use_env_prefix: false
# Queue to pull download tasks from
input_queue: "queue2_dl_inbox"
# Whether to report completion back to a queue
# Can be true (report all), false (report none), or "success_only"/"failure_only"
report_completion: true
# Queue to report completion to
completion_queue: "queue2_dl_completed"
# Queue to report failures to (always reported regardless of report_completion)
failure_queue: "queue2_dl_fail"
# Queue to report skipped tasks to
skipped_queue: "queue2_dl_skipped"
# How many tasks to process in a batch. For downloads, this should be 1,
# as each worker locks a profile for a single download task.
batch_size: 1

View File

@ -13,13 +13,13 @@ auth_profile_setup:
cleanup_before_run: true
pools:
- prefix: "user1"
proxy: "sslocal-rust-1092:1092"
proxy: "sslocal-rust-1088:1088"
count: 3
- prefix: "user2"
proxy: "sslocal-rust-1092:1092"
proxy: "sslocal-rust-1085:1085"
count: 3
- prefix: "user3"
proxy: "sslocal-rust-1092:1092"
proxy: "sslocal-rust-1084:1084"
count: 3
# --- Profile setup for the DOWNLOAD simulation ---
@ -28,11 +28,11 @@ download_profile_setup:
cleanup_before_run: true
pools:
- prefix: "user1"
proxy: "sslocal-rust-1092:1092"
proxy: "sslocal-rust-1088:1088"
count: 3
- prefix: "user2"
proxy: "sslocal-rust-1092:1092"
proxy: "sslocal-rust-1085:1085"
count: 3
- prefix: "user3"
proxy: "sslocal-rust-1092:1092"
proxy: "sslocal-rust-1084:1084"
count: 3

View File

@ -16,13 +16,6 @@ import threading
import time
from urllib.parse import urljoin
try:
import aria2p
from aria2p.utils import human_readable_bytes
import yt_dlp
except ImportError:
print("aria2p or yt-dlp is not installed. Please install them with: pip install aria2p yt-dlp", file=sys.stderr)
sys.exit(1)
logger = logging.getLogger('download_aria_tool')
@ -173,6 +166,14 @@ def parse_aria_args_to_options(args_str):
def main_download_aria(args):
"""Main logic for the 'download-aria' command."""
try:
import aria2p
from aria2p.utils import human_readable_bytes
import yt_dlp
except ImportError:
print("aria2p or yt-dlp is not installed. Please install them with: pip install aria2p yt-dlp", file=sys.stderr)
return 1
log_level = logging.DEBUG if args.verbose else logging.INFO
# Reconfigure root logger to ensure our settings are applied.
for handler in logging.root.handlers[:]:
@ -405,6 +406,7 @@ def main_download_aria(args):
def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=None):
"""Handle downloading a single URL with aria2c."""
import aria2p
if remote_dir:
aria_options['dir'] = remote_dir
logger.info(f"Adding download for format '{args.format}' with URL: {url[:70]}...")
@ -532,6 +534,7 @@ def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, r
def download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=None):
"""Handle downloading fragmented formats with aria2c."""
import aria2p
logger.info(f"Format '{args.format}' is fragmented. Adding all fragments to download queue.")
fragment_base_url = target_format.get('fragment_base_url')
fragments = target_format['fragments']

View File

@ -16,12 +16,6 @@ import sys
import time
from datetime import datetime
try:
import yt_dlp
from yt_dlp.utils import match_filter_func
except ImportError:
print("yt-dlp is not installed. Please install it with: pip install yt-dlp", file=sys.stderr)
sys.exit(1)
logger = logging.getLogger('download_native_py_tool')
@ -110,6 +104,8 @@ def _download_single_format(format_id, info_data, base_ydl_opts, args):
Returns a tuple: (success: bool, ytdlp_logger: YTDLPLogger)
"""
import yt_dlp
# Deep copy info_data so we can modify it without affecting other downloads
local_info_data = copy.deepcopy(info_data)
@ -178,6 +174,13 @@ def _download_single_format(format_id, info_data, base_ydl_opts, args):
def main_download_native_py(args):
"""Main logic for the 'download-native-py' command."""
try:
import yt_dlp
from yt_dlp.utils import match_filter_func
except ImportError:
print("yt-dlp is not installed. Please install it with: pip install yt-dlp", file=sys.stderr)
return 1
# All logging should go to stderr to keep stdout clean for the final filename, or for binary data with --output-buffer.
log_stream = sys.stderr
log_level = logging.DEBUG if args.verbose else logging.INFO

View File

@ -44,6 +44,7 @@ class PolicyEnforcer:
self.manager = manager
self.dry_run = dry_run
self.actions_taken_this_cycle = 0
self._last_wait_log_message = ""
PROXY_REST_REASON = "Proxy resting"
@ -248,7 +249,19 @@ class PolicyEnforcer:
# This prevents a deadlock if all groups are just 'Working' but none are in the 'waiting_downloads' state yet.
if not is_any_group_idle and groups_currently_waiting:
is_system_blocked_by_downloads = True
logger.info(f"System is waiting for downloads to finish in groups: {groups_currently_waiting}. No new profiles will be activated until a group is free.")
log_message = f"System is waiting for downloads to finish in groups: {groups_currently_waiting}. No new profiles will be activated until a group is free."
if log_message != self._last_wait_log_message:
if self._last_wait_log_message:
print(file=sys.stderr) # Newline if we were printing dots
logger.info(log_message)
self._last_wait_log_message = log_message
else:
print(".", end="", file=sys.stderr, flush=True)
else:
# If we are no longer blocked, reset the message tracker
if self._last_wait_log_message:
print(file=sys.stderr) # Newline to clean up after dots
self._last_wait_log_message = ""
if is_system_blocked_by_downloads:
# When blocked, we only want to consider profiles that are in the 'waiting_downloads' state,
@ -328,6 +341,7 @@ class PolicyEnforcer:
# The final list to check is the sorted ready profiles, followed by the not-ready ones.
not_ready_profiles.sort(key=lambda p: (p.get('rest_until', 0), natural_sort_key(p.get('name', ''))))
profiles_to_check = sorted_ready_profiles + not_ready_profiles
logger.debug(f"Activation candidates for 'least_loaded' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}")
else: # Default 'longest_idle' sort
if strategy not in ['longest_idle']:
@ -352,6 +366,7 @@ class PolicyEnforcer:
# The final list to check will process all ready profiles first, then wait for the not-ready ones.
profiles_to_check = ready_profiles + not_ready_profiles
logger.debug(f"Activation candidates for 'longest_idle' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}")
# --- End New Sorting Logic ---
# --- New logic: Identify groups with waiting profiles ---
@ -382,7 +397,7 @@ class PolicyEnforcer:
group_name = profile_to_group_map.get(profile_name)
# --- New check to prevent activating profiles from a waiting group ---
if group_name in waiting_group_names and profile.get('rest_reason') != 'waiting_downloads':
if group_name in waiting_group_names:
logger.debug(f"Profile '{profile_name}' activation deferred because its group '{group_name}' is waiting for downloads to complete.")
continue
# --- End new logic ---
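
The enforcer change above and the `_log_lock_warning` helper added to ProfileManager below share the same idea: emit a log line only when the message changes, and print a dot for every repeat so a system that is merely waiting does not flood the logs. A condensed, standalone sketch of that pattern; the class name and demo loop are illustrative:

import logging
import sys

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("dedup-demo")

class DedupLogger:
    """Log a message once; print '.' to stderr while the same message keeps repeating."""
    def __init__(self):
        self._last = ""

    def log(self, message: str):
        if message == self._last:
            print(".", end="", file=sys.stderr, flush=True)
            return
        if self._last:  # finish the dot run before logging the new message
            print(file=sys.stderr)
        logger.warning(message)
        self._last = message

    def reset(self):
        if self._last:  # newline to clean up after dots once the condition clears
            print(file=sys.stderr)
        self._last = ""

d = DedupLogger()
for _ in range(3):
    d.log("No active profiles available to lock.")
d.reset()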

View File

@ -102,10 +102,23 @@ class ProfileManager:
self.redis.ping()
logger.info(f"Successfully connected to Redis.")
logger.info(f"Using key prefix: {key_prefix}")
self._last_lock_warning = ""
except redis.exceptions.ConnectionError as e:
logger.error(f"Failed to connect to Redis at {redis_host}:{redis_port}: {e}")
sys.exit(1)
def _log_lock_warning(self, message: str):
"""Logs a lock-related warning, printing dots for repeated messages."""
if message == self._last_lock_warning:
# Use print to avoid logger formatting and newlines
print(".", end="", file=sys.stderr, flush=True)
else:
# If we were printing dots, start a new line before the new message
if self._last_lock_warning:
print(file=sys.stderr)
logger.warning(message)
self._last_lock_warning = message
def _profile_key(self, profile_name: str) -> str:
"""Get Redis key for a profile."""
return f"{self.key_prefix}profile:{profile_name}"
@ -288,11 +301,13 @@ class ProfileManager:
'tolerated_error_count': '0',
'download_count': '0',
'download_error_count': '0',
'predicted_download_count': '0',
'global_success_count': '0',
'global_failure_count': '0',
'global_tolerated_error_count': '0',
'global_download_count': '0',
'global_download_error_count': '0',
'global_predicted_download_count': '0',
'lock_timestamp': '0',
'lock_owner': '',
'rest_until': '0',
@ -328,10 +343,10 @@ class ProfileManager:
# Convert numeric fields
numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count',
'tolerated_error_count', 'download_count', 'download_error_count',
'tolerated_error_count', 'download_count', 'download_error_count', 'predicted_download_count',
'global_success_count', 'global_failure_count',
'global_tolerated_error_count', 'global_download_count',
'global_download_error_count',
'global_download_error_count', 'global_predicted_download_count',
'lock_timestamp', 'rest_until', 'last_rest_timestamp', 'wait_started_at']
for field in numeric_fields:
if field in data:
@ -388,10 +403,10 @@ class ProfileManager:
# --- End batch fetch ---
numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count',
'tolerated_error_count', 'download_count', 'download_error_count',
'tolerated_error_count', 'download_count', 'download_error_count', 'predicted_download_count',
'global_success_count', 'global_failure_count',
'global_tolerated_error_count', 'global_download_count',
'global_download_error_count',
'global_download_error_count', 'global_predicted_download_count',
'lock_timestamp', 'rest_until', 'last_rest_timestamp', 'wait_started_at']
for i, data in enumerate(all_profile_data):
@ -528,8 +543,12 @@ class ProfileManager:
return total_deleted
def record_activity(self, name: str, activity_type: str,
timestamp: Optional[float] = None) -> bool:
"""Record activity (success/failure) for a profile."""
timestamp: Optional[float] = None, is_dummy: bool = False) -> bool:
"""
Record activity (success/failure) for a profile.
If is_dummy is True, the activity will NOT be recorded for the associated proxy,
preventing dummy failures from triggering proxy-level enforcer actions.
"""
if activity_type not in ['success', 'failure', 'tolerated_error', 'download', 'download_error']:
logger.error(f"Invalid activity type: {activity_type}")
return False
@ -558,7 +577,8 @@ class ProfileManager:
# Keep only last 1000 activities to prevent unbounded growth
self.redis.zremrangebyrank(activity_key, 0, -1001)
# Also record activity for the proxy
# Also record activity for the proxy, BUT NOT for dummy activities.
if not is_dummy:
proxy_url = profile.get('proxy')
if proxy_url:
proxy_activity_key = self._proxy_activity_key(proxy_url, activity_type)
@ -569,7 +589,36 @@ class ProfileManager:
pipe.execute()
logger.debug(f"Recorded {activity_type} for proxy '{proxy_url}'")
logger.debug(f"Recorded {activity_type} for profile '{name}' at {ts}")
log_msg = f"Recorded {activity_type} for profile '{name}' at {ts}"
if is_dummy:
log_msg += " (dummy, proxy activity skipped)"
logger.debug(log_msg)
return True
def record_predicted_downloads(self, name: str, count: int, is_dummy: bool = False) -> bool:
"""Records the number of download tasks predicted/created for a profile."""
if count <= 0:
return False
profile = self.get_profile(name)
if not profile:
logger.error(f"Profile '{name}' not found")
return False
ts = time.time()
# Update counters in profile
profile_key = self._profile_key(name)
self.redis.hincrby(profile_key, 'predicted_download_count', count)
self.redis.hincrby(profile_key, 'global_predicted_download_count', count)
# Update last_used
self.redis.hset(profile_key, 'last_used', str(ts))
log_msg = f"Recorded {count} predicted downloads for profile '{name}'"
if is_dummy:
log_msg += " (dummy, proxy activity skipped)"
logger.debug(log_msg)
return True
def get_activity_rate(self, name: str, activity_type: str,
@ -612,6 +661,7 @@ class ProfileManager:
'tolerated_error_count': '0',
'download_count': '0',
'download_error_count': '0',
'predicted_download_count': '0',
}
self.redis.hset(profile_key, mapping=counters_to_reset)
logger.info(f"Reset session counters for profile '{name}'.")
@ -630,19 +680,21 @@ class ProfileManager:
total_tolerated_error = sum(int(p.get('global_tolerated_error_count', 0)) for p in profiles)
total_downloads = sum(int(p.get('global_download_count', 0)) for p in profiles)
total_download_errors = sum(int(p.get('global_download_error_count', 0)) for p in profiles)
total_predicted_downloads = sum(int(p.get('global_predicted_download_count', 0)) for p in profiles)
return {
'total_success': total_success,
'total_failure': total_failure,
'total_tolerated_error': total_tolerated_error,
'total_downloads': total_downloads,
'total_download_errors': total_download_errors,
'total_predicted_downloads': total_predicted_downloads,
}
def get_per_proxy_stats(self) -> Dict[str, Dict[str, Any]]:
"""Get aggregated stats per proxy."""
profiles = self.list_profiles()
proxy_stats = collections.defaultdict(lambda: {
'success': 0, 'failure': 0, 'tolerated_error': 0, 'downloads': 0, 'download_errors': 0, 'profiles': 0
'success': 0, 'failure': 0, 'tolerated_error': 0, 'downloads': 0, 'download_errors': 0, 'predicted_downloads': 0, 'profiles': 0
})
for p in profiles:
proxy = p.get('proxy')
@ -652,6 +704,7 @@ class ProfileManager:
proxy_stats[proxy]['tolerated_error'] += int(p.get('global_tolerated_error_count', 0))
proxy_stats[proxy]['downloads'] += int(p.get('global_download_count', 0))
proxy_stats[proxy]['download_errors'] += int(p.get('global_download_error_count', 0))
proxy_stats[proxy]['predicted_downloads'] += int(p.get('global_predicted_download_count', 0))
proxy_stats[proxy]['profiles'] += 1
return dict(proxy_stats)
@ -862,14 +915,14 @@ class ProfileManager:
# Original logic: find all active profiles, optionally filtered by prefix.
active_profiles = self.redis.zrange(self._state_key(ProfileState.ACTIVE.value), 0, -1)
if not active_profiles:
logger.warning("No active profiles available to lock.")
self._log_lock_warning("No active profiles available to lock.")
self.redis.incr(self._failed_lock_attempts_key())
return None
if profile_prefix:
profiles_to_check = [p for p in active_profiles if p.startswith(profile_prefix)]
if not profiles_to_check:
logger.warning(f"No active profiles with prefix '{profile_prefix}' available to lock.")
self._log_lock_warning(f"No active profiles with prefix '{profile_prefix}' available to lock.")
self.redis.incr(self._failed_lock_attempts_key())
return None
else:
@ -883,9 +936,9 @@ class ProfileManager:
if not full_profiles:
if specific_profile_name:
logger.warning(f"Profile '{specific_profile_name}' is not eligible for locking (e.g., not ACTIVE or missing).")
self._log_lock_warning(f"Profile '{specific_profile_name}' is not eligible for locking (e.g., not ACTIVE or missing).")
else:
logger.warning("No active profiles available to lock after filtering.")
self._log_lock_warning("No active profiles available to lock after filtering.")
self.redis.incr(self._failed_lock_attempts_key())
return None
@ -898,7 +951,7 @@ class ProfileManager:
]
if not eligible_profiles:
logger.warning("No active profiles with an active proxy available to lock.")
self._log_lock_warning("No active profiles with an active proxy available to lock.")
self.redis.incr(self._failed_lock_attempts_key())
return None
@ -922,7 +975,7 @@ class ProfileManager:
if not current_state or current_state.upper() != ProfileState.ACTIVE.value:
# Another process (enforcer) changed the state. Release lock and try next.
self.redis.hdel(locks_key, name)
logger.warning(f"Aborted lock for '{name}'; state changed from ACTIVE to '{current_state}' during lock acquisition.")
self._log_lock_warning(f"Aborted lock for '{name}'; state changed from ACTIVE to '{current_state}' during lock acquisition.")
continue
# State is still ACTIVE, proceed with locking.
@ -937,6 +990,7 @@ class ProfileManager:
sm.lock(owner=owner)
# The on_enter_locked action handles all Redis updates for the profile itself.
# The logger messages are also in the action.
self._last_lock_warning = "" # Reset on successful lock
return self.get_profile(name)
except Exception as e:
# This could be a TransitionNotAllowed error if the state changed,
@ -946,7 +1000,7 @@ class ProfileManager:
self.redis.hdel(locks_key, name)
continue
logger.warning("Could not lock any active profile (all may have been locked by other workers).")
self._log_lock_warning("Could not lock any active profile (all may have been locked by other workers).")
self.redis.incr(self._failed_lock_attempts_key())
return None
@ -1016,45 +1070,15 @@ class ProfileManager:
# the `on_enter` actions for the current state. We can suppress the initial transition
# and set the state directly.
# WORKAROUND for older statemachine library:
# Instantiating the machine triggers an initial transition to ACTIVE, which wrongly updates Redis.
# We let this happen, and then immediately correct the state if it was supposed to be something else.
sm = ProfileStateMachine(manager=self, profile_name=name)
# The sm is now in ACTIVE state, and Redis has been updated. If the original state was
# LOCKED, we must re-lock it to fix Redis and the state machine object so transitions work.
if current_state_str == ProfileState.LOCKED.value:
lock_owner = profile.get('lock_owner', 're-lock-owner')
try:
# This transition ensures the `on_enter_LOCKED` actions are run, making the
# state consistent in Redis and in the state machine object.
sm.lock(owner=lock_owner)
except Exception as e:
logger.error(f"Failed to re-lock profile '{name}' during state machine hydration: {e}")
# The state is now inconsistent, best to not return a broken machine.
return None
elif current_state_str != sm.current_state.value.upper():
# For any other state, we must manually fix both the state machine object and Redis,
# as the constructor wrongly transitioned to ACTIVE.
# 1. Force state on the machine object. This does not trigger actions.
target_state_obj = next((s for s in sm.states if s.value.upper() == current_state_str), None)
if not target_state_obj:
logger.error(f"Could not find state object for '{current_state_str}' during hydration of '{name}'.")
return None
sm.current_state = target_state_obj
# 2. Manually revert the state in Redis to what it should be.
profile_key = self._profile_key(name)
pipe = self.redis.pipeline()
pipe.hset(profile_key, 'state', current_state_str)
# Atomically move the profile from the incorrect ACTIVE index to the correct one.
# The constructor may have added it to ACTIVE without removing it from its original state index.
pipe.zrem(self._state_key(ProfileState.ACTIVE.value), name)
pipe.zadd(self._state_key(current_state_str), {name: profile.get('last_used', time.time())})
pipe.execute()
logger.debug(f"Corrected state for '{name}' to '{current_state_str}' in object and Redis during hydration.")
# By passing `initial_value`, we tell the state machine to start in the
# profile's actual current state from Redis, instead of executing the
# default `initial=True` transition to ACTIVE. This prevents incorrect
# state changes and logs during "hydration" of the state machine.
sm = ProfileStateMachine(
manager=self,
profile_name=name,
initial_value=current_state_str
)
return sm
def cleanup_stale_locks(self, max_lock_time_seconds: int) -> int:
@ -1475,36 +1499,77 @@ def _render_profile_group_summary_table(manager, all_profiles, profile_groups_co
next_up_reason = f"least_loaded (load: {load}, {finish_str})"
elif profile_selection_strategy == 'longest_idle':
# Find the single longest idle profile across all groups
ready_profiles = []
# Find groups that don't have an active profile
groups_without_active = []
for group in profile_groups_config:
profiles_in_group = group.get('profiles_in_group', [])
has_active = False
for p_name in profiles_in_group:
p = all_profiles_by_name.get(p_name)
if p and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
has_active = True
break
if not has_active:
groups_without_active.append(group)
# If all groups have active profiles, we need to find which one will be rotated next
# For now, let's find the group with the profile that has been used the longest
if not groups_without_active:
# Find the active profile with the highest request count (closest to rotation)
active_profiles_info = []
for group in profile_groups_config:
for p_name in group.get('profiles_in_group', []):
p = all_profiles_by_name.get(p_name)
if p and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
# Calculate total requests
total_reqs = (
p.get('success_count', 0) + p.get('failure_count', 0) +
p.get('tolerated_error_count', 0) +
p.get('download_count', 0) + p.get('download_error_count', 0)
)
active_profiles_info.append({
'profile': p,
'group': group,
'total_reqs': total_reqs,
'last_used': p.get('last_used', 0)
})
if active_profiles_info:
# Sort by total requests descending (highest first - closest to rotation)
# Then by last_used ascending (oldest first)
active_profiles_info.sort(key=lambda x: (-x['total_reqs'], x['last_used']))
# The first entry is the profile most likely to be rotated next.
# Predicting which group becomes active after it would mean replaying the
# activation logic, so no "next up" group is reported in this case.
pass
# Find the longest idle profile from groups without active profiles
ready_profiles = []
for group in groups_without_active:
for p_name in group.get('profiles_in_group', []):
p = all_profiles_by_name.get(p_name)
if p and p['state'] in [ProfileState.RESTING.value, ProfileState.COOLDOWN.value] and p.get('rest_until', 0) <= now and p.get('rest_reason') != 'waiting_downloads':
ready_profiles.append(p)
ready_profiles.append((p, group))
if ready_profiles:
# Sort them according to the 'longest_idle' activation logic
unused_profiles = [p for p in ready_profiles if (p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)) == 0]
used_profiles = [p for p in ready_profiles if p not in unused_profiles]
unused_profiles = [(p, g) for p, g in ready_profiles if (p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)) == 0]
used_profiles = [(p, g) for p, g in ready_profiles if (p, g) not in unused_profiles]
unused_profiles.sort(key=lambda p: natural_sort_key(p.get('name', '')))
used_profiles.sort(key=lambda p: (p.get('last_used', 0), natural_sort_key(p.get('name', ''))))
unused_profiles.sort(key=lambda x: natural_sort_key(x[0].get('name', '')))
used_profiles.sort(key=lambda x: (x[0].get('last_used', 0), natural_sort_key(x[0].get('name', ''))))
sorted_ready_profiles = unused_profiles + used_profiles
if sorted_ready_profiles:
next_profile = sorted_ready_profiles[0]
# Find which group it belongs to
for group in profile_groups_config:
if next_profile['name'] in group.get('profiles_in_group', []):
next_up_group_name = group['name']
next_profile, next_group = sorted_ready_profiles[0]
next_up_group_name = next_group['name']
next_up_reason = profile_selection_strategy
if getattr(args, 'show_reasons', False):
last_used_ts = next_profile.get('last_used', 0)
idle_time_str = f"idle for {format_duration(time.time() - last_used_ts)}" if last_used_ts > 0 else "never used"
next_up_reason = f"longest_idle (via {next_profile['name']}, {idle_time_str})"
break
# --- End new logic ---
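# Worked example of the ordering above (hypothetical profiles): with never-used
# p_2 and p_10, and used p_3 (last_used=50) and p_7 (last_used=100), the resulting
# order is p_2, p_10, p_3, p_7: never-used profiles first in natural name order,
# then used profiles with the oldest last_used first.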
for group in profile_groups_config:
@ -1672,9 +1737,11 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups
if is_auth_sim:
row.extend([
p.get('success_count', 0),
p.get('predicted_download_count', 0),
p.get('failure_count', 0),
p.get('tolerated_error_count', 0),
p.get('global_success_count', 0),
p.get('global_predicted_download_count', 0),
p.get('global_failure_count', 0),
])
else: # is_download_sim or unknown
@ -1700,7 +1767,7 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups
headers = ['Name', 'Proxy', 'State', 'Last Used']
if is_auth_sim:
headers.extend(['AuthOK', 'AuthFail', 'Skip.Err', 'Tot.AuthOK', 'Tot.AuthFail'])
headers.extend(['AuthOK', 'DataPred', 'AuthFail', 'Skip.Err', 'Tot.AuthOK', 'Tot.DataPred', 'Tot.AuthFail'])
else: # is_download_sim or unknown
headers.extend(['DataOK', 'DownFail', 'Skip.Err', 'Tot.DataOK', 'Tot.DownFail'])
@ -1870,7 +1937,7 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout):
dl_success_rate = (dl_stats['total_downloads'] / total_dls * 100) if total_dls > 0 else 100
global_summary_str = (
f"Auth: {total_reqs} reqs ({auth_stats['total_success']} OK, {auth_stats['total_failure']} Fail, {auth_stats['total_tolerated_error']} Tol.Err) | "
f"Auth: {total_reqs} reqs ({auth_stats['total_success']} OK, {auth_stats.get('total_predicted_downloads', 0)} Tasks, {auth_stats['total_failure']} Fail, {auth_stats['total_tolerated_error']} Tol.Err) | "
f"OK Rate: {success_rate:.2f}% | "
f"Failed Locks: {auth_failed_locks} || "
f"Download: {total_dls} attempts ({dl_stats['total_downloads']} OK, {dl_stats['total_download_errors']} Fail) | "
@ -1894,6 +1961,7 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout):
astats.get('profiles', 0),
dstats.get('profiles', 0),
astats.get('success', 0),
astats.get('predicted_downloads', 0),
astats.get('failure', 0),
astats.get('tolerated_error', 0),
dstats.get('downloads', 0),
@ -1916,7 +1984,7 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout):
if all_proxies:
print("\n--- Per-Proxy Stats (Merged) ---", file=file)
proxy_headers = ['Proxy', 'State (A/D)', 'Profiles (A)', 'Profiles (D)', 'AuthOK', 'AuthFail', 'Skip.Err(A)', 'DataOK', 'DownFail', 'Skip.Err(D)']
proxy_headers = ['Proxy', 'State (A/D)', 'Profiles (A)', 'Profiles (D)', 'AuthOK', 'DataPred', 'AuthFail', 'Skip.Err(A)', 'DataOK', 'DownFail', 'Skip.Err(D)']
print(tabulate(proxy_table_data, headers=proxy_headers, tablefmt='grid'), file=file)
print(f"\n--- Auth Simulation Profile Details ({args.auth_env}) ---", file=file)

View File

@ -51,10 +51,31 @@ class ProfileStateMachine(StateMachine):
pause = active.to(paused) | locked.to(paused) | resting.to(paused) | cooldown.to(paused)
def __init__(self, manager: ProfileManager, profile_name: str, *args, **kwargs):
def __init__(self, manager: ProfileManager, profile_name: str, initial_value=None):
self.manager = manager
self.profile_name = profile_name
super().__init__(*args, **kwargs)
# Call parent constructor with model=self
super().__init__(model=self)
# If initial_value is provided, set the current state without triggering transitions
if initial_value is not None:
# Convert to uppercase to match state values
initial_value = initial_value.upper()
# Only override the state if it differs from the machine's current state
# Compare case-insensitively to handle any discrepancies
if self.current_state.value.upper() != initial_value:
# Find the corresponding state object
target_state = None
for state in self.states:
# Compare both .value and .id case-insensitively
if state.value.upper() == initial_value or (hasattr(state, 'id') and state.id.upper() == initial_value):
target_state = state
break
if target_state:
# Set current state without triggering transitions
self.current_state = target_state
else:
# If state not found, log a warning but don't crash
logger.warning(f"Could not find state '{initial_value}' (case-insensitive) for profile '{profile_name}'. Keeping current state '{self.current_state.value}'.")
# --- Action Methods ---

View File

@ -70,12 +70,14 @@ class QueueManager:
"""Returns the number of items in a queue."""
return self.redis.llen(queue_name)
def populate(self, queue_name: str, file_path: str) -> int:
"""Populates a queue from a file (text with one item per line, or JSON with an array of strings)."""
def push_from_file(self, queue_name: str, file_path: str, wrap_key: Optional[str] = None) -> int:
"""Populates a queue from a file (text with one item per line, or JSON with an array of items)."""
count = 0
if file_path.lower().endswith('.json'):
logger.info("Detected JSON file. Attempting to parse as an array of strings.")
if wrap_key:
logger.warning("--wrap-file-line-in-json is ignored for JSON files, as they are expected to contain complete items.")
logger.info("Detected JSON file. Attempting to parse as an array of items.")
try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
@ -83,13 +85,21 @@ class QueueManager:
logger.error("JSON file must contain a list/array.")
return 0
items_to_add = [str(item).strip() for item in data if str(item).strip()]
# Items can be strings or objects. If objects, they should be converted to JSON strings.
items_to_add = []
for item in data:
if isinstance(item, str):
items_to_add.append(item.strip())
else:
items_to_add.append(json.dumps(item))
items_to_add = [item for item in items_to_add if item]
pipe = self.redis.pipeline()
for item in items_to_add:
pipe.rpush(queue_name, item)
count += 1
if count % 1000 == 0:
if count > 0 and count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {count} items...")
pipe.execute()
@ -105,9 +115,13 @@ class QueueManager:
for line in f:
item = line.strip()
if item:
pipe.rpush(queue_name, item)
if wrap_key:
payload = json.dumps({wrap_key: item})
else:
payload = item
pipe.rpush(queue_name, payload)
count += 1
if count % 1000 == 0:
if count > 0 and count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {count} items...")
pipe.execute()
@ -118,6 +132,45 @@ class QueueManager:
logger.info(f"Finished. Pushed a total of {count} items to '{queue_name}'.")
return count
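# Minimal illustration of the wrapping behaviour above (hypothetical URL):
import json
example_line = "https://example.com/watch?v=a111"      # one line from a text file
example_payload = json.dumps({"url": example_line})    # what gets rpush'ed with wrap_key='url'
# Without wrap_key the raw line is pushed unchanged; JSON input files ignore wrap_key.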
def push_generated(self, queue_name: str, prefix: str, count: int) -> int:
"""Pushes generated payloads to a queue."""
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%dt%H%M')
pipe = self.redis.pipeline()
pushed_count = 0
for i in range(count):
generated_value = f"{prefix}_{timestamp}_{i:04d}"
payload = json.dumps({"url": generated_value})
pipe.rpush(queue_name, payload)
pushed_count += 1
if pushed_count > 0 and pushed_count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {pushed_count} of {count} items...")
pipe.execute()
logger.info(f"Finished. Pushed a total of {pushed_count} items to '{queue_name}'.")
return pushed_count
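# Example of a single generated payload (prefix 'vid'), mirroring the loop above;
# illustrative only:
from datetime import datetime
example_ts = datetime.now().strftime('%Y%m%dt%H%M')             # e.g. '20260111t0121'
example_payload = json.dumps({"url": f"vid_{example_ts}_{0:04d}"})
# -> '{"url": "vid_20260111t0121_0000"}'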
def push_static(self, queue_name: str, payload: str, count: int) -> int:
"""Pushes a static payload multiple times to a queue."""
try:
json.loads(payload)
except json.JSONDecodeError:
logger.error(f"Invalid JSON in --payload-json: {payload}")
return 0
pipe = self.redis.pipeline()
pushed_count = 0
for _ in range(count):
pipe.rpush(queue_name, payload)
pushed_count += 1
if pushed_count > 0 and pushed_count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {pushed_count} of {count} items...")
pipe.execute()
logger.info(f"Finished. Pushed a total of {pushed_count} items to '{queue_name}'.")
return pushed_count
def clear(self, queue_name: str, dump_path: Optional[str] = None) -> int:
"""Clears a queue, optionally dumping its contents to a file."""
size = self.redis.llen(queue_name)
@ -174,10 +227,17 @@ def add_queue_manager_parser(subparsers):
peek_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '<env>_stress_inbox'.")
peek_parser.add_argument('--count', type=int, default=10, help='Number of items to show (default: 10)')
# Populate command
populate_parser = subparsers.add_parser('populate', help='Populate a queue from a file (one item per line).', parents=[common_parser])
populate_parser.add_argument('file_path', help='Path to the file containing items to add.')
populate_parser.add_argument('--queue-name', help="Name of the queue to populate. Defaults to '<env>_stress_inbox'.")
# Push command
push_parser = subparsers.add_parser('push', help='Push items to a queue from a file, a generator, or a static payload.', parents=[common_parser])
push_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '<env>_stress_inbox'.")
push_parser.add_argument('--count', type=int, default=1, help='Number of items to push (for --payload-json or --generate-payload-prefix).')
source_group = push_parser.add_mutually_exclusive_group(required=True)
source_group.add_argument('--from-file', dest='file_path', help='Path to a file containing items to add (one per line, or a JSON array).')
source_group.add_argument('--payload-json', help='A static JSON payload to push. Use with --count to push multiple times.')
source_group.add_argument('--generate-payload-prefix', help='Generate JSON payloads with a timestamp and counter. Example: {"url": "PREFIX_yyyymmddthhmm_0001"}. Use with --count.')
push_parser.add_argument('--wrap-file-line-in-json', metavar='KEY', help="For text files (--from-file), wrap each line in a JSON object with the specified key (e.g., 'url' -> {\"url\": \"line_content\"}).")
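# Illustrative invocations of the new 'push' subcommand (the top-level entry
# point, shown here as <cli>, is an assumption; queue_name defaults to
# '<env>_stress_inbox' when omitted):
#
#   <cli> push --from-file urls.txt --wrap-file-line-in-json url
#   <cli> push my_queue --payload-json '{"url": "https://example.com/watch?v=x"}' --count 5
#   <cli> push --generate-payload-prefix vid --count 100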
# Clear command
clear_parser = subparsers.add_parser('clear', help='Clear a queue, optionally dumping its contents.', parents=[common_parser])
@ -215,9 +275,9 @@ def main_queue_manager(args):
)
# For commands that operate on a single queue, set a default name based on the environment if not provided.
is_single_queue_command = args.queue_command in ['peek', 'populate', 'clear']
is_single_queue_command = args.queue_command in ['peek', 'push', 'clear']
if is_single_queue_command:
# `populate` uses an option (--queue-name), while `peek` and `clear` use a positional argument.
# `push`, `peek` and `clear` use a positional argument for queue_name.
# We check for `queue_name` attribute and if it's falsy (None or empty string).
if not getattr(args, 'queue_name', None):
default_queue_name = f"{args.env}_stress_inbox"
@ -244,11 +304,21 @@ def main_queue_manager(args):
print(f"{i+1: >3}: {item}")
return 0
elif args.queue_command == 'populate':
elif args.queue_command == 'push':
if args.file_path:
if not os.path.exists(args.file_path):
print(f"Error: File not found at '{args.file_path}'", file=sys.stderr)
return 1
manager.populate(args.queue_name, args.file_path)
if args.count > 1:
logger.warning("--count is ignored when using --from-file.")
manager.push_from_file(args.queue_name, args.file_path, args.wrap_file_line_in_json)
elif args.payload_json:
manager.push_static(args.queue_name, args.payload_json, args.count)
elif args.generate_payload_prefix:
if args.count <= 0:
print("Error: --count must be 1 or greater for --generate-payload-prefix.", file=sys.stderr)
return 1
manager.push_generated(args.queue_name, args.generate_payload_prefix, args.count)
return 0
elif args.queue_command == 'clear':

View File

@ -11,6 +11,9 @@ aria2p
# For reading .env files for configuration
python-dotenv==1.0.1
# For 'direct_docker_cli' orchestration mode in stress-policy
docker
# For SOCKS proxy support in client tools
PySocks

View File

@ -167,7 +167,7 @@ Overridable Policy Parameters via --set:
parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.')
parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.')
parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)")
parser.add_argument('--profile-prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages.")
parser.add_argument('--profile-prefix', '--user-prefix', dest='profile_prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages. Can be a comma-separated list.")
parser.add_argument('--start-from-url-index', type=int, help='Start processing from this line number (1-based) in the urls_file. Overrides saved state.')
parser.add_argument('--expire-time-shift-minutes', type=int, help="Consider URLs expiring in N minutes as expired. Overrides policy.")

View File

@ -53,6 +53,7 @@ def run_direct_batch_worker(worker_id, policy, state_manager, args, profile_mana
os.makedirs(save_dir, exist_ok=True)
last_used_profile_name = None
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
temp_batch_file = None
@ -93,13 +94,24 @@ def run_direct_batch_worker(worker_id, policy, state_manager, args, profile_mana
if profiles_in_prefix:
state_counts = collections.Counter(p['state'] for p in profiles_in_prefix)
states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items()))
logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix}*): {states_summary}. Pausing for {polling_interval}s.")
base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix}*): {states_summary}."
else:
logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix}'. Pausing for {polling_interval}s.")
base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix}'."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
logger.info(f"{base_log_msg} Pausing for {polling_interval}s.")
last_no_task_log_msg = base_log_msg
# --- End diagnostic logging ---
time.sleep(polling_interval)
continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
profile_name = locked_profile['name']
proxy_url = locked_profile['proxy']

View File

@ -113,6 +113,7 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
return []
last_used_profile_name = None
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
temp_task_dir_host = None
@ -120,11 +121,25 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
# --- Variables for robust finalization ---
url_batch_len = 0
batch_started = False
downloads_per_url = 0 # Default to 0, meaning no increment unless configured
num_formats_per_url = 0
downloads_to_increment = 0
# ---
try:
# 1. Lock a profile
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix)
# 1. Lock a profile, trying any of the specified prefixes
locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
# Fallback for empty/no prefix, which means lock any available profile
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
# --- New logic to avoid immediate reuse ---
avoid_reuse = direct_policy.get('avoid_immediate_profile_reuse', False)
@ -135,9 +150,21 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
wait_seconds = direct_policy.get('avoid_reuse_max_wait_seconds', 5)
time.sleep(wait_seconds)
# After waiting, try to lock again.
# After waiting, try to lock again, from any of the available prefixes
logger.info(f"[Worker {worker_id}] Attempting to lock a new profile after waiting.")
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix)
locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if locked_profile and locked_profile['name'] == last_used_profile_name:
logger.warning(f"[Worker {worker_id}] Still locking the same profile '{locked_profile['name']}' after waiting. Proceeding to use it to avoid getting stuck.")
@ -153,13 +180,24 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
if profiles_in_prefix:
state_counts = collections.Counter(p['state'] for p in profiles_in_prefix)
states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items()))
logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s.")
base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix or '*'}*): {states_summary}."
else:
logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s.")
base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix or '*'}'."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stdout, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stdout)
logger.info(f"{base_log_msg} Pausing for {polling_interval}s.")
last_no_task_log_msg = base_log_msg
# --- End diagnostic logging ---
time.sleep(polling_interval)
continue
if last_no_task_log_msg:
print(file=sys.stdout)
last_no_task_log_msg = ""
profile_name = locked_profile['name']
proxy_url = locked_profile['proxy']
@ -237,24 +275,33 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
url_batch_len = len(url_batch)
batch_started = True
# --- Calculate how many download tasks will be generated ---
# The "pending downloads" counter for an auth profile tracks the number of
# info.json files it has generated that are waiting to be processed.
# Each successful URL fetch creates one info.json file (one task).
# The number of actual media files downloaded from that info.json is
# irrelevant to the auth profile's counter.
downloads_per_url = 0 # Default to 0, meaning no increment unless configured
# --- Calculate how many download tasks will be generated to set the counter ---
downloads_per_url_config = gen_policy.get('downloads_per_url')
if downloads_per_url_config:
downloads_per_url = 1 # We just need a flag to enable the logic, the count is per-URL.
num_formats_per_url = 0 # Default to no increment
if downloads_per_url > 0:
# Increment by the number of URLs in the batch.
downloads_to_increment = url_batch_len
profile_manager_instance.increment_pending_downloads(profile_name, downloads_to_increment)
logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for the upcoming batch of {url_batch_len} URLs.")
if downloads_per_url_config:
if isinstance(downloads_per_url_config, int):
num_formats_per_url = downloads_per_url_config
elif downloads_per_url_config == 'from_download_policy':
# Heuristic: count comma-separated groups in the format selector.
# This mirrors how yt-dlp processes multiple format downloads.
d_policy = policy.get('download_policy', {})
formats_str = d_policy.get('formats', '')
if formats_str:
# Each comma separates a group from which one format is downloaded.
num_formats_per_url = formats_str.count(',') + 1
else:
logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured. Pending downloads counter will not be incremented for this batch.")
num_formats_per_url = 1 # fallback to 1 if formats is empty
elif downloads_per_url_config:
# Fallback for non-int, non-'from_download_policy' truthy values
num_formats_per_url = 1
if num_formats_per_url > 0:
downloads_to_increment = url_batch_len * num_formats_per_url
profile_manager_instance.increment_pending_downloads(profile_name, downloads_to_increment)
logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for a batch of {url_batch_len} URLs ({num_formats_per_url} format(s)/URL).")
else:
logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured or is zero. Pending downloads counter will not be incremented for this batch.")
end_idx = start_idx + len(url_batch)
logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs (lines {start_idx + 1}-{end_idx} from source).")
@ -480,14 +527,15 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
logger.error(f"Error during immediate post-processing from log line: {e}")
with activity_lock:
is_dummy = args.dummy or args.dummy_batch
if post_processed_successfully:
live_success_count += 1
logger.info(f"[Worker {worker_id}] [{profile_name}] Live success #{live_success_count} detected and post-processed.")
profile_manager_instance.record_activity(profile_name, 'success')
profile_manager_instance.record_activity(profile_name, 'success', is_dummy=is_dummy)
else:
live_failure_count += 1
logger.error(f"[Worker {worker_id}] [{profile_name}] Post-processing failed for a successful fetch. Recording as failure.")
profile_manager_instance.record_activity(profile_name, 'failure')
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=is_dummy)
# --- End immediate post-processing ---
return False
@ -495,9 +543,10 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
for pattern in fatal_error_patterns:
if re.search(pattern, line, re.IGNORECASE):
with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_failure_count += 1
logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL error #{live_failure_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'failure')
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=is_dummy)
if direct_policy.get('ban_on_fatal_error_in_batch'):
logger.warning(f"Banning profile '{profile_name}' immediately due to fatal error to stop container.")
profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during batch')
@ -512,16 +561,18 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
for pattern in tolerated_error_patterns:
if re.search(pattern, line, re.IGNORECASE):
with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_tolerated_count += 1
logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED error #{live_tolerated_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy)
return False
# If it's an ERROR: line and not tolerated, it's a failure
with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_failure_count += 1
logger.warning(f"[Worker {worker_id}] [{profile_name}] Live failure #{live_failure_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'failure')
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=is_dummy)
return False
@ -544,14 +595,14 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
rand_val = random.random()
if rand_val < auth_skipped_rate:
logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating tolerated failure for {video_id}.")
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=True)
elif rand_val < (auth_skipped_rate + auth_failure_rate):
logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal failure for {video_id}.")
profile_manager_instance.record_activity(profile_name, 'failure')
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=True)
else:
# Success
files_created += 1
profile_manager_instance.record_activity(profile_name, 'success')
profile_manager_instance.record_activity(profile_name, 'success', is_dummy=True)
# Create a dummy file in the temp output dir to simulate success
dummy_output_path = Path(temp_task_dir_host) / f"{video_id}.info.json"
@ -644,13 +695,14 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
finally:
if locked_profile and batch_started:
# --- Reconcile pending downloads counter ---
if downloads_per_url > 0:
# We incremented by url_batch_len at the start.
# Now we adjust for any URLs that failed to produce an info.json.
# The adjustment is the number of successes minus the number of attempts.
initial_increment = url_batch_len
actual_successes = live_success_count
adjustment = actual_successes - initial_increment
if downloads_to_increment > 0:
# We incremented at the start. Now we adjust for any URLs that failed to produce an info.json.
initial_increment = downloads_to_increment
actual_successes = live_success_count # This is count of successful info.jsons
expected_downloads_from_successes = actual_successes * num_formats_per_url
adjustment = expected_downloads_from_successes - initial_increment
if adjustment != 0:
# The adjustment will be negative, effectively decrementing the counter for each failure.
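# Worked example of the reconciliation above: a batch of 10 URLs at 2 formats/URL
# was preemptively incremented by 20; if only 7 info.jsons were produced, the
# adjustment is 7 * 2 - 20 = -6, so the pending-downloads counter drops by 6.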
@ -680,10 +732,21 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
if cooldown:
logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.")
# For auth simulation, check if profile has pending downloads
# If so, don't apply cooldown to avoid conflicting with enforcer
profile_data = profile_manager_instance.get_profile(locked_profile['name'])
should_apply_cooldown = cooldown
if profile_data:
pending_downloads = profile_manager_instance.get_pending_downloads(locked_profile['name'])
rest_reason = profile_data.get('rest_reason')
if pending_downloads > 0 or rest_reason == 'waiting_downloads':
should_apply_cooldown = None
logger.info(f"[Worker {worker_id}] Auth profile '{locked_profile['name']}' has pending downloads or is waiting for downloads. Not applying cooldown.")
unlocked_successfully = profile_manager_instance.unlock_profile(
locked_profile['name'],
owner=owner_id,
rest_for_seconds=cooldown
rest_for_seconds=should_apply_cooldown
)
if not unlocked_successfully:
logger.error(f"[Worker {worker_id}] FAILED to unlock profile '{locked_profile['name']}'. The profile may be stuck.")
@ -751,7 +814,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
return []
no_task_streak = 0
last_used_profile_name = None
last_no_task_log_msg = ""
task_counter = 0
while not state_manager.shutdown_event.is_set():
locked_profile = None
@ -760,6 +823,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
was_banned_by_parser = False
task = None
task_id = None
downloads_processed_in_task = 0
try:
if no_task_streak > 0 and not queue_policy: # Polling only makes sense for file mode
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
@ -769,9 +833,18 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
if profiles_in_prefix:
state_counts = collections.Counter(p['state'] for p in profiles_in_prefix)
states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items()))
logger.info(f"[Worker {worker_id}] No tasks found or profiles available. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})")
base_log_msg = f"[Worker {worker_id}] No tasks found or profiles available. Pool status ({profile_prefix or '*'}*): {states_summary}."
else:
logger.info(f"[Worker {worker_id}] No tasks found or profiles available. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})")
base_log_msg = f"[Worker {worker_id}] No tasks found or profiles available. No profiles found with prefix '{profile_prefix or '*'}'."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stdout, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stdout)
full_log_msg = f"{base_log_msg} Pausing for {polling_interval}s. (Streak: {no_task_streak})"
logger.info(full_log_msg)
last_no_task_log_msg = base_log_msg
# --- End diagnostic logging ---
time.sleep(polling_interval)
if state_manager.shutdown_event.is_set(): continue
@ -844,17 +917,27 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
if claimed_task_path_host:
no_task_streak = 0
if last_no_task_log_msg:
print(file=sys.stdout)
last_no_task_log_msg = ""
auth_profile_name, auth_env = None, None
info_data = None
# --- Read info.json content and metadata first ---
# In queue mode, the task object is the primary source of truth for auth metadata.
# This check is conditional because in file-based mode, `task` will be None.
if task:
auth_profile_name = task.get('auth_profile_name')
auth_env = task.get('auth_env')
# --- Read info.json content and use its metadata as a fallback ---
try:
with open(claimed_task_path_host, 'r', encoding='utf-8') as f:
info_data = json.load(f)
# This is critical for decrementing the counter in the finally block
# Fallback to info.json metadata if not present in the task object.
if not auth_profile_name or not auth_env:
metadata = info_data.get('_ytops_metadata', {})
auth_profile_name = metadata.get('profile_name')
auth_env = metadata.get('auth_env')
auth_profile_name = auth_profile_name or metadata.get('profile_name')
auth_env = auth_env or metadata.get('auth_env')
except (IOError, json.JSONDecodeError) as e:
logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path_host.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.")
continue # Skip to finally block to unlock profile
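# Illustrative '_ytops_metadata' block read by the fallback above (field values
# are hypothetical; the keys are the ones the auth stage writes in this commit):
#   "_ytops_metadata": {
#       "profile_name": "auth_profile_03",
#       "auth_env": "prod_auth",
#       "url": "https://www.youtube.com/watch?v=...",
#       "task_id": "task_000123",
#       "formats_requested": ["18"],
#       "download_task_granularity": "per_format"
#   }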
@ -874,7 +957,12 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
source_of_format = "task file"
if not format_selection:
format_selection = d_policy.get('formats', '')
source_of_format = "policy"
source_of_format = "policy (download_policy.formats)"
if not format_selection:
ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {})
format_selection = ytdlp_config_overrides.get('format', '')
source_of_format = "policy (ytdlp_config_overrides.format)"
if not format_selection:
logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download.")
@ -907,17 +995,18 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
details = f"Dummy skipped failure for format {format_id}"
error_type = "DummySkippedFailure"
is_tolerated_error = True
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=True)
elif should_fail_fatal:
logger.warning(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure for format '{format_id}'.")
details = f"Dummy fatal failure for format {format_id}"
error_type = "DummyFailure"
profile_manager_instance.record_activity(profile_name, 'download_error')
profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=True)
else:
logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success for format '{format_id}'.")
success = True
details = f"Dummy success for format {format_id}"
profile_manager_instance.record_activity(profile_name, 'download')
profile_manager_instance.record_activity(profile_name, 'download', is_dummy=True)
downloads_processed_in_task += 1
event = {
'type': 'direct_docker_download',
@ -963,7 +1052,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
else:
logger.warning(f"[Worker {worker_id}] [{profile_name}] Skipping task '{claimed_task_path_host.name}' because its URL is expired.")
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch))
event = {
'type': 'direct_docker_download', 'profile': profile_name,
@ -1066,23 +1155,23 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
def log_parser_callback(line):
nonlocal live_success_count, live_failure_count, live_tolerated_count, was_banned_by_parser
# Success is a high-priority check. Only record one success per task.
# Success is a high-priority check.
if '[download] 100% of' in line or 'has already been downloaded' in line:
with activity_lock:
# Only count one success per task
if live_success_count == 0:
is_dummy = args.dummy or args.dummy_batch
live_success_count += 1
logger.info(f"[Worker {worker_id}] [{profile_name}] Live download success detected from log.")
profile_manager_instance.record_activity(profile_name, 'download')
logger.info(f"[Worker {worker_id}] [{profile_name}] Live download success #{live_success_count} detected from log.")
profile_manager_instance.record_activity(profile_name, 'download', is_dummy=is_dummy)
return False
# Check for fatal patterns
for pattern in fatal_error_patterns:
if re.search(pattern, line, re.IGNORECASE):
with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_failure_count += 1
logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL download error #{live_failure_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'download_error')
profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=is_dummy)
if direct_policy.get('ban_on_fatal_error_in_batch'):
logger.warning(f"Banning profile '{profile_name}' immediately due to fatal download error to stop container.")
profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during download')
@ -1098,16 +1187,18 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
for pattern in tolerated_error_patterns:
if re.search(pattern, line, re.IGNORECASE):
with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_tolerated_count += 1
logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED download error #{live_tolerated_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy)
return False
# If it's an ERROR: line and not tolerated, it's a failure
with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_failure_count += 1
logger.warning(f"[Worker {worker_id}] [{profile_name}] Live download failure #{live_failure_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'download_error')
profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=is_dummy)
return False
@ -1149,11 +1240,11 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
final_outcome = "download"
logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific success/error log line matched, but exit code is 0. Assuming success, but this may indicate a parsing issue.")
# We record a success here as a fallback, in case the log parser missed it.
profile_manager_instance.record_activity(profile_name, 'download')
profile_manager_instance.record_activity(profile_name, 'download', is_dummy=(args.dummy or args.dummy_batch))
else:
final_outcome = "download_error"
logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific error log line matched, but exit code was {retcode}. Recording a generic download_error.")
profile_manager_instance.record_activity(profile_name, 'download_error')
profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=(args.dummy or args.dummy_batch))
# --- Airflow Directory Logic ---
if success and d_policy.get('output_to_airflow_ready_dir'):
@ -1231,6 +1322,9 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
event = { 'type': 'direct_docker_download', 'profile': profile_name, 'proxy_url': locked_profile['proxy'], 'success': success, 'details': event_details }
state_manager.log_event(event)
# Store the number of detected successful downloads to use for decrementing the counter.
downloads_processed_in_task = live_success_count
logger.info(f"[Worker {worker_id}] [{profile_name}] Task processing complete. Worker will now unlock profile and attempt next task.")
# 6. Clean up task file
@ -1274,25 +1368,34 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
auth_manager = get_auth_manager(profile_manager_instance, auth_env)
if auth_manager:
try:
auth_manager.decrement_pending_downloads(auth_profile_name)
logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}'.")
# Fallback: if no downloads were counted (e.g., due to an early exit or parsing issue),
# but the task is being finalized, we must assume all potential downloads for this task
# are "processed" to prevent the auth profile from getting stuck.
if downloads_processed_in_task == 0:
logger.warning(f"[Worker {worker_id}] No downloads were counted for this task. Using policy to determine decrement count to avoid stuck profile.")
ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {})
formats_str = ytdlp_config_overrides.get('format', d_policy.get('formats', ''))
num_formats = formats_str.count(',') + 1 if formats_str else 1
downloads_processed_in_task = num_formats
logger.warning(f"[Worker {worker_id}] Decrementing by fallback count: {downloads_processed_in_task}")
if downloads_processed_in_task > 0:
new_count = auth_manager.increment_pending_downloads(auth_profile_name, -downloads_processed_in_task)
logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' by {downloads_processed_in_task} in env '{auth_env}'. New count: {new_count}")
except Exception as e:
logger.error(f"[Worker {worker_id}] Failed to decrement pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}': {e}", exc_info=True)
else:
logger.error(f"[Worker {worker_id}] Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.")
else:
logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented. (Profile: {auth_profile_name}, Env: {auth_env})")
logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env. Pending downloads counter will not be decremented. (Profile: {auth_profile_name}, Env: {auth_env}) Task had auth_profile_name: {task.get('auth_profile_name')}, auth_env: {task.get('auth_env')}")
if was_banned_by_parser:
logger.info(f"[Worker {worker_id}] Profile '{locked_profile['name']}' was already banned by the log parser. Skipping unlock/cooldown.")
else:
last_used_profile_name = locked_profile['name']
cooldown = None
# Only apply cooldown if a task was actually claimed and processed.
if claimed_task_path_host:
# The enforcer is the only place where we configure different cooldown policies,
# since we might restart the enforcer but won't restart the stress-policy processes
# that work on auth and downloads simultaneously.
# This effectively applies a policy across multiple workers/machines without
# needing to restart each of them.
# DESIGN: The cooldown duration is not configured in the worker's policy.
# Instead, it is read from a central Redis key. This key is set by the
# policy-enforcer, making the enforcer the single source of truth for

View File

@ -46,11 +46,14 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
os.makedirs(output_dir, exist_ok=True)
no_task_streak = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
claimed_task_path = None
auth_profile_name, auth_env = None, None # For finally block
downloads_to_decrement = 0
formats_from_metadata, granularity_from_metadata = [], 'per_format'
try:
# 0. If no tasks were found, pause briefly.
if no_task_streak > 0:
@ -61,9 +64,19 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
if profiles_in_prefix:
state_counts = collections.Counter(p['state'] for p in profiles_in_prefix)
states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items()))
logger.info(f"[Worker {worker_id}] No tasks found for available profiles. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})")
base_log_msg = f"[Worker {worker_id}] No tasks found for available profiles. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s."
else:
logger.info(f"[Worker {worker_id}] No tasks found for available profiles. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})")
base_log_msg = f"[Worker {worker_id}] No tasks found for available profiles. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr) # Newline to clean up after dots
full_log_msg = f"{base_log_msg} (Streak: {no_task_streak})"
logger.info(full_log_msg)
last_no_task_log_msg = base_log_msg
# --- End diagnostic logging ---
time.sleep(polling_interval)
if state_manager.shutdown_event.is_set(): continue
@ -83,6 +96,9 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
if claimed_task_path:
no_task_streak = 0 # Reset streak
if last_no_task_log_msg:
print(file=sys.stderr) # Newline to clean up after dots
last_no_task_log_msg = ""
# --- Read metadata before processing/deleting file ---
try:
@ -91,6 +107,18 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
metadata = info_data.get('_ytops_metadata', {})
auth_profile_name = metadata.get('profile_name')
auth_env = metadata.get('auth_env')
# Use .get() without a default to distinguish "not present" (None) from "present and empty" ([]).
raw_formats_from_metadata = metadata.get('formats_requested')
granularity_from_metadata = metadata.get('download_task_granularity', 'per_format')
# Normalize to a list for simulation logic, while preserving the original raw state for decrement logic.
formats_from_metadata = raw_formats_from_metadata
if formats_from_metadata is None:
formats_from_metadata = []
elif isinstance(formats_from_metadata, str):
formats_from_metadata = [f.strip() for f in formats_from_metadata.split(',')]
elif not isinstance(formats_from_metadata, list):
formats_from_metadata = []
except (IOError, json.JSONDecodeError) as e:
logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.")
continue # Skip to finally block to unlock profile
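# Normalization examples for 'formats_requested' handled above:
#   key absent (None)      -> []             (legacy info.json)
#   "18,140"               -> ['18', '140']
#   ["18", "140"]          -> ['18', '140']  (kept as-is)
#   any other type         -> []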
@ -152,10 +180,43 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
logger.info(f"[Worker {worker_id}] [{profile_name}] Processing task '{claimed_task_path.name}'...")
if args.dummy or args.dummy_batch:
logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DIRECT DOWNLOAD ==========")
logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DOCKER DOWNLOAD PER-FORMAT SIMULATION ==========")
logger.info(f"[Worker {worker_id}] Profile: {profile_name} | Task: {claimed_task_path.name}")
logger.info(f"[Worker {worker_id}] Would run command: {' '.join(shlex.quote(s) for s in cmd)}")
logger.info(f"[Worker {worker_id}] With environment: {custom_env}")
formats_to_simulate = []
# Prioritize formats from the info.json metadata if the key was present
if raw_formats_from_metadata is not None:
formats_to_simulate = formats_from_metadata
logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for {len(formats_to_simulate)} formats from info.json: {formats_to_simulate}")
else:
# Fallback to policy file if metadata key is absent
formats_config = d_policy.get('formats')
if isinstance(formats_config, str):
formats_to_simulate = [f.strip() for f in formats_config.split(',')]
elif isinstance(formats_config, list):
formats_to_simulate = formats_config
if not formats_to_simulate and raw_formats_from_metadata is None:
logger.info(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download for backward compatibility.")
formats_to_simulate = ['dummy_format']
# For counting purposes, we should still decrement appropriately
if granularity_from_metadata == 'per_url':
downloads_to_decrement = 1
else:
downloads_to_decrement = 1
# If granularity was 'per_url', we only decrement by 1, even if multiple
# format selectors were passed (e.g., as one string).
if granularity_from_metadata == 'per_url':
downloads_to_decrement = 1
else: # per_format or not specified
if raw_formats_from_metadata is not None:
# If key was present, trust the count from metadata. Can be 0.
downloads_to_decrement = len(formats_from_metadata)
else:
# Key was absent. Decrement by the number of formats we are simulating
# from policy, or 1 as a final fallback for old files.
downloads_to_decrement = len(formats_to_simulate) if formats_to_simulate else 1
dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {})
min_seconds = dummy_settings.get('download_min_seconds', 0.5)
@ -163,46 +224,39 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
failure_rate = dummy_settings.get('download_failure_rate', 0.0)
skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0)
time.sleep(random.uniform(min_seconds, max_seconds))
for i, format_id in enumerate(formats_to_simulate):
logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for format '{format_id}'...")
sim_duration = random.uniform(min_seconds, max_seconds)
logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for {sim_duration:.2f}s (from policy range {min_seconds}-{max_seconds}s).")
time.sleep(sim_duration)
rand_val = random.random()
should_fail_skipped = rand_val < skipped_rate
should_fail_fatal = not should_fail_skipped and rand_val < (skipped_rate + failure_rate)
is_skipped = rand_val < skipped_rate
is_fatal = not is_skipped and rand_val < (skipped_rate + failure_rate)
success = False
details = ""
error_type = None
is_tolerated_error = False
if is_skipped:
logger.warning(f"[Worker {worker_id}] DUMMY: Simulating skipped download for format '{format_id}'.")
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=True)
elif is_fatal:
logger.error(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure for format '{format_id}'.")
profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=True)
else: # success
logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success for format '{format_id}'.")
profile_manager_instance.record_activity(profile_name, 'download', is_dummy=True)
if should_fail_skipped:
logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating skipped download failure for task '{claimed_task_path.name}'.")
details = "Dummy skipped failure"
error_type = "DummySkippedFailure"
is_tolerated_error = True
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
elif should_fail_fatal:
logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal download failure for task '{claimed_task_path.name}'.")
details = "Dummy fatal failure"
error_type = "DummyFailure"
profile_manager_instance.record_activity(profile_name, 'download_error')
logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========")
else:
logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating download success for task '{claimed_task_path.name}'.")
success = True
details = "Dummy success"
profile_manager_instance.record_activity(profile_name, 'download')
event = {
'type': 'direct_download',
'profile': profile_name,
'proxy_url': locked_profile['proxy'],
'success': success,
'details': details,
'error_type': error_type,
'is_tolerated_error': is_tolerated_error
}
state_manager.log_event(event)
logger.info(f"========== [Worker {worker_id}] END DUMMY DIRECT DOWNLOAD ==========")
# Determine downloads_to_decrement based on metadata
if granularity_from_metadata == 'per_url':
downloads_to_decrement = 1
else: # per_format or not specified
if raw_formats_from_metadata is not None:
# If key was present, trust the count. It can be 0.
downloads_to_decrement = len(formats_from_metadata)
else:
# Key was absent, fall back to 1 for old info.jsons
downloads_to_decrement = 1
# --- Real execution ---
logger.info(f"[Worker {worker_id}] [{profile_name}] Running command: {' '.join(shlex.quote(s) for s in cmd)}")
retcode, stdout, stderr = run_command(
@ -268,7 +322,13 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
if claimed_task_path and auth_profile_name and auth_env:
auth_manager = get_auth_manager(profile_manager_instance, auth_env)
if auth_manager:
auth_manager.decrement_pending_downloads(auth_profile_name)
if downloads_to_decrement > 0:
auth_manager.increment_pending_downloads(auth_profile_name, count=-downloads_to_decrement)
elif downloads_to_decrement == 0:
logger.warning(f"[Worker {worker_id}] `downloads_to_decrement` was zero. Pending downloads counter for '{auth_profile_name}' will not be changed.")
else:
# This shouldn't happen, but log it
logger.error(f"[Worker {worker_id}] `downloads_to_decrement` was negative: {downloads_to_decrement}")
else:
logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.")
elif claimed_task_path:

View File

@ -20,6 +20,7 @@ from copy import deepcopy
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple, Union
from urllib.parse import urlparse, parse_qs
from . import utils as sp_utils
from .process_runners import run_command, run_docker_container, get_worker_id
@ -36,6 +37,7 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
exec_control = policy.get('execution_control', {})
gen_policy = policy.get('info_json_generation_policy', {})
queue_policy = policy.get('queue_policy', {})
task_granularity = gen_policy.get('download_task_granularity', 'per_format')
profile_prefix = gen_policy.get('profile_prefix')
if not profile_prefix:
@ -54,35 +56,58 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
batch_size = queue_policy.get('batch_size', 1)
logger.info(f"[Worker {worker_id}] Auth worker configured to process tasks in batches of {batch_size}.")
task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
tasks = []
try:
# 1. Lock a profile FIRST
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix)
# 1. Get a batch of tasks from the queue FIRST
tasks = state_manager.get_auth_tasks_batch(batch_size)
if not tasks:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
base_log_msg = f"[Worker {worker_id}] No tasks available in queue, polling."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stdout, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stdout)
logger.info(base_log_msg)
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval)
continue
if last_no_task_log_msg:
print(file=sys.stdout)
last_no_task_log_msg = ""
# 2. Lock a profile, trying any of the specified prefixes
locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
# Fallback for empty/no prefix, which means lock any available profile
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if not locked_profile:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.debug(f"[Worker {worker_id}] No profiles available to lock. Sleeping for {polling_interval}s.")
logger.warning(f"[Worker {worker_id}] No profiles available for {len(tasks)} task(s). Re-queueing and sleeping for {polling_interval}s.")
state_manager.add_auth_tasks_batch(tasks)
time.sleep(polling_interval)
continue
profile_name = locked_profile['name']
proxy_url = locked_profile['proxy']
# 2. Get a batch of tasks from the queue
tasks = state_manager.get_auth_tasks_batch(batch_size)
if not tasks:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.debug(f"[Worker {worker_id}] No tasks available for profile '{profile_name}'. Unlocking and sleeping for {polling_interval}s.")
# Unlock immediately since we have no work to do.
# No cooldown is applied here to make the profile available again quickly.
profile_manager_instance.unlock_profile(profile_name, owner=owner_id)
locked_profile = None # To prevent double-unlock in finally
time.sleep(polling_interval)
continue
logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' to process a batch of {len(tasks)} tasks.")
# 3. Process each task in the batch
@ -129,13 +154,34 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
stderr = "Dummy fatal failure"
else:
success = True
video_id = sp_utils.get_video_id(url) or f"dummy_{random.randint(1000, 9999)}"
# In dummy mode, robustly extract ID from URL to avoid 'unknown_video_id'
try:
# First, try the robust library function
video_id = sp_utils.get_video_id(url)
if not video_id and url:
# Fallback parsing for dummy URLs like "...?v=dummy_0099"
parsed_url = urlparse(url)
video_id = parse_qs(parsed_url.query).get('v', [None])[0]
# Additional fallback for dummy URLs whose 'v' query parameter cannot be parsed
if not video_id and 'dummy_' in url:
dummy_match = re.search(r'dummy_\d+', url)
if dummy_match:
video_id = dummy_match.group(0)
except Exception:
video_id = None # Ensure video_id is None on parsing failure
# If all extraction methods fail, use the task_id as a reliable fallback.
if not video_id:
logger.debug(f"Could not extract video ID from URL '{url}', using task ID '{task_id}' for dummy data.")
video_id = task_id
info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]}
else:
client, req_params = state_manager.get_client_for_request(profile_name, gen_policy)
cmd = [
sys.executable, '-m', 'ytops_client.cli', 'get-info',
'--client', client, '--profile', profile_name
'--client', client or '', '--profile', profile_name
]
if proxy_url:
cmd.extend(['--proxy', proxy_url])
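The dummy branch above chains several ID-extraction strategies before falling back to the task ID. A self-contained sketch of that chain, assuming sp_utils.get_video_id can be passed in as an optional callable since its implementation is not shown in this diff:

import re
from urllib.parse import urlparse, parse_qs

def extract_dummy_video_id(url, task_id, get_video_id=None):
    """Best-effort video ID for dummy URLs; falls back to the task ID."""
    video_id = None
    try:
        if get_video_id:                      # e.g. sp_utils.get_video_id
            video_id = get_video_id(url)
        if not video_id and url:
            # Parse "...watch?v=dummy_0099" style URLs
            video_id = parse_qs(urlparse(url).query).get('v', [None])[0]
        if not video_id and url and 'dummy_' in url:
            match = re.search(r'dummy_\d+', url)
            if match:
                video_id = match.group(0)
    except Exception:
        video_id = None  # treat any parsing failure as "no ID"
    return video_id or task_id

assert extract_dummy_video_id("https://youtube.com/watch?v=dummy_0099", "t1") == "dummy_0099"
assert extract_dummy_video_id("not a url", "task-42") == "task-42"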
@ -173,61 +219,143 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
if success and info_data:
try:
auth_env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '')
info_data['_ytops_metadata'] = {
is_dummy_run = args.dummy or args.dummy_batch
# --- 1. Determine which formats will be requested ---
formats_to_download = queue_policy.get('formats_to_download')
requested_formats_list = []
if formats_to_download:
if task_granularity == 'per_url':
format_selector = formats_to_download
if isinstance(formats_to_download, list):
format_selector = ",".join(formats_to_download)
requested_formats_list = [format_selector]
else: # per_format
formats_source = formats_to_download
if formats_source == 'from_download_policy':
formats_source = policy.get('download_policy', {}).get('formats')
if formats_source == 'all':
requested_formats_list = [f['format_id'] for f in info_data.get('formats', [])]
elif isinstance(formats_source, list):
requested_formats_list = formats_source
elif isinstance(formats_source, str):
# A comma-separated string of format groups
requested_formats_list = [f.strip() for f in formats_source.split(',')]
elif formats_source:
# Handles integer format IDs or other non-string/list values
requested_formats_list = [str(formats_source)]
else:
# formats_source is None or empty
requested_formats_list = []
# --- 2. Create download tasks and save info.json(s) based on granularity ---
download_tasks_to_create = []
created_file_paths = []
if task_granularity == 'per_format' and requested_formats_list:
for i, format_id in enumerate(requested_formats_list):
# Deepcopy to avoid modifying the original info_data in the loop
task_info_data = deepcopy(info_data)
metadata = {
'profile_name': profile_name, 'proxy_url': proxy_url,
'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(),
'task_id': task_id, 'url': url, 'auth_env': auth_env_name
'task_id': task_id, 'url': url, 'auth_env': auth_env_name,
'formats_requested': [format_id], # This task is for ONE format
'download_task_granularity': task_granularity
}
if is_dummy_run: metadata['_dummy'] = True
task_info_data['_ytops_metadata'] = metadata
final_path = None
if save_dir:
video_id = info_data.get('id', 'unknown')
video_id = task_info_data.get('id') or task_id or 'unknown_video_id'
sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy'
new_name = f"{video_id}-{profile_name}-{sanitized_proxy}.json"
# Sanitize format_id for use in filename
sanitized_format = re.sub(r'[^a-zA-Z0-9_-]', '_', format_id)
# Add index `i` to guarantee uniqueness even if sanitized format IDs clash
new_name = f"{video_id}-{profile_name}-{sanitized_proxy}-fmt{i}_{sanitized_format}.info.json"
final_path = os.path.join(save_dir, new_name)
with open(final_path, 'w', encoding='utf-8') as f:
json.dump(task_info_data, f, indent=2)
logger.info(f"[Worker {worker_id}] [{profile_name}] Saved format-specific info.json to '{final_path}'")
created_file_paths.append(final_path)
download_tasks_to_create.append({
'info_json_path': final_path, 'format_id': format_id,
'video_id': task_info_data.get('id'), 'url': url,
'auth_profile_name': profile_name, 'proxy_url': proxy_url,
'auth_env': auth_env_name,
'original_task': task
})
else: # per_url or default (including per_format with no formats)
metadata = {
'profile_name': profile_name, 'proxy_url': proxy_url,
'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(),
'task_id': task_id, 'url': url, 'auth_env': auth_env_name,
'formats_requested': requested_formats_list,
'download_task_granularity': task_granularity
}
if is_dummy_run: metadata['_dummy'] = True
info_data['_ytops_metadata'] = metadata
final_path = None
if save_dir:
video_id = info_data.get('id') or task_id or 'unknown_video_id'
sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy'
new_name = f"{video_id}-{profile_name}-{sanitized_proxy}.info.json"
final_path = os.path.join(save_dir, new_name)
with open(final_path, 'w', encoding='utf-8') as f:
json.dump(info_data, f, indent=2)
logger.info(f"[Worker {worker_id}] [{profile_name}] Saved info.json to '{final_path}'")
else:
# This case means auth-only (no downloads) and no save_dir specified.
# The info.json is not persisted.
logger.debug(f"[Worker {worker_id}] [{profile_name}] Auth-only task succeeded. No save_dir, so info.json is not saved.")
created_file_paths.append(final_path)
profile_manager_instance.record_activity(profile_name, 'success')
formats_to_download = queue_policy.get('formats_to_download')
download_tasks = []
if formats_to_download:
task_formats = []
if formats_to_download == 'all':
task_formats = [f['format_id'] for f in info_data.get('formats', [])]
elif isinstance(formats_to_download, list):
task_formats = formats_to_download
else:
task_formats = [str(formats_to_download)]
for format_id in task_formats:
download_tasks.append({
'info_json_path': final_path, 'format_id': format_id,
# For per_url, requested_formats_list should contain a single format selector string.
# The loop will run once, creating one task.
if requested_formats_list:
for format_selector in requested_formats_list:
download_tasks_to_create.append({
'info_json_path': final_path, 'format_id': format_selector,
'video_id': info_data.get('id'), 'url': url,
'auth_profile_name': profile_name, 'proxy_url': proxy_url,
'auth_env': auth_env_name,
'original_task': task
})
else: # Handle the case where requested_formats_list is empty
logger.debug(f"[Worker {worker_id}] [{profile_name}] No formats requested, no download tasks created.")
if download_tasks:
added_count = state_manager.add_download_tasks_batch(download_tasks)
profile_manager_instance.record_activity(profile_name, 'success', is_dummy=(args.dummy or args.dummy_batch))
# --- 3. Dispatch download tasks ---
create_download_tasks = queue_policy.get('create_download_tasks', True)
if download_tasks_to_create:
num_tasks_to_process = 0
if create_download_tasks:
added_count = state_manager.add_download_tasks_batch(download_tasks_to_create)
logger.info(f"[Worker {worker_id}] [{profile_name}] Added {added_count} download tasks to queue")
profile_manager_instance.increment_pending_downloads(profile_name, count=added_count)
num_tasks_to_process = added_count
else:
num_tasks = len(download_tasks_to_create)
logger.info(f"[Worker {worker_id}] [{profile_name}] File-based workflow: created {num_tasks} tasks (no queue tasks created)")
num_tasks_to_process = num_tasks
state_manager.report_auth_success(task_id, {
if num_tasks_to_process > 0:
profile_manager_instance.increment_pending_downloads(profile_name, count=num_tasks_to_process)
profile_manager_instance.record_predicted_downloads(profile_name, count=num_tasks_to_process, is_dummy=(args.dummy or args.dummy_batch))
report_payload = {
"url": url, "video_id": info_data.get('id'), "profile_name": profile_name,
"proxy_url": proxy_url, "info_json_path": final_path,
"download_tasks_created": len(download_tasks)
})
"proxy_url": proxy_url, "info_json_path": created_file_paths[0] if len(created_file_paths) == 1 else created_file_paths,
"download_tasks_created": len(download_tasks_to_create)
}
if is_dummy_run:
report_payload['_dummy'] = True
state_manager.report_auth_success(task_id, report_payload)
except Exception as e:
logger.error(f"[Worker {worker_id}] [{profile_name}] Error processing successful auth result: {e}", exc_info=True)
profile_manager_instance.record_activity(profile_name, 'failure')
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_failure(task_id, {"error": f"Error processing info.json: {str(e)}", "url": url})
else:
is_bot_error = "Sign in to confirm you're not a bot" in stderr
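The section above turns formats_to_download into a concrete list whose shape depends on download_task_granularity: per_format yields one entry (and one download task) per format ID, while per_url collapses everything into a single selector. A compact sketch of that resolution, under the assumption that the policy dict has the shape used here ('from_download_policy' indirection, 'all', a list, or a comma-separated string):

def resolve_requested_formats(policy, info_data, granularity='per_format'):
    """Return the list of format selectors a single info.json should yield."""
    queue_policy = policy.get('queue_policy', {})
    formats = queue_policy.get('formats_to_download')
    if not formats:
        return []
    if granularity == 'per_url':
        # One task per URL: collapse a list into a single selector string.
        selector = ",".join(formats) if isinstance(formats, list) else str(formats)
        return [selector]
    # per_format: one task per resolved format ID
    if formats == 'from_download_policy':
        formats = policy.get('download_policy', {}).get('formats')
    if formats == 'all':
        return [f['format_id'] for f in info_data.get('formats', [])]
    if isinstance(formats, list):
        return list(formats)
    if isinstance(formats, str):
        return [f.strip() for f in formats.split(',') if f.strip()]
    return [str(formats)] if formats else []

policy = {'queue_policy': {'formats_to_download': ['18', '140']}}
info = {'formats': [{'format_id': '18'}, {'format_id': '140'}]}
print(resolve_requested_formats(policy, info))              # ['18', '140'] -> two download tasks
print(resolve_requested_formats(policy, info, 'per_url'))   # ['18,140']    -> one download task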
@ -240,19 +368,19 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
if is_unavailable or is_private or is_deleted or is_dummy_skipped:
reason = "Video unavailable" if is_unavailable else "Private video" if is_private else "Video removed" if is_deleted else "Dummy skipped"
logger.warning(f"[Worker {worker_id}] [{profile_name}] Auth skipped for {url}: {reason}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_skipped(task_id, {"url": url, "reason": reason, "stderr": stderr})
else:
error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else "Dummy fatal failure" if "Dummy fatal failure" in stderr else f"Exit code {retcode}"
logger.error(f"[Worker {worker_id}] [{profile_name}] Authentication failed ({error_type}): {url}")
profile_manager_instance.record_activity(profile_name, 'failure')
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_failure(task_id, {"url": url, "error_type": error_type, "stderr": stderr, "exit_code": retcode})
except Exception as e:
logger.error(f"[Worker {worker_id}] [{profile_name}] Unexpected error processing task {task_id}: {e}", exc_info=True)
if task_id:
state_manager.report_auth_failure(task_id, {"error": f"Unexpected error: {str(e)}", "url": url or "unknown"})
profile_manager_instance.record_activity(profile_name, 'failure')
profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch))
finally:
if temp_task_dir and os.path.exists(temp_task_dir):
shutil.rmtree(temp_task_dir)
@ -262,7 +390,7 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
except Exception as e:
logger.error(f"[Worker {worker_id}] Unexpected error in outer worker loop: {e}", exc_info=True)
if locked_profile:
profile_manager_instance.record_activity(locked_profile['name'], 'failure')
profile_manager_instance.record_activity(locked_profile['name'], 'failure', is_dummy=(args.dummy or args.dummy_batch))
finally:
if locked_profile:
@ -276,16 +404,27 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
elif isinstance(val, int):
cooldown = val
except (json.JSONDecodeError, TypeError):
if cooldown_config.isdigit():
if isinstance(cooldown_config, str) and cooldown_config.isdigit():
cooldown = int(cooldown_config)
if cooldown:
logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.")
# For auth simulation, check if profile has pending downloads
# If so, don't apply cooldown to avoid conflicting with enforcer
profile_data = profile_manager_instance.get_profile(locked_profile['name'])
should_apply_cooldown = cooldown
if profile_data:
pending_downloads = profile_manager_instance.get_pending_downloads(locked_profile['name'])
rest_reason = profile_data.get('rest_reason')
if pending_downloads > 0 or rest_reason == 'waiting_downloads':
should_apply_cooldown = None
logger.info(f"[Worker {worker_id}] Auth profile '{locked_profile['name']}' has pending downloads or is waiting for downloads. Not applying cooldown.")
if should_apply_cooldown:
logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {should_apply_cooldown}s.")
profile_manager_instance.unlock_profile(
locked_profile['name'],
owner=owner_id,
rest_for_seconds=cooldown
rest_for_seconds=should_apply_cooldown
)
logger.info(f"[Worker {worker_id}] Queue auth worker exiting.")
@ -311,6 +450,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
logger.info(f"[Worker {worker_id}] Will save downloads to '{output_dir}'")
task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
@ -322,7 +462,14 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
task = state_manager.get_download_task()
if not task:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.debug(f"[Worker {worker_id}] No download tasks available in queue. Sleeping for {polling_interval}s.")
base_log_msg = f"[Worker {worker_id}] No download tasks available in queue."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
logger.debug(f"{base_log_msg} Sleeping for {polling_interval}s.")
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval)
continue
@ -377,20 +524,45 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
if specific_profile:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, specific_profile_name=specific_profile)
if not locked_profile:
logger.warning(f"[Worker {worker_id}] Could not lock specific profile '{specific_profile}'. Trying any profile with prefix.")
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix)
else:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix)
logger.warning(f"[Worker {worker_id}] Could not lock specific profile '{specific_profile}'. Trying any profile from configured prefixes.")
if not locked_profile:
# Lock from any of the specified prefixes
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
if not locked_profile:
# Fallback for empty/no prefix, or if no profile from list was lockable
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if not locked_profile:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.warning(f"[Worker {worker_id}] No profiles available for task {task_id}. Re-queueing and sleeping for {polling_interval}s.")
base_log_msg = f"[Worker {worker_id}] No profiles available, polling."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
logger.info(f"{base_log_msg} Re-queueing task {task_id} and sleeping for {polling_interval}s.")
last_no_task_log_msg = base_log_msg
# Re-queue the task by adding it back to the inbox.
state_manager.add_download_tasks_batch([task])
# The 'in_progress' marker for this attempt will be cleaned up by the finally block.
time.sleep(polling_interval)
continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
profile_name = locked_profile['name']
proxy_url = locked_profile['proxy']
logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' with proxy '{proxy_url}'")
@ -480,7 +652,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
except Exception as e:
logger.error(f"[Worker {worker_id}] Failed to move files to Airflow-ready directory: {e}", exc_info=True)
profile_manager_instance.record_activity(profile_name, 'download_success')
profile_manager_instance.record_activity(profile_name, 'download', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_download_success(task_id, {
"video_id": video_id, "url": url, "format_id": format_id,
"profile_name": profile_name, "proxy_url": proxy_url,
@ -508,7 +680,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
if is_unavailable or is_format_error:
logger.warning(f"[Worker {worker_id}] Download skipped: {video_id or url} format {format_id}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_download_skipped(task_id, {
"video_id": video_id, "url": url, "format_id": format_id,
"reason": "Video unavailable" if is_unavailable else "Format not available", "stderr": stderr
@ -516,23 +688,13 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
else:
error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else f"Exit code {retcode}"
logger.error(f"[Worker {worker_id}] Download failed ({error_type}): {video_id or url} format {format_id}")
profile_manager_instance.record_activity(profile_name, 'download_error')
profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_download_failure(task_id, {
"video_id": video_id, "url": url, "format_id": format_id,
"error_type": error_type, "stderr": stderr, "exit_code": retcode,
"original_task": task
})
# Decrement pending downloads counter on the original auth profile, regardless of outcome
if auth_profile_name and auth_env:
auth_manager = get_auth_manager(profile_manager_instance, auth_env)
if auth_manager:
auth_manager.decrement_pending_downloads(auth_profile_name)
else:
logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.")
elif task: # Only warn if we had a task but couldn't get metadata
logger.warning(f"Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented.")
except Exception as e:
logger.error(f"[Worker {worker_id}] Unexpected error: {e}", exc_info=True)
if task and task_id:
@ -543,9 +705,22 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
"error": f"Unexpected error: {str(e)}", "original_task": task
})
if locked_profile:
profile_manager_instance.record_activity(locked_profile['name'], 'download_error')
profile_manager_instance.record_activity(locked_profile['name'], 'download_error', is_dummy=(args.dummy or args.dummy_batch))
finally:
# Decrement pending downloads counter on the original auth profile, regardless of outcome.
# This is critical for releasing the auth profile from the 'waiting_downloads' state.
if task: # Only attempt to decrement if a task was actually pulled.
if auth_profile_name and auth_env:
auth_manager = get_auth_manager(profile_manager_instance, auth_env)
if auth_manager:
new_count = auth_manager.decrement_pending_downloads(auth_profile_name)
logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}'. New count: {new_count}")
else:
logger.error(f"[Worker {worker_id}] Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.")
else:
logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in task metadata. Pending downloads counter will not be decremented.")
if locked_profile:
cooldown = None
cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds')
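Moving the pending-downloads decrement into the finally block guarantees the counter on the originating auth profile is released exactly once per pulled task, whether the download succeeds, is skipped, or raises. A small sketch of the pattern, with the manager lookup reduced to a hypothetical callable:

def process_download_task(task, run_download, get_auth_manager):
    """Run one download task; always release the auth profile's
    pending-downloads slot for the task that was pulled."""
    auth_profile = task.get('auth_profile_name')
    auth_env = task.get('auth_env')
    try:
        return run_download(task)
    finally:
        if auth_profile and auth_env:
            manager = get_auth_manager(auth_env)
            if manager:
                remaining = manager.decrement_pending_downloads(auth_profile)
                print(f"pending downloads for {auth_profile}: {remaining}")

class ToyAuthManager:
    def __init__(self):
        self.pending = {'auth_01': 2}
    def decrement_pending_downloads(self, name):
        self.pending[name] -= 1
        return self.pending[name]

mgr = ToyAuthManager()
task = {'auth_profile_name': 'auth_01', 'auth_env': 'prod'}
process_download_task(task, run_download=lambda t: True,
                      get_auth_manager=lambda env: mgr)  # pending downloads for auth_01: 1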

View File

@ -904,6 +904,22 @@ class StateManager:
return self.queue_provider.remove_in_progress(self.queue_provider.AUTH_PROGRESS, task_id)
def add_auth_task(self, task: Dict) -> bool:
"""Add an authentication task to the queue."""
if not self.queue_provider:
logger.error("Queue provider not initialized")
return False
return self.queue_provider.add_task(self.queue_provider.AUTH_INBOX, task)
def add_auth_tasks_batch(self, tasks: List[Dict]) -> int:
"""Add a batch of authentication tasks to the queue."""
if not self.queue_provider:
logger.error("Queue provider not initialized")
return 0
return self.queue_provider.add_tasks_batch(self.queue_provider.AUTH_INBOX, tasks)
def add_download_task(self, task: Dict) -> bool:
"""Add a download task to the queue."""
if not self.queue_provider:

View File

@ -40,6 +40,7 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage
return []
no_task_streak = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
@ -48,7 +49,15 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage
# 0. If no tasks were found previously, pause briefly.
if no_task_streak > 0:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.info(f"[Worker {worker_id}] No tasks found in previous attempt(s). Pausing for {polling_interval}s. (Streak: {no_task_streak})")
base_log_msg = f"[Worker {worker_id}] No available tasks found for any active profiles."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
full_log_msg = f"{base_log_msg} Pausing for {polling_interval}s. (Streak: {no_task_streak})"
logger.info(full_log_msg)
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval)
if state_manager.shutdown_event.is_set(): continue
@ -60,9 +69,6 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage
if not locked_profile:
# No task/profile combo was available.
no_task_streak += 1
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.info(f"[Worker {worker_id}] No available tasks found for any active profiles. Pausing for {polling_interval}s.")
time.sleep(polling_interval)
continue
profile_name = locked_profile['name']
@ -70,6 +76,9 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage
# We have a task and a lock.
if claimed_task_path:
no_task_streak = 0 # Reset streak
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
# 3. Process the task
try:
with open(claimed_task_path, 'r', encoding='utf-8') as f:

View File

@ -319,7 +319,7 @@ def apply_overrides(policy, overrides):
return policy
def display_effective_policy(policy, name, sources=None, profile_names=None, original_workers_setting=None):
def display_effective_policy(policy, name, args, sources=None, profile_names=None, original_workers_setting=None):
"""Prints a human-readable summary of the effective policy."""
logger = logging.getLogger(__name__)
logger.info(f"--- Effective Policy: {name} ---")
@ -328,6 +328,8 @@ def display_effective_policy(policy, name, sources=None, profile_names=None, ori
orchestration_mode = settings.get('orchestration_mode')
logger.info(f"Mode: {settings.get('mode', 'full_stack')}")
if args and args.profile_prefix:
logger.info(f"Profile Prefix (from CLI): {args.profile_prefix}")
if profile_names:
num_profiles = len(profile_names)
logger.info(f"Profiles found: {num_profiles}")

View File

@ -188,7 +188,7 @@ def find_task_and_lock_profile(profile_manager, owner_id, profile_prefix, policy
if not info_json_dir:
return None, None
logger.info(f"[Worker {worker_id}] Scanning for tasks in '{info_json_dir}'...")
logger.debug(f"[Worker {worker_id}] Scanning for tasks in '{info_json_dir}'...")
# 1. Get all available task files and group by profile
try:
@ -223,7 +223,21 @@ def find_task_and_lock_profile(profile_manager, owner_id, profile_prefix, policy
# 2. Get ACTIVE profiles from Redis.
active_profiles = profile_manager.list_profiles(state_filter='ACTIVE')
active_profile_names = {p['name'] for p in active_profiles if p['name'].startswith(profile_prefix or '')}
prefixes_to_check = []
if profile_prefix:
prefixes_to_check = [p.strip() for p in profile_prefix.split(',') if p.strip()]
active_profile_names = set()
if prefixes_to_check:
for p in active_profiles:
for prefix in prefixes_to_check:
if p['name'].startswith(prefix):
active_profile_names.add(p['name'])
break
else:
# If no prefixes are specified, consider all active profiles.
active_profile_names = {p['name'] for p in active_profiles}
# 3. Find profiles that are both ACTIVE and have tasks.
candidate_profiles = list(active_profile_names.intersection(tasks_by_profile.keys()))
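find_task_and_lock_profile now matches ACTIVE profiles against any of the configured prefixes instead of a single one. A short sketch of that filtering step, assuming profiles are plain dicts with a 'name' key as in the surrounding code:

def filter_active_by_prefixes(active_profiles, profile_prefix):
    """Return the names of active profiles matching any configured prefix;
    with no prefix configured, every active profile qualifies."""
    prefixes = [p.strip() for p in (profile_prefix or '').split(',') if p.strip()]
    if not prefixes:
        return {p['name'] for p in active_profiles}
    return {p['name'] for p in active_profiles
            if any(p['name'].startswith(prefix) for prefix in prefixes)}

active = [{'name': 'user_01'}, {'name': 'dl_02'}, {'name': 'spare_03'}]
print(filter_active_by_prefixes(active, 'user,dl_'))  # {'user_01', 'dl_02'}
print(filter_active_by_prefixes(active, None))        # all three names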
@ -918,7 +932,8 @@ def process_info_json_cycle(path, content, policy, state_manager, args, running_
result['proxy_url'] = proxy_url
if profile_manager_instance and profile_name:
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
is_dummy = args.dummy or args.dummy_batch
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy)
state_manager.log_event(result)
results.append(result)
@ -960,14 +975,15 @@ def process_info_json_cycle(path, content, policy, state_manager, args, running_
# --- Record activity on the DOWNLOAD profile that performed the work ---
if profile_manager_instance and profile_name:
is_dummy = args.dummy or args.dummy_batch
if result.get('success'):
profile_manager_instance.record_activity(profile_name, 'download')
profile_manager_instance.record_activity(profile_name, 'download', is_dummy=is_dummy)
elif result.get('error_type') == 'Cancelled':
pass # Do not record cancellations
elif result.get('is_tolerated_error'):
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy)
else:
profile_manager_instance.record_activity(profile_name, 'download_error')
profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=is_dummy)
state_manager.log_event(result)
results.append(result)

View File

@ -495,7 +495,7 @@ def main_stress_policy(args):
exec_control['workers'] = calculated_workers
logger.info(f"Calculated 'auto' workers for throughput mode: {calculated_workers} (based on {len(matching_profiles)} profiles with prefix '{profile_prefix}').")
sp_utils.display_effective_policy(policy, policy_name, sources=[], original_workers_setting=original_workers_setting)
sp_utils.display_effective_policy(policy, policy_name, args, sources=[], original_workers_setting=original_workers_setting)
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
@ -578,15 +578,45 @@ def main_stress_policy(args):
logger.info(f"Starting/resuming from URL index {start_index + 1}.")
# The worker's get_next_url_batch will respect this starting index.
sp_utils.display_effective_policy(policy, policy_name, sources=urls_list)
sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list)
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
futures = [
executor.submit(run_direct_batch_worker, i, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
for i in range(workers)
]
worker_pools = exec_control.get('worker_pools', [])
if not worker_pools and exec_control.get('workers'):
prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix')
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
for i in range(pool_workers):
assigned_prefix = prefixes[i % len(prefixes)]
worker_policy = deepcopy(policy)
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = assigned_prefix
worker_specs.append({
'func': run_direct_batch_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
# Wait for all workers to complete. They will exit their loops when no URLs are left.
concurrent.futures.wait(futures)
if shutdown_event.is_set():
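Each mode now expands execution_control.worker_pools (falling back to the flat workers count) into per-worker policies, and a CLI --profile-prefix overrides the pool prefixes. The direct batch mode distributes a pool's prefixes round-robin across its workers; the other modes hand the full comma-separated string to every worker. A sketch of the round-robin variant, with the per-worker callable and its runtime arguments left out:

from copy import deepcopy

def build_worker_specs(policy, cli_profile_prefix=None, default_prefix='user'):
    """Expand worker_pools into per-worker policies (round-robin prefix assignment)."""
    exec_control = policy.get('execution_control', {})
    pools = deepcopy(exec_control.get('worker_pools', []))
    if not pools and exec_control.get('workers'):
        prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix')
        pools = [{'workers': exec_control['workers'], 'profile_prefix': prefix or default_prefix}]
    if cli_profile_prefix:
        for pool in pools:
            pool['profile_prefix'] = cli_profile_prefix
    specs, worker_id = [], 0
    for pool in pools:
        prefixes = [p.strip() for p in (pool.get('profile_prefix') or '').split(',') if p.strip()]
        if not prefixes:
            continue  # a pool without prefixes is skipped, mirroring the warning above
        for i in range(pool.get('workers', 1)):
            worker_policy = deepcopy(policy)
            worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefixes[i % len(prefixes)]
            specs.append({'worker_id': worker_id, 'policy': worker_policy})
            worker_id += 1
    return specs

policy = {'execution_control': {'worker_pools': [{'workers': 3, 'profile_prefix': 'user,dl_'}]}}
for spec in build_worker_specs(policy):
    print(spec['worker_id'], spec['policy']['info_json_generation_policy']['profile_prefix'])
# 0 user
# 1 dl_
# 2 user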
@ -662,14 +692,46 @@ def main_stress_policy(args):
if start_index > 0:
logger.info(f"Starting/resuming from URL index {start_index + 1}.")
sp_utils.display_effective_policy(policy, policy_name, sources=urls_list)
sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list)
if args.dry_run: return 0
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
futures = [
executor.submit(run_direct_docker_worker, i, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
for i in range(workers)
]
worker_pools = exec_control.get('worker_pools', [])
if not worker_pools and exec_control.get('workers'):
prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix')
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_direct_docker_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
# This worker runs until shutdown, like the download worker
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for direct docker workers to finish...")
@ -686,14 +748,46 @@ def main_stress_policy(args):
logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}")
return 1
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
futures = [
executor.submit(run_direct_docker_download_worker, i, policy, state_manager, args, profile_manager_instance, running_processes, process_lock)
for i in range(workers)
]
worker_pools = exec_control.get('worker_pools', [])
if not worker_pools and exec_control.get('workers'):
prefix = policy.get('download_policy', {}).get('profile_prefix')
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_direct_docker_download_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
# This worker runs until shutdown
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for direct docker download workers to finish...")
@ -726,15 +820,46 @@ def main_stress_policy(args):
logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}")
return 1
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
futures = [
executor.submit(run_direct_download_worker, i, policy, state_manager, args, download_manager, running_processes, process_lock)
for i in range(workers)
]
worker_pools = exec_control.get('worker_pools', [])
if not worker_pools and exec_control.get('workers'):
prefix = policy.get('download_policy', {}).get('profile_prefix')
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_direct_download_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, download_manager, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for direct download workers to finish...")
concurrent.futures.wait(futures)
@ -786,6 +911,15 @@ def main_stress_policy(args):
logger.error(f"Failed to create save directory '{save_dir}': {e}")
return 1
# In dummy mode, do not populate the queue. Warn the user instead.
if args.dummy or args.dummy_batch:
input_queue = queue_policy.get('input_queue', 'queue2_auth_inbox')
logger.warning("--- Dummy Mode Notice ---")
logger.warning(f"Dummy mode is enabled. The tool will NOT automatically populate the queue.")
logger.warning(f"Please ensure the input queue '{input_queue}' contains tasks for the workers to process.")
logger.warning(f"You can populate it using another tool, for example: ./bin/ytops-client queue push {input_queue} --payload-json '{{\"url\":\"https://youtu.be/dQw4w9WgXcQ\"}}' --count 100")
logger.warning("-------------------------")
# Requeue failed tasks if requested
if args.requeue_failed:
requeued = state_manager.requeue_failed_auth_tasks(
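In dummy mode the runner no longer seeds the inbox, so tasks have to be pushed beforehand; the ytops-client command in the warning above is one way. The sketch below does the same directly against Redis and is only an assumption: it treats the inbox as a plain Redis list of JSON payloads, which may not match the real queue provider's encoding or key naming.

import json
# Requires the 'redis' package; connection details below are placeholders.
import redis

def push_dummy_auth_tasks(queue_name='queue2_auth_inbox', count=10,
                          host='localhost', port=6379, password=None):
    """Push simple dummy URL payloads onto a Redis-list-backed inbox."""
    client = redis.Redis(host=host, port=port, password=password)
    payloads = [json.dumps({'url': f'https://youtube.com/watch?v=dummy_{i:04d}'})
                for i in range(count)]
    client.rpush(queue_name, *payloads)
    return len(payloads)

# push_dummy_auth_tasks(count=100)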
@ -793,15 +927,46 @@ def main_stress_policy(args):
)
logger.info(f"Requeued {requeued} failed authentication tasks.")
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
futures = [
executor.submit(run_queue_auth_worker, i, policy, state_manager, args, auth_manager, running_processes, process_lock)
for i in range(workers)
]
worker_pools = exec_control.get('worker_pools', [])
if not worker_pools and exec_control.get('workers'):
prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix')
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_queue_auth_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, auth_manager, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for queue auth workers to finish...")
concurrent.futures.wait(futures)
@ -843,6 +1008,17 @@ def main_stress_policy(args):
env_prefix=env_prefix
)
# In dummy mode, do not populate the queue. Warn the user instead.
if args.dummy or args.dummy_batch:
input_queue = queue_policy.get('input_queue', 'queue2_dl_inbox')
logger.warning("--- Dummy Mode Notice ---")
logger.warning(f"Dummy mode is enabled. The tool will NOT automatically populate the queue.")
logger.warning(f"Please ensure the input queue '{input_queue}' contains tasks for the workers to process.")
logger.warning("-------------------------")
# Get download policy
d_policy = policy.get('download_policy', {})
# Create output directory if specified
output_dir = d_policy.get('output_dir')
if output_dir:
@ -860,15 +1036,46 @@ def main_stress_policy(args):
)
logger.info(f"Requeued {requeued} failed download tasks.")
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
futures = [
executor.submit(run_queue_download_worker, i, policy, state_manager, args, download_manager, running_processes, process_lock)
for i in range(workers)
]
worker_pools = exec_control.get('worker_pools', [])
if not worker_pools and exec_control.get('workers'):
prefix = policy.get('download_policy', {}).get('profile_prefix')
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_queue_download_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, download_manager, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for queue download workers to finish...")
concurrent.futures.wait(futures)
@ -949,7 +1156,7 @@ def main_stress_policy(args):
)
logger.info(f"Requeued {requeued_auth} failed authentication tasks and {requeued_dl} failed download tasks.")
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
# Start both auth and download workers
@ -1127,6 +1334,7 @@ def main_stress_policy(args):
sp_utils.display_effective_policy(
policy,
policy_name,
args,
sources=sources,
profile_names=None, # Profile grouping is removed
original_workers_setting=original_workers_setting
@ -1254,12 +1462,22 @@ def main_stress_policy(args):
# If marking by rename is on, do that.
if settings.get('mark_processed_files'):
try:
processed_dir = settings.get('processed_files_dir')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
new_path = source.parent / f"{source.name}.{timestamp}.processed"
new_filename = f"{source.name}.{timestamp}.processed"
if processed_dir:
dest_path = Path(processed_dir) / new_filename
dest_path.parent.mkdir(parents=True, exist_ok=True)
shutil.move(str(source), str(dest_path))
logger.info(f"Marked '{source.name}' as processed by moving to '{dest_path}'")
else:
# Fallback to old behavior: rename in place
new_path = source.parent / new_filename
source.rename(new_path)
logger.info(f"Marked '{source.name}' as processed by renaming to '{new_path.name}'")
except (IOError, OSError) as e:
logger.error(f"Failed to rename processed file '{source.name}': {e}")
logger.error(f"Failed to mark processed file '{source.name}': {e}")
# When using profile-aware mode, the file processing (including marking as
# processed) is handled inside process_profile_task.
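The marking step above now honours an optional processed_files_dir: when set, processed sources are moved there; otherwise they are renamed in place with a timestamped .processed suffix as before. A standalone sketch of that helper, with names chosen for illustration:

import shutil
from datetime import datetime
from pathlib import Path
from typing import Optional

def mark_processed(source: Path, processed_dir: Optional[str] = None) -> Path:
    """Move (or rename in place) a processed source file with a timestamped suffix."""
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    new_filename = f"{source.name}.{timestamp}.processed"
    if processed_dir:
        dest = Path(processed_dir) / new_filename
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(source), str(dest))
        return dest
    dest = source.parent / new_filename
    source.rename(dest)
    return dest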