Update ansible: auth and downloads run on the dl machines, while the enforcer runs on master

aperez 2026-01-11 01:21:25 +03:00
parent db78171281
commit 7a514ab8ce
45 changed files with 3377 additions and 471 deletions

View File

@ -1,7 +1,5 @@
vault_redis_password: "rOhTAIlTFFylXsjhqwxnYxDChFc"
-vault_postgres_password: "pgdb_pwd_A7bC2xY9zE1wV5uP"
vault_redis_port: 52909
-vault_airflow_admin_password: "2r234sdfrt3q454arq45q355"
-vault_flower_password: "dO4eXm7UkF81OdMvT8E2tIKFtPYPCzyzwlcZ4RyOmCsmG4qzrNFqM5sNTOT9"
vault_vnc_password: "vnc_pwd_Z5xW8cV2bN4mP7lK"
vault_ss_password_1: "UCUAR7vRO/u9Zo71nfA13c+/b1MCiJpfZJo+EmEBCfA="
vault_ss_password_2: "tgtQcfjJp/A3F01g4woO0bEQoxij3CAOK/iR1OTPuF4="

View File

@ -0,0 +1,102 @@
---
# This task file is meant to be included via include_tasks; it is not a standalone playbook.
# It provides generic tasks for managing a process within a tmux session.
# Required variables:
# - tmux_session_name: The name of the tmux session.
# - working_dir: The directory where the command should be executed.
# - command_to_run: The command to execute inside the tmux session.
# - process_grep_pattern: A regex pattern to identify the process for pkill/status check.
#
# Optional variables (control flags):
# - start_process: (bool) Set to true to start the process. Default: false.
# - stop_process: (bool) Set to true to stop the process. Default: false.
# - check_status: (bool) Set to true to check and display the process status. Default: false.
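#
# Illustrative include (a sketch only; the session name, directory, command,
# and grep pattern below are placeholders, not values used by this repo):
# - name: Manage an example process
#   ansible.builtin.include_tasks:
#     file: manage-processes-tasks.yml
#   vars:
#     tmux_session_name: "stress-example"
#     working_dir: "/srv/app"
#     command_to_run: "./bin/ytops-client --help"
#     process_grep_pattern: "ytops-client"
#     start_process: true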
- name: Ensure tmux is installed
ansible.builtin.apt:
name: tmux
state: present
become: yes
run_once: true # No need to run this on every include
- name: Stop existing tmux session
ansible.builtin.shell:
cmd: "tmux kill-session -t {{ tmux_session_name }} 2>/dev/null || true"
when: stop_process | default(false) | bool
- name: Kill any orphaned processes
ansible.builtin.shell:
cmd: |
PIDS=$(ps aux | grep -E "{{ process_grep_pattern }}" | grep -v "grep" | awk '{print $2}')
if [ -n "$PIDS" ]; then
kill $PIDS >/dev/null 2>&1 || true
sleep 0.5
kill -9 $PIDS >/dev/null 2>&1 || true
fi
changed_when: false
when: stop_process | default(false) | bool
- name: Stop existing process before starting (makes start idempotent)
block:
- name: Stop existing tmux session
ansible.builtin.shell:
cmd: "tmux kill-session -t {{ tmux_session_name }} 2>/dev/null || true"
- name: Kill any orphaned processes
ansible.builtin.shell:
cmd: |
PIDS=$(ps aux | grep -E "{{ process_grep_pattern }}" | grep -v "grep" | awk '{print $2}')
if [ -n "$PIDS" ]; then
kill $PIDS >/dev/null 2>&1 || true
sleep 0.5
kill -9 $PIDS >/dev/null 2>&1 || true
fi
changed_when: false
when: start_process | default(false) | bool
- name: Ensure client script is executable
ansible.builtin.file:
path: "{{ working_dir }}/bin/ytops-client"
mode: "a+x"
when: start_process | default(false) | bool
- name: Display command for tmux session
ansible.builtin.debug:
msg: "Command for tmux session '{{ tmux_session_name }}': {{ command_to_run }}"
when: start_process | default(false) | bool
- name: Start process in tmux session
ansible.builtin.shell:
cmd: |
cd {{ working_dir }}
# The command is wrapped with a final sleep to keep the tmux session alive for debugging even if the process fails.
# The actual command is run in a subshell (...) to prevent 'exec' from terminating the parent shell.
tmux new-session -d -s {{ tmux_session_name }} \
"if [ -f .env ]; then set -a; . ./.env; set +a; fi; \
COMMAND_TO_RUN='{{ command_to_run | replace("'", "'\\''") }}'; \
echo '>>> Running command:'; \
echo "\$COMMAND_TO_RUN"; \
echo '---'; \
(eval "\$COMMAND_TO_RUN") ; \
echo; echo '---'; echo 'Process exited. This tmux session will remain open for debugging.'; echo 'You can attach with: tmux attach -t {{ tmux_session_name }}'; \
sleep 3600"
when: start_process | default(false) | bool
- name: Check process status
ansible.builtin.shell:
cmd: |
echo "Process status on {{ inventory_hostname }} for session '{{ tmux_session_name }}':"
if tmux has-session -t {{ tmux_session_name }} 2>/dev/null; then
echo " - Tmux session '{{ tmux_session_name }}' is running"
ps aux | grep -E "{{ process_grep_pattern }}" | grep -v grep || echo " - No matching process found"
else
echo " - Tmux session '{{ tmux_session_name }}' is NOT running"
fi
register: status_check
changed_when: false
when: check_status | default(false) | bool
- name: Display status
ansible.builtin.debug:
msg: "{{ status_check.stdout_lines }}"
when: check_status | default(false) | bool

ansible/playbook-README.md (Normal file, 219 lines added)
View File

@ -0,0 +1,219 @@
# Ansible Playbooks Documentation
This document provides an overview of all available playbooks, their purpose, and how to use them operationally.
## Table of Contents
1. [Deployment Workflow](#deployment-workflow)
2. [Initial Setup Playbooks](#initial-setup-playbooks)
3. [Operational Playbooks](#operational-playbooks)
4. [Monitoring and Inspection](#monitoring-and-inspection)
5. [Common Operations](#common-operations)
## Deployment Workflow
The typical deployment workflow follows these steps:
1. **Initial Setup**: Configure machines, deploy proxies, install dependencies
2. **Master Setup**: Configure Redis, MinIO, and other infrastructure services
3. **Worker Setup**: Configure auth generators and download simulators
4. **Operational Management**: Start/stop processes, monitor status, cleanup profiles
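Put together, a typical end-to-end rollout is roughly the following sequence (a condensed sketch; each playbook is documented in detail in the sections below):
```bash
# Steps 1-3: install and configure all nodes (base system, proxies, code, services)
ansible-playbook ansible/playbook-full-install.yml -i ansible/inventory.green.ini

# Step 4: start the enforcer and monitor on master, then the worker processes
ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "start_enforcer=true start_monitor=true"
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=start"
```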
## Initial Setup Playbooks
For a complete, automated installation on fresh nodes, you can use the main `full-install` playbook:
```bash
# Run the complete installation process on all nodes
ansible-playbook ansible/playbook-full-install.yml -i ansible/inventory.green.ini
```
The steps below are for running each part of the installation manually.
### Base Installation
```bash
# Deploy base system requirements to all nodes
ansible-playbook ansible/playbook-base-system.yml -i ansible/inventory.green.ini
```
### Proxy Deployment
```bash
# Deploy shadowsocks proxies to all nodes (as defined in cluster config)
ansible-playbook ansible/playbook-proxies.yml -i ansible/inventory.green.ini
```
### Code, Environment, and Dependencies
```bash
# Sync code to all nodes
ansible-playbook ansible/playbook-stress-sync-code.yml -i ansible/inventory.green.ini
# Generate .env files for all nodes
ansible-playbook ansible/playbook-stress-generate-env.yml -i ansible/inventory.green.ini
# Install dependencies on all nodes
ansible-playbook ansible/playbook-stress-install-deps.yml -i ansible/inventory.green.ini
```
## Operational Playbooks
### Master Node Services
Redis and MinIO are deployed as Docker containers during the initial setup (`playbook-full-install.yml`).
To start the policy enforcer and monitoring tmux sessions on the master node:
```bash
# Start policy enforcer and monitoring on master
ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "start_enforcer=true start_monitor=true"
```
To stop processes on the master node:
```bash
# Stop ONLY the policy enforcer
ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "stop_enforcer=true"
# Stop ONLY the monitor
ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "stop_monitor=true"
# Stop BOTH the enforcer and monitor
ansible-playbook ansible/playbook-stress-manage-processes.yml -i ansible/inventory.green.ini -e "stop_sessions=true"
```
### Worker Node Processes
```bash
# Start auth generators and download simulators on all workers
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=start"
# Stop all processes on workers
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=stop"
# Check status of all worker processes
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=status"
```
### Profile Management
```bash
# Clean up all profiles
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles"
# Clean up specific profile prefix
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=cleanup-profiles" -e "profile_prefix=user1"
```
## Monitoring and Inspection
### Status Checks
```bash
# Check status of all processes on all nodes
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=status"
# Check enforcer status on master
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=check-enforcer"
# Check profile status
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=profile-status"
```
### Log Inspection
```bash
# View tmux session output on a specific node
ssh user@hostname
tmux attach -t stress-auth-user1-user2      # For auth generator (commas in profile_prefix become dashes in the session name)
tmux attach -t stress-download-user1-user2  # For download simulator
tmux attach -t stress-enforcer # For policy enforcer on master
```
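If you only need a quick snapshot of a session's recent output without attaching, `tmux capture-pane` (the same call the playbooks use for their own status output) works; for example:
```bash
# Print the last 50 lines from the enforcer session's pane
tmux capture-pane -p -t stress-enforcer | tail -n 50
```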
## Common Operations
### Code and Policy Updates
First, sync your local changes to the jump host from your development machine:
```bash
# Sync project to jump host
./tools/sync-to-jump.sh
```
Then, from the jump host, you can sync code or policies to the cluster nodes:
```bash
# Sync all application code (Python sources, scripts, etc.)
ansible-playbook ansible/playbook-stress-sync-code.yml -i ansible/inventory.green.ini
# Sync only policies and CLI configs
ansible-playbook ansible/playbook-stress-sync-configs.yml -i ansible/inventory.green.ini
```
### Adding a New Worker
1. Update `cluster.green.yml` with the new worker definition:
```yaml
workers:
new-worker:
ip: x.x.x.x
port: 22
profile_prefixes:
- "user4"
proxies:
- "sslocal-rust-1090"
```
2. Regenerate inventory:
```bash
./tools/generate-inventory.py cluster.green.yml
```
3. Run the full installation playbook, limiting it to the new worker:
```bash
ansible-playbook ansible/playbook-full-install.yml -i ansible/inventory.green.ini --limit new-worker
```
4. Start processes on the new worker:
```bash
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=start" --limit new-worker
```
### Removing a Worker
1. Stop all processes on the worker:
```bash
ansible-playbook ansible/playbook-stress-lifecycle.yml -i ansible/inventory.green.ini -e "action=stop" --limit worker-to-remove
```
2. Remove the worker from `cluster.green.yml`
3. Regenerate inventory:
```bash
./tools/generate-inventory.py cluster.green.yml
```
### Emergency Stop All
```bash
# Stop all processes on all nodes
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=stop-all"
```
### Stopping Specific Nodes
To stop processes on a specific worker or group of workers, you can use the `stop-nodes` action and limit the playbook run.
```bash
# Stop all processes on a single worker (e.g., dl003)
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=stop-nodes" --limit dl003
# Stop all processes on all workers
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=stop-nodes" --limit workers
```
### Restart Enforcer and Monitoring
```bash
# Restart monitoring and enforcer on master
ansible-playbook ansible/playbook-stress-control.yml -i ansible/inventory.green.ini -e "action=restart-monitoring"
```

View File

@ -3,9 +3,15 @@
hosts: all
gather_facts: yes
vars_files:
-- "group_vars/all/generated_vars.stress.yml" # Assumes generate-inventory.py was run with cluster.stress.yml
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
- name: Announce base system setup
ansible.builtin.debug:
msg: "Starting base system setup on {{ inventory_hostname }}"

View File

@ -20,12 +20,19 @@
hosts: all
gather_facts: no
vars_files:
-- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
-base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}"
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Ensure base directories and subdirectories exist
ansible.builtin.file:
@ -41,12 +48,12 @@
- "run/docker_mount/fetched_info_jsons" - "run/docker_mount/fetched_info_jsons"
become: yes become: yes
- name: "PHASE 2.2: Import playbook to install Python dependencies" - name: "PHASE 2.2: Import playbook to sync local code"
import_playbook: playbook-stress-install-deps.yml
- name: "PHASE 2.3: Import playbook to sync local code"
import_playbook: playbook-stress-sync-code.yml import_playbook: playbook-stress-sync-code.yml
- name: "PHASE 2.3: Import playbook to install Python dependencies"
import_playbook: playbook-stress-install-deps.yml
# ------------------------------------------------------------------------------------------------- # -------------------------------------------------------------------------------------------------
# PHASE 3: Environment and Service Configuration # PHASE 3: Environment and Service Configuration
# Generates the .env file and starts the role-specific services on master and workers. # Generates the .env file and starts the role-specific services on master and workers.
@ -55,11 +62,17 @@
import_playbook: playbook-stress-generate-env.yml
- name: "PHASE 3.2: Master Node Services Setup"
-hosts: airflow_master
hosts: master
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
-- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Configure system performance and kernel settings
ansible.builtin.copy:
@ -94,6 +107,16 @@
mode: '0644'
become: yes
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -aq --filter "name=bgutil-provider" | xargs -r docker rm -f
docker ps -aq --filter "name=redis-stress" | xargs -r docker rm -f
docker ps -aq --filter "name=minio-stress" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
- name: Start master services (Redis, MinIO)
community.docker.docker_compose_v2:
project_src: "{{ airflow_master_dir }}"
@ -148,16 +171,63 @@
environment:
HOME: "/home/{{ ansible_user }}"
-- name: "PHASE 3.3: Shared Storage Setup (s3fs)"
- name: "PHASE 3.2a: Worker Node Services Setup"
-hosts: airflow_master:airflow_workers
hosts: workers
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
-- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Template Docker Compose file for worker services
ansible.builtin.template:
src: templates/docker-compose.stress-master.j2
dest: "{{ airflow_worker_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0644'
become: yes
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -aq --filter "name=bgutil-provider" | xargs -r docker rm -f
docker ps -aq --filter "name=redis-stress" | xargs -r docker rm -f
docker ps -aq --filter "name=minio-stress" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
- name: Start worker services
community.docker.docker_compose_v2:
project_src: "{{ airflow_worker_dir }}"
files:
- docker-compose.stress.yml
state: present
remove_orphans: true
become: yes
- name: "PHASE 3.3: Shared Storage Setup (s3fs)"
hosts: master:workers
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
-base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}"
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Mount S3 buckets via s3fs
block:
@ -176,14 +246,124 @@
mode: '0600'
become: yes
- name: Check if mount points are already mounted
ansible.builtin.shell:
cmd: "mount | grep -q '{{ item.path }}'"
loop:
- { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' }
- { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' }
register: mount_check
changed_when: false
failed_when: false
- name: Ensure mount point directories exist (only if not mounted)
ansible.builtin.file:
path: "{{ item.item.path }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
loop: "{{ mount_check.results }}"
when: item.rc != 0
become: yes
- name: Mount S3 buckets for stress testing
ansible.posix.mount:
src: "s3fs#{{ item.bucket }}"
path: "{{ item.path }}"
fstype: fuse
-opts: "_netdev,allow_other,use_path_request_style,nonempty,url=http://{{ hostvars[groups['airflow_master'][0]].ansible_host }}:9000,passwd_file=/home/{{ ansible_user }}/.passwd-s3fs"
opts: "_netdev,allow_other,use_path_request_style,nonempty,url=http://{{ hostvars[groups['master'][0]].ansible_host }}:9000,passwd_file=/home/{{ ansible_user }}/.passwd-s3fs"
state: mounted
loop:
- { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' }
- { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' }
become: yes
- name: "PHASE 3.4: Import playbook to initialize Redis profiles"
import_playbook: playbook-stress-init-redis.yml
# -------------------------------------------------------------------------------------------------
# PHASE 4: Monitoring and Management Services Setup
# Starts monitoring, enforcer, and simulation processes.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 4.1: Import playbook to manage monitoring and enforcer processes"
import_playbook: playbook-stress-manage-processes.yml
- name: "PHASE 4.2: Start monitoring and enforcer services"
hosts: master
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
start_monitor: true
start_enforcer: true
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Ensure tmux is installed
ansible.builtin.apt:
name: tmux
state: present
become: yes
- name: Start profile monitoring in tmux session
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
tmux new-session -d -s stress-monitor \
"set -a && . ./.env && set +a && \
./bin/ytops-client profile list \
--auth-env sim_auth \
--download-env sim_download \
--live \
--no-blink \
--show-reasons"
when: start_monitor | default(false) | bool
- name: Start policy enforcer in tmux session
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
tmux new-session -d -s stress-enforcer \
"set -a && . ./.env && set +a && \
./bin/ytops-client policy-enforcer \
--policy policies/8_unified_simulation_enforcer.yaml \
--live"
when: start_enforcer | default(false) | bool
- name: List active tmux sessions
ansible.builtin.shell:
cmd: tmux list-sessions
register: tmux_sessions
changed_when: false
- name: Display active sessions
ansible.builtin.debug:
msg: "Active tmux sessions: {{ tmux_sessions.stdout_lines }}"
# -------------------------------------------------------------------------------------------------
# PHASE 5: Simulation Workload Generation (Optional - can be run manually)
# These playbooks are available but not automatically started by default.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 5.1: Note about simulation workload generation"
hosts: localhost
gather_facts: no
tasks:
- name: Display note about simulation playbooks
ansible.builtin.debug:
msg: |
Simulation workload generation playbooks are available:
- ansible/playbook-stress-auth-generator.yml
- ansible/playbook-stress-download-simulation.yml
To start simulations manually, run:
ansible-playbook ansible/playbook-stress-auth-generator.yml \
-e "start_generator=true dummy_batch=true auth_min_seconds=2 auth_max_seconds=3"
ansible-playbook ansible/playbook-stress-download-simulation.yml \
-e "start_download=true profile_prefix=user1 download_min_seconds=2 download_max_seconds=5" \
--limit workers[0]

View File

@ -0,0 +1,404 @@
---
# This playbook provides a complete installation for fresh nodes.
# It can install either master or worker roles, or both on the same machine.
#
# Usage examples:
# # Install everything on all nodes
# ansible-playbook ansible/playbook-full-install.yml
#
# # Install only on workers
# ansible-playbook ansible/playbook-full-install.yml --limit workers
#
# # Install only on master
# ansible-playbook ansible/playbook-full-install.yml --limit master
#
# # Install both roles on a single machine
# ansible-playbook ansible/playbook-full-install.yml --limit specific-host -e "install_master=true install_worker=true"
# -------------------------------------------------------------------------------------------------
# PHASE 1: Base System Configuration
# Ensures all nodes have the necessary base packages, user configurations, and Docker installed.
# -------------------------------------------------------------------------------------------------
- name: "PHASE 1: Import base system setup playbook"
import_playbook: playbook-base-system.yml
# -------------------------------------------------------------------------------------------------
# PHASE 2: Generate Environment Configuration
# Creates .env files needed by all subsequent steps
# -------------------------------------------------------------------------------------------------
- name: "PHASE 2: Generate .env configuration"
import_playbook: playbook-stress-generate-env.yml
# -------------------------------------------------------------------------------------------------
# PHASE 3: Docker Network Setup
# Ensures the shared Docker network exists before building containers
# -------------------------------------------------------------------------------------------------
- name: "PHASE 3: Ensure Docker network exists"
hosts: all
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Create shared Docker network
community.docker.docker_network:
name: "{{ docker_network_name }}"
driver: bridge
become: yes
# -------------------------------------------------------------------------------------------------
# PHASE 4: Build yt-dlp Docker Image
# Builds the yt-dlp container from bin/ directory
# -------------------------------------------------------------------------------------------------
- name: "PHASE 4: Build yt-dlp Docker image"
hosts: all
gather_facts: no
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if (inventory_hostname in groups['master'] and not (install_worker | default(false) | bool)) else airflow_worker_dir }}"
- name: Ensure bin directory exists
ansible.builtin.file:
path: "{{ base_dir }}/bin"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Check if Dockerfile exists in bin directory
ansible.builtin.stat:
path: "{{ base_dir }}/bin/Dockerfile"
register: dockerfile_stat
- name: Build yt-dlp Docker image if Dockerfile exists
community.docker.docker_image:
name: yt-dlp-custom
tag: latest
source: build
build:
path: "{{ base_dir }}/bin"
pull: yes
state: present
force_source: yes
become: yes
when: dockerfile_stat.stat.exists
- name: Display message if Dockerfile not found
ansible.builtin.debug:
msg: "Dockerfile not found at {{ base_dir }}/bin/Dockerfile - skipping yt-dlp image build"
when: not dockerfile_stat.stat.exists
# -------------------------------------------------------------------------------------------------
# PHASE 5: Sync Code and Install Dependencies
# Copies application code and installs Python dependencies
# -------------------------------------------------------------------------------------------------
- name: "PHASE 5.1: Sync application code"
import_playbook: playbook-stress-sync-code.yml
- name: "PHASE 5.2: Install Python dependencies"
import_playbook: playbook-stress-install-deps.yml
# -------------------------------------------------------------------------------------------------
# PHASE 6: Deploy Shadowsocks Proxies
# Configures and starts proxy services
# -------------------------------------------------------------------------------------------------
- name: "PHASE 6: Deploy proxy services"
import_playbook: playbook-proxies.yml
# -------------------------------------------------------------------------------------------------
# PHASE 7: Install bgutils
# Note: Currently bgutils is deployed on master via docker-compose
# -------------------------------------------------------------------------------------------------
- name: "PHASE 7: Install bgutils"
import_playbook: playbook-install-bgutils.yml
# -------------------------------------------------------------------------------------------------
# PHASE 8: Master-Specific Services Setup
# Starts Redis, MinIO, and other master-only services
# -------------------------------------------------------------------------------------------------
- name: "PHASE 8: Master Node Services Setup"
hosts: master
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Configure system performance and kernel settings
ansible.builtin.copy:
src: "configs/etc/sysctl.d/99-system-limits.conf"
dest: "/etc/sysctl.d/99-system-limits.conf"
owner: root
group: root
mode: '0644'
become: yes
register: sysctl_config_copy
- name: Apply sysctl settings
ansible.builtin.command: sysctl --system
become: yes
when: sysctl_config_copy.changed
- name: Ensure MinIO data directory exists
ansible.builtin.file:
path: "{{ airflow_master_dir }}/minio-data"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Template Docker Compose file for master services
ansible.builtin.template:
src: templates/docker-compose.stress-master.j2
dest: "{{ airflow_master_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0644'
become: yes
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -aq --filter "name=bgutil-provider" | xargs -r docker rm -f
docker ps -aq --filter "name=redis-stress" | xargs -r docker rm -f
docker ps -aq --filter "name=minio-stress" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
- name: Start master services (Redis, MinIO)
community.docker.docker_compose_v2:
project_src: "{{ airflow_master_dir }}"
files:
- docker-compose.stress.yml
state: present
remove_orphans: true
become: yes
- name: Wait for MinIO service to be ready
ansible.builtin.wait_for:
host: "{{ hostvars[inventory_hostname].ansible_host }}"
port: 9000
delay: 5
timeout: 60
delegate_to: localhost
- name: Download MinIO Client (mc) if not present
ansible.builtin.command:
cmd: wget https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc
creates: /usr/local/bin/mc
become: yes
- name: Ensure MinIO Client (mc) is executable
ansible.builtin.file:
path: /usr/local/bin/mc
mode: '0755'
become: yes
- name: Configure mc alias for local MinIO
ansible.builtin.command: >
mc alias set local http://localhost:9000 {{ vault_s3_access_key_id }} {{ vault_s3_secret_access_key }}
become: yes
become_user: "{{ ansible_user }}"
changed_when: false
environment:
HOME: "/home/{{ ansible_user }}"
- name: Ensure S3 buckets exist in MinIO using mc
ansible.builtin.command: >
mc mb local/{{ item }}
loop:
- "stress-inputs"
- "stress-jsons"
become: yes
become_user: "{{ ansible_user }}"
register: mc_mb_result
failed_when: >
mc_mb_result.rc != 0 and
"already exists" not in mc_mb_result.stderr
changed_when: mc_mb_result.rc == 0
environment:
HOME: "/home/{{ ansible_user }}"
# -------------------------------------------------------------------------------------------------
# PHASE 9: Worker-Specific Services Setup
# Starts worker-only services if needed
# -------------------------------------------------------------------------------------------------
- name: "PHASE 9: Worker Node Services Setup"
hosts: workers
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Template Docker Compose file for worker services
ansible.builtin.template:
src: templates/docker-compose.stress-master.j2
dest: "{{ airflow_worker_dir }}/docker-compose.stress.yml"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0644'
become: yes
- name: Stop and remove existing containers before starting services
ansible.builtin.shell:
cmd: |
docker ps -aq --filter "name=bgutil-provider" | xargs -r docker rm -f
docker ps -aq --filter "name=redis-stress" | xargs -r docker rm -f
docker ps -aq --filter "name=minio-stress" | xargs -r docker rm -f
become: yes
changed_when: false
ignore_errors: yes
- name: Start worker services
community.docker.docker_compose_v2:
project_src: "{{ airflow_worker_dir }}"
files:
- docker-compose.stress.yml
state: present
remove_orphans: true
become: yes
# -------------------------------------------------------------------------------------------------
# PHASE 10: Shared Storage Setup (s3fs)
# Mounts S3 buckets on all nodes
# -------------------------------------------------------------------------------------------------
- name: "PHASE 10: Shared Storage Setup (s3fs)"
hosts: master:workers
gather_facts: no
vars:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Mount S3 buckets via s3fs
block:
- name: Install s3fs for mounting S3 buckets
ansible.builtin.apt:
name: s3fs
state: present
become: yes
- name: Configure s3fs credentials
ansible.builtin.copy:
content: "{{ vault_s3_access_key_id }}:{{ vault_s3_secret_access_key }}"
dest: "/home/{{ ansible_user }}/.passwd-s3fs"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0600'
become: yes
- name: Check if mount points are already mounted
ansible.builtin.shell:
cmd: "mount | grep -q '{{ item.path }}'"
loop:
- { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' }
- { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' }
register: mount_check
changed_when: false
failed_when: false
- name: Ensure mount point directories exist (only if not mounted)
ansible.builtin.file:
path: "{{ item.item.path }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
loop: "{{ mount_check.results }}"
when: item.rc != 0
become: yes
- name: Mount S3 buckets for stress testing
ansible.posix.mount:
src: "s3fs#{{ item.bucket }}"
path: "{{ item.path }}"
fstype: fuse
opts: "_netdev,allow_other,use_path_request_style,nonempty,url=http://{{ hostvars[groups['master'][0]].ansible_host }}:9000,passwd_file=/home/{{ ansible_user }}/.passwd-s3fs"
state: mounted
loop:
- { bucket: 'stress-inputs', path: '{{ base_dir }}/inputfiles' }
- { bucket: 'stress-jsons', path: '{{ base_dir }}/run/docker_mount/fetched_info_jsons' }
become: yes
# -------------------------------------------------------------------------------------------------
# PHASE 11: Initialize Redis (Master Only)
# Sets up profiles and policies in Redis
# -------------------------------------------------------------------------------------------------
- name: "PHASE 11: Initialize Redis profiles"
import_playbook: playbook-stress-init-redis.yml
# -------------------------------------------------------------------------------------------------
# PHASE 12: Final Status and Next Steps
# -------------------------------------------------------------------------------------------------
- name: "PHASE 12: Installation Complete"
hosts: localhost
gather_facts: no
tasks:
- name: Display installation completion message
ansible.builtin.debug:
msg: |
========================================
Full installation complete!
========================================
Next steps:
1. Start monitoring and enforcer (on master):
ansible-playbook ansible/playbook-stress-manage-processes.yml \
-e "start_monitor=true start_enforcer=true"
2. Start auth generator (on workers):
ansible-playbook ansible/playbook-stress-auth-generator.yml \
-e "start_generator=true dummy_batch=true auth_min_seconds=2 auth_max_seconds=3"
3. Start download simulation (on workers):
ansible-playbook ansible/playbook-stress-download-simulation.yml \
-e "start_download=true profile_prefix=user1 download_min_seconds=2 download_max_seconds=5" \
--limit workers
4. Check status:
ansible-playbook ansible/playbook-stress-control.yml -e "action=status"
5. Monitor profiles:
ansible-playbook ansible/playbook-stress-control.yml -e "action=profile-status"

View File

@ -0,0 +1,18 @@
---
- name: "UTIL-SETUP: Install and configure bgutils container on workers"
hosts: workers
gather_facts: yes
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Note that bgutil-provider is now on master
ansible.builtin.debug:
msg: "The bgutil-provider service is now deployed on the master node via docker-compose and is no longer deployed on workers."

View File

@ -0,0 +1,165 @@
---
- name: "UTIL-SETUP: Install media cleanup script and configure cron"
hosts: all
gather_facts: yes
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Ensure cleanup script directory exists
ansible.builtin.file:
path: "{{ base_dir }}/bin"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Copy cleanup_media.py script
ansible.builtin.copy:
src: "{{ playbook_dir }}/../yt-ops-services-debug/cleanup_media.py"
dest: "{{ base_dir }}/bin/cleanup_media.py"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Install s5cmd for S3 uploads
block:
- name: Check if s5cmd is already installed
ansible.builtin.stat:
path: /usr/local/bin/s5cmd
register: s5cmd_binary
- name: Download and install s5cmd
block:
- name: Create temporary directory for s5cmd download
ansible.builtin.tempfile:
state: directory
suffix: s5cmd
register: s5cmd_temp_dir
- name: Download s5cmd
ansible.builtin.get_url:
url: "https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_Linux-64bit.tar.gz"
dest: "{{ s5cmd_temp_dir.path }}/s5cmd.tar.gz"
mode: '0644'
- name: Extract s5cmd
ansible.builtin.unarchive:
src: "{{ s5cmd_temp_dir.path }}/s5cmd.tar.gz"
dest: "{{ s5cmd_temp_dir.path }}"
remote_src: yes
- name: Install s5cmd to /usr/local/bin
ansible.builtin.copy:
src: "{{ s5cmd_temp_dir.path }}/s5cmd"
dest: /usr/local/bin/s5cmd
mode: '0755'
remote_src: yes
- name: Clean up temporary directory
ansible.builtin.file:
path: "{{ s5cmd_temp_dir.path }}"
state: absent
when: not s5cmd_binary.stat.exists
become: yes
- name: Ensure log directory exists
ansible.builtin.file:
path: "/var/log"
state: directory
owner: root
group: root
mode: '0755'
become: yes
- name: Create wrapper script to source .env before running cleanup
ansible.builtin.copy:
content: |
#!/bin/bash
# Wrapper script to run cleanup_media.py with environment variables from .env
set -e
BASE_DIR="{{ base_dir }}"
# Source .env file if it exists
if [ -f "${BASE_DIR}/.env" ]; then
set -a
source "${BASE_DIR}/.env"
set +a
fi
# Determine cleanup mode based on environment variable or default
CLEANUP_MODE="${CLEANUP_MODE:-{{ cleanup_settings.mode | default('s3-upload') }}}"
# Run cleanup script
cd "${BASE_DIR}"
if [ "$CLEANUP_MODE" = "s3-upload" ]; then
# S3 upload mode - uploads to S3 then deletes
exec python3 "${BASE_DIR}/bin/cleanup_media.py" \
--target-dir "${BASE_DIR}/run" \
--target-dir "${BASE_DIR}/downloadfiles" \
--max-age {{ cleanup_settings.max_age_seconds | default(3600) }} \
--log-file /var/log/cleanup_media.log \
--s3-upload \
--s3-bucket "${S3_BUCKET:-stress-media-archive}" \
--s3-prefix "archived-media/$(hostname)" \
--s5cmd-path /usr/local/bin/s5cmd
else
# Simple cleanup mode - just truncate and rename
exec python3 "${BASE_DIR}/bin/cleanup_media.py" \
--target-dir "${BASE_DIR}/run" \
--target-dir "${BASE_DIR}/downloadfiles" \
--max-age {{ cleanup_settings.max_age_seconds | default(3600) }} \
--log-file /var/log/cleanup_media.log
fi
dest: "{{ base_dir }}/bin/cleanup_media_wrapper.sh"
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Configure cron job for media cleanup
ansible.builtin.cron:
name: "Media cleanup - {{ base_dir }}"
minute: "0"
hour: "*"
job: "{{ base_dir }}/bin/cleanup_media_wrapper.sh 2>&1 | logger -t cleanup_media"
user: "{{ ansible_user }}"
state: "{{ 'present' if cleanup_settings.enabled | default(true) | bool else 'absent' }}"
become: yes
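# For reference, the resulting crontab entry has the form (hourly, at minute 0):
#   0 * * * * <base_dir>/bin/cleanup_media_wrapper.sh 2>&1 | logger -t cleanup_media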
- name: Display installation summary
ansible.builtin.debug:
msg: |
Media cleanup script installed successfully on {{ inventory_hostname }}
Configuration from cluster.green.yml:
- Base directory: {{ base_dir }}
- Enabled: {{ cleanup_settings.enabled | default(true) }}
- Cleanup mode: {{ cleanup_settings.mode | default('s3-upload') }}
- Max age: {{ cleanup_settings.max_age_seconds | default(3600) }} seconds
- Cron schedule: Every hour (0 * * * *)
- Cron job state: {{ 'present' if cleanup_settings.enabled | default(true) | bool else 'absent' }}
- Log file: /var/log/cleanup_media.log
Note: You can override the cleanup mode for a single run by setting the
CLEANUP_MODE environment variable before executing the wrapper script.
e.g., CLEANUP_MODE=cleanup {{ base_dir }}/bin/cleanup_media_wrapper.sh
To view logs:
tail -f /var/log/cleanup_media.log

View File

@ -1,7 +1,17 @@
---
- name: Deploy Shadowsocks-Rust Proxy Configurations
-hosts: all
hosts: workers
gather_facts: yes
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Deploy Shadowsocks-Rust proxy services
block:
@ -55,11 +65,18 @@
path: /srv/shadowsocks-rust/docker-compose.yaml
state: absent
-- name: Force stop and remove known proxy containers to prevent conflicts
- name: Find and stop any container using the target proxy ports
-community.docker.docker_container:
ansible.builtin.shell:
-name: "{{ item.key }}"
cmd: |
-state: absent
container_id=$(docker ps -aq --filter "publish={{ item.value.local_port }}")
if [ -n "$container_id" ]; then
echo "Found container ${container_id} using port {{ item.value.local_port }}. Stopping and removing it."
docker stop "${container_id}" >/dev/null 2>&1 || true
docker rm -f "${container_id}" >/dev/null 2>&1 || true
fi
loop: "{{ shadowsocks_proxies | dict2items }}"
register: stop_conflicting_containers
changed_when: "'Stopping and removing it' in stop_conflicting_containers.stdout"
loop_control:
label: "{{ item.key }}"

View File

@ -0,0 +1,95 @@
---
- name: "STRESS-SETUP: Manage auth simulation generator"
hosts: workers
gather_facts: no
vars:
tmux_session_auth_gen: "stress-auth-{{ (profile_prefix | default('default')) | replace(',', '-') }}"
auth_policy: "policies/12_queue_auth_simulation.yaml"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_worker_dir }}"
- name: Validate profile_prefix is provided when starting or stopping
ansible.builtin.fail:
msg: "profile_prefix is required when start_generator=true or stop_generator=true"
when:
- (start_generator | default(false) | bool or stop_generator | default(false) | bool)
- profile_prefix is not defined
- name: "Display policy being used for auth generator"
ansible.builtin.debug:
msg: "Using auth generator policy: {{ auth_policy }}"
when: start_generator | default(false) | bool
- name: Manage auth generator process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "{{ tmux_session_auth_gen }}"
working_dir: "{{ base_dir }}"
command_to_run: >
./bin/ytops-client stress-policy
--policy {{ auth_policy }}
{% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %}
{% if auth_min_seconds is defined %}--set 'settings.dummy_simulation_settings.auth_min_seconds={{ auth_min_seconds }}'{% endif %}
{% if auth_max_seconds is defined %}--set 'settings.dummy_simulation_settings.auth_max_seconds={{ auth_max_seconds }}'{% endif %}
{% if batch_size is defined %}--set 'queue_policy.batch_size={{ batch_size }}'{% endif %}
{% if create_download_tasks is defined %}--set 'queue_policy.create_download_tasks={{ create_download_tasks }}'{% endif %}
{% if formats_to_download is defined %}--set 'queue_policy.formats_to_download={{ formats_to_download }}'{% endif %}
{% if profile_prefix is defined %}--set 'execution_control.worker_pools=[{"profile_prefix": "{{ profile_prefix }}", "workers": 1}]'{% endif %}
--profile-prefix {{ profile_prefix }}
process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ auth_policy }}.*--profile-prefix {{ profile_prefix }}"
start_process: "{{ start_generator | default(false) | bool }}"
stop_process: "{{ stop_generator | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
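# For illustration, with profile_prefix=user1, dummy_batch=true, auth_min_seconds=2
# and auth_max_seconds=3, command_to_run renders roughly to:
#   ./bin/ytops-client stress-policy --policy policies/12_queue_auth_simulation.yaml \
#     --dummy-batch \
#     --set 'settings.dummy_simulation_settings.auth_min_seconds=2' \
#     --set 'settings.dummy_simulation_settings.auth_max_seconds=3' \
#     --set 'execution_control.worker_pools=[{"profile_prefix": "user1", "workers": 1}]' \
#     --profile-prefix user1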
- name: List active tmux sessions
ansible.builtin.shell:
cmd: tmux list-sessions 2>/dev/null || true
register: tmux_sessions
changed_when: false
- name: Display active sessions
ansible.builtin.debug:
msg: "Active tmux sessions: {{ tmux_sessions.stdout_lines }}"
- name: Check tmux session output for errors
block:
- name: Wait for a moment for the process to start
ansible.builtin.pause:
seconds: 2
- name: Capture tmux pane content
ansible.builtin.shell:
cmd: "tmux capture-pane -p -t {{ tmux_session_auth_gen }}"
register: tmux_output
changed_when: false
ignore_errors: true
- name: Display tmux pane content if session exists
ansible.builtin.debug:
msg: "Initial output from tmux session '{{ tmux_session_auth_gen }}':"
when: tmux_output.rc == 0
- name: Show output lines if session exists
ansible.builtin.debug:
var: tmux_output.stdout_lines
when: tmux_output.rc == 0
- name: Report if session not found
ansible.builtin.debug:
msg: "Tmux session '{{ tmux_session_auth_gen }}' was not found. It may have exited immediately upon starting."
when: tmux_output.rc != 0
when: start_generator | default(false) | bool

View File

@ -0,0 +1,288 @@
---
- name: "STRESS-SETUP: Unified control for stress test processes"
hosts: all
gather_facts: no
vars:
# Default action is status check
action: "status"
setup_policy: "policies/6_profile_setup_policy.yaml"
enforcer_policy: "policies/8_unified_simulation_enforcer.yaml"
master_only_actions:
- "check-enforcer"
- "profile-status"
- "run-command"
- "cleanup-profiles"
- "restart-monitoring"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
when: action not in master_only_actions or inventory_hostname in groups['master']
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
when: action not in master_only_actions or inventory_hostname in groups['master']
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
when: action not in master_only_actions or inventory_hostname in groups['master']
- name: Check running processes (status action)
ansible.builtin.shell:
cmd: |
echo "=== Process status on {{ inventory_hostname }} ==="
echo "1. Tmux sessions:"
tmux list-sessions 2>/dev/null || echo "No tmux sessions"
echo ""
echo "2. ytops-client processes:"
ps aux | grep -E "ytops-client.*(profile|policy-enforcer|stress-policy)" | grep -v grep || echo "No ytops-client processes"
echo ""
echo "3. Python processes related to stress test:"
ps aux | grep -E "python.*(ytops|stress)" | grep -v grep || echo "No related python processes"
register: status_output
changed_when: false
when: action == "status"
- name: Display status
ansible.builtin.debug:
msg: "{{ status_output.stdout_lines }}"
when: action == "status"
- name: Check enforcer status (check-enforcer action)
block:
- name: Check for enforcer tmux session and process
ansible.builtin.shell:
cmd: |
echo "Enforcer status on {{ inventory_hostname }}:"
if tmux has-session -t stress-enforcer 2>/dev/null; then
echo " - Tmux session 'stress-enforcer' is RUNNING."
ps aux | grep -E "ytops-client.*policy-enforcer" | grep -v grep || echo " - WARNING: Tmux session exists, but no matching process found."
else
echo " - Tmux session 'stress-enforcer' is NOT RUNNING."
fi
register: enforcer_status
changed_when: false
- name: Display enforcer status
ansible.builtin.debug:
msg: "{{ enforcer_status.stdout_lines }}"
when: action == "check-enforcer"
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Run ytops-client profile list on master (profile-status action)
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
timeout 10 ./bin/ytops-client profile list \
--auth-env sim_auth \
--download-env sim_download 2>&1
register: profile_list_output
changed_when: false
when:
- action == "profile-status"
- inventory_hostname in groups['master']
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Show profile list output lines
ansible.builtin.debug:
var: profile_list_output.stdout_lines
when:
- action == "profile-status"
- profile_list_output is defined
- inventory_hostname in groups['master']
- name: Run custom ytops-client command on master (run-command action)
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
{{ command_to_run }}
register: command_output
changed_when: false
when:
- action == "run-command"
- inventory_hostname in groups['master']
- command_to_run is defined
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Display command output
ansible.builtin.debug:
msg: "Command output:"
when:
- action == "run-command"
- command_output is defined
- name: Show command output lines
ansible.builtin.debug:
var: command_output.stdout_lines
when:
- action == "run-command"
- command_output is defined
- name: "Display policy being used for profile cleanup"
ansible.builtin.debug:
msg: "Using setup policy for cleanup: {{ setup_policy }}"
when:
- action == "cleanup-profiles"
- inventory_hostname in groups['master']
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Cleanup profiles on master (cleanup-profiles action)
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
if [ -f .env ]; then
set -a && . ./.env && set +a
fi
./bin/ytops-client setup-profiles \
--policy {{ setup_policy }} \
--cleanup-all {% if profile_prefix is defined %}--profile-prefix {{ profile_prefix }}{% endif %}
register: cleanup_output
changed_when: false
when:
- action == "cleanup-profiles"
- inventory_hostname in groups['master']
delegate_to: "{{ groups['master'][0] }}"
run_once: true
- name: Display cleanup output
ansible.builtin.debug:
msg: "Cleanup output:"
when:
- action == "cleanup-profiles"
- cleanup_output is defined
- name: Show cleanup output lines
ansible.builtin.debug:
var: cleanup_output.stdout_lines
when:
- action == "cleanup-profiles"
- cleanup_output is defined
- name: Stop all stress test processes on all nodes (stop-all action)
block:
- name: Kill all tmux sessions starting with 'stress-'
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
tmux kill-session -t "$session"
done || true
# Failsafe: kill any lingering tmux server processes for stress sessions
TMUX_PIDS_TO_KILL=$(ps aux | grep '[t]mux' | grep 'stress-' | grep -v 'grep' | grep -v 'ansible' | awk '{print $2}')
if [ -n "$TMUX_PIDS_TO_KILL" ]; then
kill -9 $TMUX_PIDS_TO_KILL >/dev/null 2>&1 || true
fi
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client and related python processes
ansible.builtin.shell:
cmd: |
# Gracefully terminate processes by pattern
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 1 # Wait for graceful shutdown
# Force kill any remaining processes
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
when: action == "stop-all"
- name: Stop processes on targeted nodes only (stop-nodes action)
block:
- name: Kill all tmux sessions starting with 'stress-' on this node
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
tmux kill-session -t "$session"
done || true
# Failsafe: kill any lingering tmux server processes for stress sessions
TMUX_PIDS_TO_KILL=$(ps aux | grep '[t]mux' | grep 'stress-' | grep -v 'grep' | grep -v 'ansible' | awk '{print $2}')
if [ -n "$TMUX_PIDS_TO_KILL" ]; then
kill -9 $TMUX_PIDS_TO_KILL >/dev/null 2>&1 || true
fi
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client and related python processes on this node
ansible.builtin.shell:
cmd: |
# Gracefully terminate processes by pattern
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 1 # Wait for graceful shutdown
# Force kill any remaining processes
ps aux | grep -E "[y]tops-client.*(profile.*list|policy-enforcer|stress-policy)" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ps aux | grep -E "[p]ython.*ytops" | grep -v ansible | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
when: action == "stop-nodes"
- name: Restart monitoring and enforcer (restart-monitoring action)
block:
- name: Stop monitor process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "stress-monitor"
process_grep_pattern: "ytops-client.*profile.*list"
stop_process: true
- name: Stop enforcer process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "stress-enforcer"
process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
stop_process: true
- name: Start monitor process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "stress-monitor"
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client profile list
--auth-env sim_auth
--download-env sim_download
--live
--no-blink
--show-reasons
process_grep_pattern: "ytops-client.*profile.*list"
start_process: true
- name: Start enforcer process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "stress-enforcer"
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client policy-enforcer
--policy {{ enforcer_policy }}
--live
process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
start_process: true
when:
- action == "restart-monitoring"
- inventory_hostname == groups['master'][0]
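# NOTE: a hedged sketch of driving these action-based tasks from the control node. The playbook and
# inventory filenames are assumptions; action, profile_prefix, setup_policy, and enforcer_policy are
# the variables actually read above.
$ ansible-playbook playbook-stress-manage.yml -i inventory.stress -e action=cleanup-profiles -e profile_prefix=user1
$ ansible-playbook playbook-stress-manage.yml -i inventory.stress -e action=stop-all
$ ansible-playbook playbook-stress-manage.yml -i inventory.stress -e action=restart-monitoring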

View File

@ -0,0 +1,93 @@
---
- name: "STRESS-SETUP: Manage download simulation"
hosts: workers
gather_facts: no
vars:
tmux_session_download: "stress-download-{{ (profile_prefix | default('default')) | replace(',', '-') }}"
download_policy: "policies/11_direct_docker_download_simulation.yaml"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_worker_dir }}"
- name: Validate profile_prefix is provided when starting or stopping
ansible.builtin.fail:
msg: "profile_prefix is required when start_download=true or stop_download=true"
when:
- (start_download | default(false) | bool or stop_download | default(false) | bool)
- profile_prefix is not defined
- name: "Display policy being used for download simulation"
ansible.builtin.debug:
msg: "Using download simulation policy: {{ download_policy }}"
when: start_download | default(false) | bool
- name: Manage download simulation process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "{{ tmux_session_download }}"
working_dir: "{{ base_dir }}"
command_to_run: >
./bin/ytops-client stress-policy
--policy {{ download_policy }}
{% if dummy_batch | default(true) | bool %}--dummy-batch{% endif %}
{% if download_min_seconds is defined %}--set 'settings.dummy_simulation_settings.download_min_seconds={{ download_min_seconds }}'{% endif %}
{% if download_max_seconds is defined %}--set 'settings.dummy_simulation_settings.download_max_seconds={{ download_max_seconds }}'{% endif %}
{% if profile_prefix is defined %}--set 'execution_control.worker_pools=[{"profile_prefix": "{{ profile_prefix }}", "workers": 1}]'{% endif %}
{% for setting in (extra_set_args | default('[]')) | from_yaml %}--set '{{ setting }}' {% endfor %}
--profile-prefix {{ profile_prefix }}
process_grep_pattern: "ytops-client.*stress-policy.*--policy {{ download_policy }}.*--profile-prefix {{ profile_prefix }}"
start_process: "{{ start_download | default(false) | bool }}"
stop_process: "{{ stop_download | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
- name: List active tmux sessions
ansible.builtin.shell:
cmd: tmux list-sessions 2>/dev/null || true
register: tmux_sessions
changed_when: false
- name: Display active sessions
ansible.builtin.debug:
msg: "Active tmux sessions on {{ inventory_hostname }}: {{ tmux_sessions.stdout_lines }}"
- name: Check tmux session output for errors
block:
- name: Wait for a moment for the process to start
ansible.builtin.pause:
seconds: 2
- name: Capture tmux pane content
ansible.builtin.shell:
cmd: "tmux capture-pane -p -t {{ tmux_session_download }}"
register: tmux_output
changed_when: false
ignore_errors: true
- name: Display tmux pane content if session exists
ansible.builtin.debug:
msg: "Initial output from tmux session '{{ tmux_session_download }}':"
when: tmux_output.rc == 0
- name: Show output lines if session exists
ansible.builtin.debug:
var: tmux_output.stdout_lines
when: tmux_output.rc == 0
- name: Report if session not found
ansible.builtin.debug:
msg: "Tmux session '{{ tmux_session_download }}' was not found. It may have exited immediately upon starting."
when: tmux_output.rc != 0
when: start_download | default(false) | bool
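# NOTE: a minimal usage sketch for this play. The inventory filename is an assumption; start_download,
# stop_download, profile_prefix, and the dummy timing overrides are the variables the play reads.
$ ansible-playbook playbook-stress-download-simulation.yml -i inventory.stress \
    -e start_download=true -e profile_prefix=user1 \
    -e download_min_seconds=0.5 -e download_max_seconds=2
# Stop the same session later.
$ ansible-playbook playbook-stress-download-simulation.yml -i inventory.stress \
    -e stop_download=true -e profile_prefix=user1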

View File

@ -3,12 +3,19 @@
hosts: all hosts: all
gather_facts: no gather_facts: no
vars_files: vars_files:
- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml" - "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks: tasks:
- name: Define base directory for node - name: Define base directory for node
ansible.builtin.set_fact: ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}" base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Create .env file for stress test environment - name: Create .env file for stress test environment
ansible.builtin.template: ansible.builtin.template:
@ -18,15 +25,3 @@
group: "{{ deploy_group }}" group: "{{ deploy_group }}"
mode: '0644' mode: '0644'
become: yes become: yes
- name: Ensure REDIS_PORT is set in .env file
ansible.builtin.lineinfile:
path: "{{ base_dir }}/.env"
line: "REDIS_PORT={{ redis_port }}"
regexp: "^REDIS_PORT="
state: present
create: yes
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0644'
become: yes

View File

@ -0,0 +1,61 @@
---
- name: "STRESS-SETUP: Initialize Redis with profiles and policies"
hosts: master
gather_facts: no
vars:
setup_policy: "policies/6_profile_setup_policy.yaml"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Check if Redis is running
ansible.builtin.shell:
cmd: "redis-cli -h {{ hostvars[groups['master'][0]].ansible_host }} -p {{ redis_port }} {% if use_redis_password | default(true) | string | lower == 'true' %}-a {{ vault_redis_password }}{% endif %} ping 2>&1 | grep -q PONG"
register: redis_check
ignore_errors: yes
changed_when: false
- name: Ensure Redis is accessible
ansible.builtin.fail:
msg: "Redis is not accessible on master node. Please ensure Redis service is running on {{ hostvars[groups['master'][0]].ansible_host }}:{{ redis_port }}"
when: redis_check.rc != 0
- name: Stop any running ytops-client processes on master
ansible.builtin.shell:
cmd: ps aux | grep "[y]tops-client" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
changed_when: false
- name: "Display policy being used for Redis initialization"
ansible.builtin.debug:
msg: "Using setup policy: {{ setup_policy }}"
- name: Initialize Redis profiles and policies
ansible.builtin.shell:
cmd: |
cd {{ airflow_master_dir }}
./bin/ytops-client setup-profiles \
--policy {{ setup_policy }} \
--cleanup-all
environment:
REDIS_HOST: "{{ hostvars[groups['master'][0]].ansible_host }}"
REDIS_PORT: "{{ redis_port }}"
REDIS_PASSWORD: "{{ vault_redis_password if use_redis_password | default(true) | string | lower == 'true' else '' }}"
register: init_result
changed_when: init_result.rc == 0
- name: Display initialization result
ansible.builtin.debug:
msg: "Redis profile initialization completed successfully"
when: init_result.rc == 0
- name: Handle initialization failure
ansible.builtin.fail:
msg: "Failed to initialize Redis profiles: {{ init_result.stderr }}"
when: init_result.rc != 0

View File

@ -3,9 +3,15 @@
hosts: all hosts: all
gather_facts: yes gather_facts: yes
vars_files: vars_files:
- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml" - "group_vars/all/vault.yml"
pre_tasks: pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
- name: Ensure python3-pip is installed - name: Ensure python3-pip is installed
block: block:
- name: Install prerequisites for managing repositories - name: Install prerequisites for managing repositories
@ -27,17 +33,22 @@
become: yes become: yes
tasks: tasks:
- name: Install required Python packages - name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Install required Python packages from requirements.txt
ansible.builtin.pip: ansible.builtin.pip:
name: requirements: "{{ base_dir }}/ytops_client/requirements.txt"
- python-dotenv extra_args: "--ignore-installed"
- aria2p become: yes
- tabulate environment:
- redis PIP_BREAK_SYSTEM_PACKAGES: "1"
- PyYAML
- aiothrift - name: Explicitly install the thrift package
- PySocks ansible.builtin.pip:
state: present name: thrift
extra_args: "--ignore-installed"
become: yes become: yes
environment: environment:
PIP_BREAK_SYSTEM_PACKAGES: "1" PIP_BREAK_SYSTEM_PACKAGES: "1"

View File

@ -0,0 +1,113 @@
---
- name: "STRESS-SETUP: Manage full worker lifecycle based on inventory"
hosts: workers
gather_facts: no
vars:
# Default action
action: "status" # Available actions: start, stop, status
tasks:
- name: "Start all configured generators and simulators"
when: action == "start"
block:
- name: "Set combined profile prefixes string"
ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Start single auth generator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_generator=true"
-e "profile_prefix={{ combined_prefixes }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if auth_min_seconds is defined %}-e "auth_min_seconds={{ auth_min_seconds }}"{% endif %}
{% if auth_max_seconds is defined %}-e "auth_max_seconds={{ auth_max_seconds }}"{% endif %}
{% if batch_size is defined %}-e "batch_size={{ batch_size }}"{% endif %}
{% if create_download_tasks is defined %}-e "create_download_tasks={{ create_download_tasks }}"{% endif %}
{% if formats_to_download is defined %}-e "formats_to_download={{ formats_to_download }}"{% endif %}
delegate_to: localhost
changed_when: true
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Start single download simulator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "start_download=true"
-e "profile_prefix={{ combined_prefixes }}"
{% if dummy_batch is defined %}-e "dummy_batch={{ dummy_batch }}"{% endif %}
{% if download_min_seconds is defined %}-e "download_min_seconds={{ download_min_seconds }}"{% endif %}
{% if download_max_seconds is defined %}-e "download_max_seconds={{ download_max_seconds }}"{% endif %}
{% if extra_set_args is defined %}-e 'extra_set_args={{ extra_set_args | to_json }}'{% endif %}
delegate_to: localhost
changed_when: true
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Stop all worker generators and simulators"
when: action == "stop"
block:
- name: Kill all tmux sessions starting with 'stress-' on this worker
ansible.builtin.shell:
cmd: |
for session in $(tmux list-sessions -F "#{session_name}" 2>/dev/null | grep -E "^stress-"); do
tmux kill-session -t "$session"
done || true
ignore_errors: yes
changed_when: false
- name: Kill all ytops-client processes on this worker
ansible.builtin.shell:
cmd: |
# Gracefully terminate
ps aux | grep "[y]tops-client.*stress-policy" | awk '{print $2}' | xargs kill >/dev/null 2>&1 || true
sleep 0.5
# Force kill
ps aux | grep "[y]tops-client.*stress-policy" | awk '{print $2}' | xargs kill -9 >/dev/null 2>&1 || true
ignore_errors: yes
changed_when: false
- name: "Check status of all configured generators and simulators"
when: action == "status"
block:
- name: "Set combined profile prefixes string"
ansible.builtin.set_fact:
combined_prefixes: "{{ profile_prefixes | default([]) | join(',') }}"
when: profile_prefixes is defined and profile_prefixes | length > 0
- name: "Check single auth generator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-auth-generator.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "check_status=true"
-e "profile_prefix={{ combined_prefixes }}"
delegate_to: localhost
changed_when: false
when: profile_prefixes is defined and profile_prefixes | length > 0
register: auth_status_check
- name: "Display auth generator status for {{ inventory_hostname }}"
ansible.builtin.debug:
var: auth_status_check.stdout_lines
when: auth_status_check is defined
- name: "Check single download simulator for all profiles: {{ combined_prefixes | default('none') }}"
ansible.builtin.command: >-
ansible-playbook {{ playbook_dir }}/playbook-stress-download-simulation.yml
-i {{ inventory_file }}
--limit {{ inventory_hostname }}
-e "check_status=true"
-e "profile_prefix={{ combined_prefixes }}"
delegate_to: localhost
changed_when: false
when: profile_prefixes is defined and profile_prefixes | length > 0
register: download_status_check
- name: "Display download simulator status for {{ inventory_hostname }}"
ansible.builtin.debug:
var: download_status_check.stdout_lines
when: download_status_check is defined
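# NOTE: a hedged example of driving the lifecycle play. The playbook filename is an assumption;
# action, profile_prefixes, and the optional dummy/timing overrides are the variables consumed above.
$ ansible-playbook playbook-stress-worker-lifecycle.yml -i inventory.stress -e action=start
$ ansible-playbook playbook-stress-worker-lifecycle.yml -i inventory.stress -e action=status
# Override the prefixes explicitly (lists must be passed to -e as JSON).
$ ansible-playbook playbook-stress-worker-lifecycle.yml -i inventory.stress \
    -e action=start -e '{"profile_prefixes": ["user1", "user2"]}'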

View File

@ -0,0 +1,111 @@
---
- name: "STRESS-SETUP: Manage tmux sessions for monitoring and enforcer"
hosts: master
gather_facts: no
vars:
tmux_session_monitor: "stress-monitor"
tmux_session_enforcer: "stress-enforcer"
enforcer_policy: "policies/8_unified_simulation_enforcer.yaml"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Manage monitor process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "{{ tmux_session_monitor }}"
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client profile list
--auth-env sim_auth
--download-env sim_download
--live
--no-blink
--show-reasons
process_grep_pattern: "ytops-client.*profile.*list"
start_process: "{{ start_monitor | default(false) | bool }}"
stop_process: "{{ stop_sessions | default(false) | bool or stop_monitor | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
- name: Check monitor session output for errors
block:
- name: Wait for a moment for the process to start
ansible.builtin.pause:
seconds: 2
- name: Capture tmux pane content
ansible.builtin.shell:
cmd: "tmux capture-pane -p -t {{ tmux_session_monitor }}"
register: tmux_output
changed_when: false
ignore_errors: true
- name: Display tmux pane content if session exists
ansible.builtin.debug:
msg: "Initial output from tmux session '{{ tmux_session_monitor }}':"
when: tmux_output.rc == 0
- name: Show output lines if session exists
ansible.builtin.debug:
var: tmux_output.stdout_lines
when: tmux_output.rc == 0
- name: Report if session not found
ansible.builtin.debug:
msg: "Tmux session '{{ tmux_session_monitor }}' was not found."
when: tmux_output.rc != 0
when: start_monitor | default(false) | bool
- name: Manage enforcer process
ansible.builtin.include_tasks:
file: manage-processes-tasks.yml
vars:
tmux_session_name: "{{ tmux_session_enforcer }}"
working_dir: "{{ airflow_master_dir }}"
command_to_run: >
./bin/ytops-client policy-enforcer
--policy {{ enforcer_policy }}
--live
process_grep_pattern: "ytops-client.*policy-enforcer.*{{ enforcer_policy }}"
start_process: "{{ start_enforcer | default(false) | bool }}"
stop_process: "{{ stop_sessions | default(false) | bool or stop_enforcer | default(false) | bool }}"
check_status: "{{ vars.check_status | default(false) | bool }}"
- name: Check enforcer session output for errors
block:
- name: Wait for a moment for the process to start
ansible.builtin.pause:
seconds: 2
- name: Capture tmux pane content
ansible.builtin.shell:
cmd: "tmux capture-pane -p -t {{ tmux_session_enforcer }}"
register: tmux_output
changed_when: false
ignore_errors: true
- name: Display tmux pane content if session exists
ansible.builtin.debug:
msg: "Initial output from tmux session '{{ tmux_session_enforcer }}':"
when: tmux_output.rc == 0
- name: Show output lines if session exists
ansible.builtin.debug:
var: tmux_output.stdout_lines
when: tmux_output.rc == 0
- name: Report if session not found
ansible.builtin.debug:
msg: "Tmux session '{{ tmux_session_enforcer }}' was not found."
when: tmux_output.rc != 0
when: start_enforcer | default(false) | bool
- name: List active tmux sessions
ansible.builtin.shell:
cmd: tmux list-sessions 2>/dev/null || true
register: tmux_sessions
changed_when: false
- name: Display active sessions
ansible.builtin.debug:
msg: "Active tmux sessions: {{ tmux_sessions.stdout_lines }}"

View File

@ -2,13 +2,22 @@
- name: "STRESS-SETUP: Sync Local Code" - name: "STRESS-SETUP: Sync Local Code"
hosts: all hosts: all
gather_facts: no gather_facts: no
vars:
ytops_source_dir: "{{ playbook_dir }}/../ytops_client-source"
vars_files: vars_files:
- "group_vars/all/generated_vars.stress.yml"
- "group_vars/all/vault.yml" - "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks: tasks:
- name: Define base directory for node - name: Define base directory for node
ansible.builtin.set_fact: ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['airflow_master'] else airflow_worker_dir }}" base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Ensure base directory exists for code sync - name: Ensure base directory exists for code sync
ansible.builtin.file: ansible.builtin.file:
@ -21,7 +30,7 @@
- name: Sync python packages and directories for stress testing - name: Sync python packages and directories for stress testing
ansible.posix.synchronize: ansible.posix.synchronize:
src: "../ytops_client-source/{{ item }}/" src: "{{ ytops_source_dir }}/{{ item }}/"
dest: "{{ base_dir }}/{{ item }}/" dest: "{{ base_dir }}/{{ item }}/"
rsync_opts: rsync_opts:
- "--delete" - "--delete"
@ -42,7 +51,7 @@
- name: Sync client utility scripts and configs - name: Sync client utility scripts and configs
ansible.posix.synchronize: ansible.posix.synchronize:
src: "../ytops_client-source/{{ item }}" src: "{{ ytops_source_dir }}/{{ item }}"
dest: "{{ base_dir }}/{{ item }}" dest: "{{ base_dir }}/{{ item }}"
perms: yes perms: yes
loop: loop:
@ -56,3 +65,4 @@
- "ytdlp.json" - "ytdlp.json"
become: yes become: yes
become_user: "{{ ansible_user }}" become_user: "{{ ansible_user }}"

View File

@ -0,0 +1,58 @@
---
- name: "STRESS-SETUP: Sync Policies and CLI Configs"
hosts: all
gather_facts: no
vars:
ytops_source_dir: "{{ playbook_dir }}/../ytops_client-source"
vars_files:
- "group_vars/all/vault.yml"
pre_tasks:
- name: Set inventory_env fact
ansible.builtin.set_fact:
inventory_env: "{{ inventory_file | basename | splitext | first | replace('inventory.', '') }}"
- name: Load environment-specific variables
ansible.builtin.include_vars: "{{ item }}"
with_fileglob:
- "group_vars/all/generated_vars{{ '.' + inventory_env if inventory_env else '' }}.yml"
tasks:
- name: Define base directory for node
ansible.builtin.set_fact:
base_dir: "{{ airflow_master_dir if inventory_hostname in groups['master'] else airflow_worker_dir }}"
- name: Ensure policies directory exists
ansible.builtin.file:
path: "{{ base_dir }}/policies"
state: directory
owner: "{{ ansible_user }}"
group: "{{ deploy_group }}"
mode: '0755'
become: yes
- name: Sync policies directory only
ansible.posix.synchronize:
src: "{{ ytops_source_dir }}/policies/"
dest: "{{ base_dir }}/policies/"
rsync_opts:
- "--delete"
- "--exclude=.DS_Store"
- "--exclude=__pycache__"
- "--exclude='*.pyc'"
recursive: yes
perms: yes
become: yes
become_user: "{{ ansible_user }}"
- name: Sync client CLI config files
ansible.posix.synchronize:
src: "{{ ytops_source_dir }}/{{ item }}"
dest: "{{ base_dir }}/{{ item }}"
perms: yes
loop:
- "cli.auth.config"
- "cli.download.config"
become: yes
become_user: "{{ ansible_user }}"
- name: Display sync completion
ansible.builtin.debug:
msg: "Policies and CLI configs synced to {{ base_dir }}"

View File

@ -1,8 +1,16 @@
# This file is managed by Ansible for the stress test environment. # This file is managed by Ansible for the stress test environment.
# --- Network Settings --- # --- Network Settings ---
REDIS_HOST={{ hostvars[groups['airflow_master'][0]].ansible_host }} REDIS_HOST={{ hostvars[groups['master'][0]].ansible_host }}
REDIS_PORT={{ vault_redis_port }}
REDIS_PASSWORD={{ vault_redis_password }} REDIS_PASSWORD={{ vault_redis_password }}
# --- S3 Storage Configuration ---
S3_ACCESS_KEY={{ vault_s3_delivery_access_key_id }}
S3_SECRET_KEY={{ vault_s3_delivery_secret_access_key }}
S3_ENDPOINT={{ vault_s3_delivery_endpoint }}
S3_BUCKET={{ vault_s3_delivery_bucket }}
AWS_REGION={{ vault_s3_delivery_aws_region }}
# --- Account Manager Configuration --- # --- Account Manager Configuration ---
ACCOUNT_ACTIVE_DURATION_MIN=7 ACCOUNT_ACTIVE_DURATION_MIN=7
ACCOUNT_COOLDOWN_DURATION_MIN=30 ACCOUNT_COOLDOWN_DURATION_MIN=30

View File

@ -1,6 +1,7 @@
# Template for stress test master services # Template for stress test master services
name: "stress-services" name: "stress-services"
services: services:
{% if inventory_hostname in groups['master'] %}
redis: redis:
image: redis:7-alpine image: redis:7-alpine
container_name: stress-redis container_name: stress-redis
@ -26,7 +27,7 @@ services:
command: server /data --console-address ":9001" command: server /data --console-address ":9001"
networks: networks:
- {{ docker_network_name }} - {{ docker_network_name }}
{% endif %}
bgutil-provider: bgutil-provider:
image: brainicism/bgutil-ytdlp-pot-provider image: brainicism/bgutil-ytdlp-pot-provider
container_name: bgutil-provider container_name: bgutil-provider

View File

@ -11,7 +11,6 @@ services:
volumes: volumes:
- ./config_ssp_{{ proxy_config.local_port }}/:/etc/shadowsocks-rust/:ro - ./config_ssp_{{ proxy_config.local_port }}/:/etc/shadowsocks-rust/:ro
networks: networks:
- default
- {{ docker_network_name }} - {{ docker_network_name }}
{% endfor %} {% endfor %}

View File

@ -19,7 +19,7 @@ def generate_inventory(cluster_config, inventory_path):
f.write("# Edit cluster.yml and re-run the generator instead.\n\n") f.write("# Edit cluster.yml and re-run the generator instead.\n\n")
# Master group # Master group
f.write("[airflow_master]\n") f.write("[master]\n")
for hostname, config in cluster_config.get('master', {}).items(): for hostname, config in cluster_config.get('master', {}).items():
line = f"{hostname} ansible_host={config['ip']}" line = f"{hostname} ansible_host={config['ip']}"
if 'port' in config: if 'port' in config:
@ -29,7 +29,7 @@ def generate_inventory(cluster_config, inventory_path):
f.write("\n") f.write("\n")
# Workers group (handles case where workers are not defined) # Workers group (handles case where workers are not defined)
f.write("[airflow_workers]\n") f.write("[workers]\n")
for hostname, config in cluster_config.get('workers', {}).items(): for hostname, config in cluster_config.get('workers', {}).items():
line = f"{hostname} ansible_host={config['ip']}" line = f"{hostname} ansible_host={config['ip']}"
if 'port' in config: if 'port' in config:
@ -47,6 +47,11 @@ def generate_host_vars(cluster_config, host_vars_dir):
sys.exit(1) sys.exit(1)
master_ip = list(master_nodes.values())[0]['ip'] master_ip = list(master_nodes.values())[0]['ip']
# Get global vars for aliases
global_vars = cluster_config.get('global_vars', {})
airflow_master_dir = global_vars.get('airflow_master_dir')
airflow_worker_dir = global_vars.get('airflow_worker_dir')
# Get global proxy definitions # Get global proxy definitions
shadowsocks_proxies = cluster_config.get('shadowsocks_proxies', {}) shadowsocks_proxies = cluster_config.get('shadowsocks_proxies', {})
@ -58,6 +63,8 @@ def generate_host_vars(cluster_config, host_vars_dir):
# Per-node list of proxies to USE # Per-node list of proxies to USE
worker_proxies = config.get('proxies', []) worker_proxies = config.get('proxies', [])
profile_prefixes = config.get('profile_prefixes', [])
cleanup_settings = config.get('cleanup_settings')
with open(host_vars_file, 'w') as f: with open(host_vars_file, 'w') as f:
f.write("---\n") f.write("---\n")
@ -65,6 +72,13 @@ def generate_host_vars(cluster_config, host_vars_dir):
f.write(f"master_host_ip: {master_ip}\n") f.write(f"master_host_ip: {master_ip}\n")
f.write("redis_port: 52909\n") f.write("redis_port: 52909\n")
# Add node-specific directory aliases for template compatibility
# The master path is needed by all nodes for the .env template.
if airflow_master_dir:
f.write(f"airflow_master: \"{airflow_master_dir}\"\n")
if hostname in cluster_config.get('workers', {}) and airflow_worker_dir:
f.write(f"airflow_dl_worker: \"{airflow_worker_dir}\"\n")
# Write the global proxy definitions for deployment # Write the global proxy definitions for deployment
if shadowsocks_proxies: if shadowsocks_proxies:
f.write("shadowsocks_proxies:\n") f.write("shadowsocks_proxies:\n")
@ -81,6 +95,22 @@ def generate_host_vars(cluster_config, host_vars_dir):
for proxy in worker_proxies: for proxy in worker_proxies:
f.write(f" - \"{proxy}\"\n") f.write(f" - \"{proxy}\"\n")
# Write worker-specific profile prefixes
if profile_prefixes:
f.write("profile_prefixes:\n")
for prefix in profile_prefixes:
f.write(f" - \"{prefix}\"\n")
# Write worker-specific cleanup settings (overrides global)
if cleanup_settings:
f.write("cleanup_settings:\n")
if 'enabled' in cleanup_settings:
f.write(f" enabled: {str(cleanup_settings['enabled']).lower()}\n")
if 'mode' in cleanup_settings:
f.write(f" mode: \"{cleanup_settings['mode']}\"\n")
if 'max_age_seconds' in cleanup_settings:
f.write(f" max_age_seconds: {cleanup_settings['max_age_seconds']}\n")
def generate_group_vars(cluster_config, group_vars_path): def generate_group_vars(cluster_config, group_vars_path):
"""Generate group-level variables""" """Generate group-level variables"""
# Create parent directory if it doesn't exist # Create parent directory if it doesn't exist
@ -107,11 +137,11 @@ def generate_group_vars(cluster_config, group_vars_path):
generated_data = { generated_data = {
'master_host_ip': master_ip, 'master_host_ip': master_ip,
'redis_port': 52909, 'redis_port': 52909,
'external_access_ips': external_ips if external_ips else [], 'external_access_ips': external_ips if external_ips else []
'hostvars': all_nodes
} }
generated_data.update(global_vars) generated_data.update(global_vars)
with open(group_vars_path, 'w') as f: with open(group_vars_path, 'w') as f:
f.write("---\n") f.write("---\n")
f.write("# This file is auto-generated by tools/generate-inventory.py\n") f.write("# This file is auto-generated by tools/generate-inventory.py\n")

View File

@ -17,7 +17,8 @@ settings:
urls_file: "inputfiles/urls.rt300.txt" urls_file: "inputfiles/urls.rt300.txt"
# The save directory MUST be inside the docker_host_mount_path for the download # The save directory MUST be inside the docker_host_mount_path for the download
# simulation to be able to find the files. # simulation to be able to find the files.
save_info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation" # NOTE: This path is expected to be on an s3fs mount for cross-host communication.
save_info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation"
# Settings for controlling the behavior of dummy/simulation modes. # Settings for controlling the behavior of dummy/simulation modes.
# These values can be overridden at runtime with the --set flag. # These values can be overridden at runtime with the --set flag.
@ -88,6 +89,7 @@ direct_docker_cli_policy:
docker_container_mount_path: "/config" # The mount point inside the container docker_container_mount_path: "/config" # The mount point inside the container
# Host path for persisting cache data (e.g., cookies, sigfuncs) between runs. # Host path for persisting cache data (e.g., cookies, sigfuncs) between runs.
# NOTE: This path should be on a fast, local disk, NOT on s3fs.
docker_host_cache_path: ".cache/direct_docker_simulation" docker_host_cache_path: ".cache/direct_docker_simulation"
# Path inside the container where the cache is mounted. Should match HOME/.cache # Path inside the container where the cache is mounted. Should match HOME/.cache
docker_container_cache_path: "/config/.cache" docker_container_cache_path: "/config/.cache"

View File

@ -15,7 +15,8 @@ settings:
# This directory should contain info.json files generated by an auth simulation, # This directory should contain info.json files generated by an auth simulation,
# like `10_direct_docker_auth_simulation`. # like `10_direct_docker_auth_simulation`.
# It MUST be inside the docker_host_mount_path. # It MUST be inside the docker_host_mount_path.
info_json_dir: "run/docker_mount/fetched_info_jsons/direct_docker_simulation" # NOTE: This path is expected to be on an s3fs mount for cross-host communication.
info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation"
#info_json_dir: "run/docker_mount/download_tasks" #info_json_dir: "run/docker_mount/download_tasks"
# Regex to extract the profile name from a task filename. The first capture # Regex to extract the profile name from a task filename. The first capture
# group is used. This is crucial for the task-first locking strategy. # group is used. This is crucial for the task-first locking strategy.
@ -30,6 +31,8 @@ execution_control:
workers: 1 workers: 1
- profile_prefix: "user2" - profile_prefix: "user2"
workers: 1 workers: 1
- profile_prefix: "user3"
workers: 1
# How long a worker should pause if it cannot find an available profile or task. # How long a worker should pause if it cannot find an available profile or task.
worker_polling_interval_seconds: 1 worker_polling_interval_seconds: 1
@ -86,6 +89,7 @@ direct_docker_cli_policy:
docker_container_mount_path: "/config" docker_container_mount_path: "/config"
# Path on the HOST where downloaded files will be saved. # Path on the HOST where downloaded files will be saved.
# NOTE: This path should be on a fast, local disk, NOT on s3fs.
docker_host_download_path: "downloaded_media/direct_docker_simulation" docker_host_download_path: "downloaded_media/direct_docker_simulation"
# Path inside the CONTAINER where `docker_host_download_path` is mounted. # Path inside the CONTAINER where `docker_host_download_path` is mounted.
docker_container_download_path: "/downloads" docker_container_download_path: "/downloads"

View File

@ -1,27 +1,82 @@
# Policy: Queue-based Authentication Simulation via Direct Docker Exec # Policy: Queue-based Authentication Simulation
# #
# This policy simulates a continuous stream of info.json fetch requests using # This policy simulates a continuous stream of info.json fetch requests. It pulls
# the 'direct_docker_cli' mode. It pulls URLs from a Redis queue, creates a # URLs from a Redis queue and processes them, acting as the first stage in a
# temporary batch file, and then calls a yt-dlp command inside a running # two-stage simulation. The second stage (downloading) can be handled in one
# Docker container. # of two ways, configured below:
#
# --- WORKFLOW 1: Queue-Auth -> File-Download ---
# - This policy creates info.json files in a shared directory (`save_info_json_dir`).
# - A separate download simulation (e.g., policy 11_direct_docker_download_simulation.yaml)
# watches that directory, picks up the files, and performs the downloads.
# - To enable:
# - Set `create_download_tasks: false`
# - Ensure `save_info_json_dir` points to a shared path.
#
# --- WORKFLOW 2: Queue-Auth -> Queue-Download ---
# - This policy creates download *tasks* and pushes them to another Redis queue.
# - A separate download simulation (e.g., policy 13_queue_download_simulation.yaml)
# pulls tasks from that queue and performs the downloads.
# - To enable:
# - Set `create_download_tasks: true`
# - Configure `download_task_queue` to the correct queue name.
# - Use `download_task_granularity` to control if one task is created per-URL
# or per-format.
# #
name: 12_queue_auth_simulation name: 12_queue_auth_simulation
settings: settings:
mode: fetch_only mode: fetch_only
orchestration_mode: direct_docker_cli orchestration_mode: queue_auth
profile_mode: from_pool_with_lock profile_mode: from_pool_with_lock
# The save directory MUST be inside the docker_host_mount_path. # For Queue-Auth -> File-Download workflow: Directory to save generated info.json files.
save_info_json_dir: "run/docker_mount/fetched_info_jsons/queue_simulation" # A file-based download worker (e.g., policy 11) will watch this directory.
# This directory MUST be inside the docker_host_mount_path.
# NOTE: This path is expected to be on an s3fs mount for cross-host communication.
save_info_json_dir: "run/docker_mount/info_json_tasks/direct_docker_simulation"
execution_control: execution_control:
# Define worker pools for multiple user groups
worker_pools:
- profile_prefix: "user1"
workers: 1
- profile_prefix: "user2"
workers: 1
- profile_prefix: "user3"
workers: 1 workers: 1
# How long a worker should pause if it cannot find an available profile to lock. # How long a worker should pause if it cannot find an available profile to lock.
worker_polling_interval_seconds: 1 worker_polling_interval_seconds: 1
# No sleep between tasks; throughput is controlled by yt-dlp performance and profile availability. # No sleep between tasks; throughput is controlled by yt-dlp performance and profile availability.
info_json_generation_policy: info_json_generation_policy:
profile_prefix: "user1" # This setting tells the auth worker how many download tasks will be generated
# per successful info.json. It is used to correctly increment the
# 'pending_downloads' counter on the auth profile.
# Can be an integer, or 'from_download_policy' to automatically count formats
# from the 'download_policy.formats' setting in this same policy file.
downloads_per_url: "from_download_policy"
# (For Queue-Download workflow) Controls how download tasks are created.
#
# "per_format": (Default) Creates one download task for EACH format specified in 'formats_to_download'.
# If `formats_to_download` is "140,299", two download tasks are created, and
# the 'pending_downloads' counter is incremented by 2.
#
# "per_url": Creates a SINGLE download task for the entire URL. The 'formats_to_download'
# string is passed to the download worker as the format selector, but 'pending_downloads'
# is only incremented by 1 for the whole URL.
#
# --- Current Setting ---
download_task_granularity: "per_format"
# --- Alternative Setting (commented out) ---
# download_task_granularity: "per_url"
# profile_prefix is now defined per-pool in execution_control.worker_pools
# However, for queue auth mode, we need a fallback prefix
profile_prefix: "user"
# This section is needed for the 'downloads_per_url: from_download_policy' setting.
# It should mirror the formats being used by the download simulation.
download_policy:
formats: "299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy,140-dashy/140-dashy-0/140"
direct_docker_cli_policy: direct_docker_cli_policy:
# Which simulation environment's profiles to use for locking. # Which simulation environment's profiles to use for locking.
@ -50,6 +105,7 @@ direct_docker_cli_policy:
docker_container_mount_path: "/config" # The mount point inside the container docker_container_mount_path: "/config" # The mount point inside the container
# Host path for persisting cache data (e.g., cookies, sigfuncs) between runs. # Host path for persisting cache data (e.g., cookies, sigfuncs) between runs.
# NOTE: This path should be on a fast, local disk, NOT on s3fs.
docker_host_cache_path: ".cache/queue_auth_simulation" docker_host_cache_path: ".cache/queue_auth_simulation"
# Path inside the container where the cache is mounted. Should match HOME/.cache # Path inside the container where the cache is mounted. Should match HOME/.cache
docker_container_cache_path: "/config/.cache" docker_container_cache_path: "/config/.cache"
@ -109,18 +165,47 @@ direct_docker_cli_policy:
# Template for renaming the final info.json. # Template for renaming the final info.json.
rename_file_template: "{video_id}-{profile_name}-{proxy}.info.json" rename_file_template: "{video_id}-{profile_name}-{proxy}.info.json"
# Settings for controlling the behavior of dummy/simulation modes.
# These values can be overridden at runtime with the --set flag.
dummy_simulation_settings:
# Timings for dummy auth simulation (per-URL delay in a batch)
auth_min_seconds: 0.1
auth_max_seconds: 0.5
auth_failure_rate: 0.0
auth_skipped_failure_rate: 0.0
# Timings for dummy download simulation (per-format download time)
download_min_seconds: 1.0
download_max_seconds: 3.0
download_failure_rate: 0.0
download_skipped_failure_rate: 0.0
queue_policy: queue_policy:
# Set to false to use legacy, unprefixed queue names (e.g., 'queue2_auth_inbox'). # Set to false to use legacy, unprefixed queue names (e.g., 'queue2_auth_inbox').
# Set to true (or omit) to use environment-prefixed names (e.g., 'sim_auth_queue2_auth_inbox'). # Set to true (or omit) to use environment-prefixed names (e.g., 'sim_auth_queue2_auth_inbox').
use_env_prefix: false use_env_prefix: false
# If specified, create download tasks for these formats # Queue to pull URLs from
# Can be "all", a specific format ID, or a list of format IDs input_queue: "queue2_auth_inbox"
formats_to_download: "140-dashy/140-dashy-0/140,299-dashy/298-dashy/137-dashy/136-dashy/135-dashy/134-dashy/133-dashy"
# --- Download Handoff Configuration ---
# Set to 'true' for Queue-Auth -> Queue-Download workflow.
# Set to 'false' for Queue-Auth -> File-Download workflow.
create_download_tasks: false
# Queue to push download tasks to (if create_download_tasks is true)
download_task_queue: "queue2_dl_inbox"
# How many tasks a worker should pull from the queue at once. # How many tasks a worker should pull from the queue at once.
# This will become the batch size for the docker run. # This will become the batch size for the docker run.
batch_size: 25 batch_size: 5
# If specified, create download tasks for these formats
# Can be "all", a specific format ID, or a list of format IDs
# Defaults to the formats in download_policy.formats
# Example: formats_to_download: "140-dashy,299-dashy"
# Example: formats_to_download: "all"
# Example: formats_to_download: ["140-dashy", "299-dashy"]
formats_to_download: "from_download_policy"
simulation_parameters: simulation_parameters:
auth_env: "sim_auth" auth_env: "sim_auth"
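# NOTE: to illustrate the two workflows without editing this file, the relevant keys can be overridden
# at run time with --set, following the same convention the playbooks use for dummy_simulation_settings
# and worker_pools; treating these particular keys as --set targets is an assumption.
# WORKFLOW 1 (file handoff): create_download_tasks stays false, as configured above.
$ ./bin/ytops-client stress-policy --policy policies/12_queue_auth_simulation.yaml
# WORKFLOW 2 (queue handoff): switch the handoff and task granularity on the command line.
$ ./bin/ytops-client stress-policy --policy policies/12_queue_auth_simulation.yaml \
    --set 'queue_policy.create_download_tasks=true' \
    --set 'info_json_generation_policy.download_task_granularity=per_url'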

View File

@ -1,16 +1,22 @@
# Policy: Queue-based Download Simulation via Direct Docker Exec # Policy: Queue-based Download Simulation
# #
# This policy simulates a continuous stream of downloads using the # This policy simulates a continuous stream of downloads. It pulls download tasks
# 'direct_docker_cli' mode with `mode: download_only`. It pulls download # from a Redis queue, where each task typically contains a path to an info.json
# tasks from a Redis queue, each containing a path to an info.json file, # file and a format to download.
# and invokes a yt-dlp command inside a running Docker container to perform #
# the download. # This policy is designed to be the *second stage* of a two-stage simulation,
# consuming tasks produced by an authentication simulation like:
# - `12_queue_auth_simulation.yaml` (when configured for Queue-Download workflow)
#
# It does not matter to this policy whether the auth stage created tasks per-URL
# or per-format; this worker will simply process whatever task it receives from
# the `input_queue`.
# #
name: 13_queue_download_simulation name: 13_queue_download_simulation
settings: settings:
mode: download_only mode: download_only
orchestration_mode: direct_docker_cli orchestration_mode: queue_download
profile_mode: from_pool_with_lock profile_mode: from_pool_with_lock
# In queue mode, info_json_dir is not used to find tasks. # In queue mode, info_json_dir is not used to find tasks.
# However, the paths inside the download tasks must be accessible # However, the paths inside the download tasks must be accessible
@ -19,12 +25,19 @@ settings:
# can be specified in the download task. # can be specified in the download task.
execution_control: execution_control:
workers: 4 # Define worker pools for multiple user groups
worker_pools:
- profile_prefix: "user1"
workers: 1
- profile_prefix: "user2"
workers: 1
- profile_prefix: "user3"
workers: 1
# How long a worker should pause if it cannot find an available profile or task. # How long a worker should pause if it cannot find an available profile or task.
worker_polling_interval_seconds: 1 worker_polling_interval_seconds: 1
download_policy: download_policy:
profile_prefix: "user1" # profile_prefix is now defined per-pool in execution_control.worker_pools
# Default cooldown in seconds if not specified by the enforcer in Redis. # Default cooldown in seconds if not specified by the enforcer in Redis.
# The value from Redis (set via `unlock_cooldown_seconds` in the enforcer policy) # The value from Redis (set via `unlock_cooldown_seconds` in the enforcer policy)
# will always take precedence. This is a fallback. # will always take precedence. This is a fallback.
@ -65,6 +78,7 @@ direct_docker_cli_policy:
docker_container_mount_path: "/config" docker_container_mount_path: "/config"
# Path on the HOST where downloaded files will be saved. # Path on the HOST where downloaded files will be saved.
# NOTE: This path should be on a fast, local disk, NOT on s3fs.
docker_host_download_path: "downloaded_media/queue_downloads" docker_host_download_path: "downloaded_media/queue_downloads"
# Path inside the CONTAINER where `docker_host_download_path` is mounted. # Path inside the CONTAINER where `docker_host_download_path` is mounted.
docker_container_download_path: "/downloads" docker_container_download_path: "/downloads"
@ -95,11 +109,36 @@ direct_docker_cli_policy:
- "Invalid data found when processing input" - "Invalid data found when processing input"
- "Error opening input files" - "Error opening input files"
# Settings for controlling the behavior of dummy/simulation modes.
# These values can be overridden at runtime with the --set flag.
dummy_simulation_settings:
# Timings for dummy download simulation (per-format download time)
download_min_seconds: 1.0
download_max_seconds: 3.0
download_failure_rate: 0.0
download_skipped_failure_rate: 0.0
queue_policy: queue_policy:
# Set to false to use legacy, unprefixed queue names (e.g., 'queue2_dl_inbox'). # Set to false to use legacy, unprefixed queue names (e.g., 'queue2_dl_inbox').
# Set to true (or omit) to use environment-prefixed names (e.g., 'sim_download_queue2_dl_inbox'). # Set to true (or omit) to use environment-prefixed names (e.g., 'sim_download_queue2_dl_inbox').
use_env_prefix: false use_env_prefix: false
# Queue to pull download tasks from
input_queue: "queue2_dl_inbox"
# Whether to report completion back to a queue
# Can be true (report all), false (report none), or "success_only"/"failure_only"
report_completion: true
# Queue to report completion to
completion_queue: "queue2_dl_completed"
# Queue to report failures to (always reported regardless of report_completion)
failure_queue: "queue2_dl_fail"
# Queue to report skipped tasks to
skipped_queue: "queue2_dl_skipped"
# How many tasks to process in a batch. For downloads, this should be 1, # How many tasks to process in a batch. For downloads, this should be 1,
# as each worker locks a profile for a single download task. # as each worker locks a profile for a single download task.
batch_size: 1 batch_size: 1
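# NOTE: similarly, a hedged sketch of scaling the consumer side at run time by overriding worker_pools,
# reusing the --set pattern shown in the download-simulation playbook.
$ ./bin/ytops-client stress-policy --policy policies/13_queue_download_simulation.yaml \
    --set 'execution_control.worker_pools=[{"profile_prefix": "user1", "workers": 2}]' \
    --set 'settings.dummy_simulation_settings.download_max_seconds=1.5'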

View File

@ -13,13 +13,13 @@ auth_profile_setup:
cleanup_before_run: true cleanup_before_run: true
pools: pools:
- prefix: "user1" - prefix: "user1"
proxy: "sslocal-rust-1092:1092" proxy: "sslocal-rust-1088:1088"
count: 3 count: 3
- prefix: "user2" - prefix: "user2"
proxy: "sslocal-rust-1092:1092" proxy: "sslocal-rust-1085:1085"
count: 3 count: 3
- prefix: "user3" - prefix: "user3"
proxy: "sslocal-rust-1092:1092" proxy: "sslocal-rust-1084:1084"
count: 3 count: 3
# --- Profile setup for the DOWNLOAD simulation --- # --- Profile setup for the DOWNLOAD simulation ---
@ -28,11 +28,11 @@ download_profile_setup:
cleanup_before_run: true cleanup_before_run: true
pools: pools:
- prefix: "user1" - prefix: "user1"
proxy: "sslocal-rust-1092:1092" proxy: "sslocal-rust-1088:1088"
count: 3 count: 3
- prefix: "user2" - prefix: "user2"
proxy: "sslocal-rust-1092:1092" proxy: "sslocal-rust-1085:1085"
count: 3 count: 3
- prefix: "user3" - prefix: "user3"
proxy: "sslocal-rust-1092:1092" proxy: "sslocal-rust-1084:1084"
count: 3 count: 3

View File

@ -16,13 +16,6 @@ import threading
import time import time
from urllib.parse import urljoin from urllib.parse import urljoin
try:
import aria2p
from aria2p.utils import human_readable_bytes
import yt_dlp
except ImportError:
print("aria2p or yt-dlp is not installed. Please install them with: pip install aria2p yt-dlp", file=sys.stderr)
sys.exit(1)
logger = logging.getLogger('download_aria_tool') logger = logging.getLogger('download_aria_tool')
@ -173,6 +166,14 @@ def parse_aria_args_to_options(args_str):
def main_download_aria(args): def main_download_aria(args):
"""Main logic for the 'download-aria' command.""" """Main logic for the 'download-aria' command."""
try:
import aria2p
from aria2p.utils import human_readable_bytes
import yt_dlp
except ImportError:
print("aria2p or yt-dlp is not installed. Please install them with: pip install aria2p yt-dlp", file=sys.stderr)
return 1
log_level = logging.DEBUG if args.verbose else logging.INFO log_level = logging.DEBUG if args.verbose else logging.INFO
# Reconfigure root logger to ensure our settings are applied. # Reconfigure root logger to ensure our settings are applied.
for handler in logging.root.handlers[:]: for handler in logging.root.handlers[:]:
@ -405,6 +406,7 @@ def main_download_aria(args):
def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=None): def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, remote_dir=None):
"""Handle downloading a single URL with aria2c.""" """Handle downloading a single URL with aria2c."""
import aria2p
if remote_dir: if remote_dir:
aria_options['dir'] = remote_dir aria_options['dir'] = remote_dir
logger.info(f"Adding download for format '{args.format}' with URL: {url[:70]}...") logger.info(f"Adding download for format '{args.format}' with URL: {url[:70]}...")
@ -532,6 +534,7 @@ def download_url_aria(args, api, url, filename, aria_options, timeout_seconds, r
def download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=None): def download_fragments_aria(args, api, target_format, filename, aria_options, timeout_seconds, remote_dir=None):
"""Handle downloading fragmented formats with aria2c.""" """Handle downloading fragmented formats with aria2c."""
import aria2p
logger.info(f"Format '{args.format}' is fragmented. Adding all fragments to download queue.") logger.info(f"Format '{args.format}' is fragmented. Adding all fragments to download queue.")
fragment_base_url = target_format.get('fragment_base_url') fragment_base_url = target_format.get('fragment_base_url')
fragments = target_format['fragments'] fragments = target_format['fragments']

View File

@ -16,12 +16,6 @@ import sys
import time import time
from datetime import datetime from datetime import datetime
try:
import yt_dlp
from yt_dlp.utils import match_filter_func
except ImportError:
print("yt-dlp is not installed. Please install it with: pip install yt-dlp", file=sys.stderr)
sys.exit(1)
logger = logging.getLogger('download_native_py_tool') logger = logging.getLogger('download_native_py_tool')
@ -110,6 +104,8 @@ def _download_single_format(format_id, info_data, base_ydl_opts, args):
Returns a tuple: (success: bool, ytdlp_logger: YTDLPLogger) Returns a tuple: (success: bool, ytdlp_logger: YTDLPLogger)
""" """
import yt_dlp
# Deep copy info_data so we can modify it without affecting other downloads # Deep copy info_data so we can modify it without affecting other downloads
local_info_data = copy.deepcopy(info_data) local_info_data = copy.deepcopy(info_data)
@ -178,6 +174,13 @@ def _download_single_format(format_id, info_data, base_ydl_opts, args):
def main_download_native_py(args): def main_download_native_py(args):
"""Main logic for the 'download-native-py' command.""" """Main logic for the 'download-native-py' command."""
try:
import yt_dlp
from yt_dlp.utils import match_filter_func
except ImportError:
print("yt-dlp is not installed. Please install it with: pip install yt-dlp", file=sys.stderr)
return 1
# All logging should go to stderr to keep stdout clean for the final filename, or for binary data with --output-buffer. # All logging should go to stderr to keep stdout clean for the final filename, or for binary data with --output-buffer.
log_stream = sys.stderr log_stream = sys.stderr
log_level = logging.DEBUG if args.verbose else logging.INFO log_level = logging.DEBUG if args.verbose else logging.INFO

View File

@ -44,6 +44,7 @@ class PolicyEnforcer:
self.manager = manager self.manager = manager
self.dry_run = dry_run self.dry_run = dry_run
self.actions_taken_this_cycle = 0 self.actions_taken_this_cycle = 0
self._last_wait_log_message = ""
PROXY_REST_REASON = "Proxy resting" PROXY_REST_REASON = "Proxy resting"
@ -248,7 +249,19 @@ class PolicyEnforcer:
# This prevents a deadlock if all groups are just 'Working' but none are in the 'waiting_downloads' state yet. # This prevents a deadlock if all groups are just 'Working' but none are in the 'waiting_downloads' state yet.
if not is_any_group_idle and groups_currently_waiting: if not is_any_group_idle and groups_currently_waiting:
is_system_blocked_by_downloads = True is_system_blocked_by_downloads = True
logger.info(f"System is waiting for downloads to finish in groups: {groups_currently_waiting}. No new profiles will be activated until a group is free.") log_message = f"System is waiting for downloads to finish in groups: {groups_currently_waiting}. No new profiles will be activated until a group is free."
if log_message != self._last_wait_log_message:
if self._last_wait_log_message:
print(file=sys.stderr) # Newline if we were printing dots
logger.info(log_message)
self._last_wait_log_message = log_message
else:
print(".", end="", file=sys.stderr, flush=True)
else:
# If we are no longer blocked, reset the message tracker
if self._last_wait_log_message:
print(file=sys.stderr) # Newline to clean up after dots
self._last_wait_log_message = ""
if is_system_blocked_by_downloads: if is_system_blocked_by_downloads:
# When blocked, we only want to consider profiles that are in the 'waiting_downloads' state, # When blocked, we only want to consider profiles that are in the 'waiting_downloads' state,
@ -328,6 +341,7 @@ class PolicyEnforcer:
# The final list to check is the sorted ready profiles, followed by the not-ready ones. # The final list to check is the sorted ready profiles, followed by the not-ready ones.
not_ready_profiles.sort(key=lambda p: (p.get('rest_until', 0), natural_sort_key(p.get('name', '')))) not_ready_profiles.sort(key=lambda p: (p.get('rest_until', 0), natural_sort_key(p.get('name', ''))))
profiles_to_check = sorted_ready_profiles + not_ready_profiles profiles_to_check = sorted_ready_profiles + not_ready_profiles
logger.debug(f"Activation candidates for 'least_loaded' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}")
else: # Default 'longest_idle' sort else: # Default 'longest_idle' sort
if strategy not in ['longest_idle']: if strategy not in ['longest_idle']:
@ -352,6 +366,7 @@ class PolicyEnforcer:
# The final list to check will process all ready profiles first, then wait for the not-ready ones. # The final list to check will process all ready profiles first, then wait for the not-ready ones.
profiles_to_check = ready_profiles + not_ready_profiles profiles_to_check = ready_profiles + not_ready_profiles
logger.debug(f"Activation candidates for 'longest_idle' strategy (first 10): {[p['name'] for p in profiles_to_check[:10]]}")
# --- End New Sorting Logic --- # --- End New Sorting Logic ---
# --- New logic: Identify groups with waiting profiles --- # --- New logic: Identify groups with waiting profiles ---
@ -382,7 +397,7 @@ class PolicyEnforcer:
group_name = profile_to_group_map.get(profile_name) group_name = profile_to_group_map.get(profile_name)
# --- New check to prevent activating profiles from a waiting group --- # --- New check to prevent activating profiles from a waiting group ---
if group_name in waiting_group_names and profile.get('rest_reason') != 'waiting_downloads': if group_name in waiting_group_names:
logger.debug(f"Profile '{profile_name}' activation deferred because its group '{group_name}' is waiting for downloads to complete.") logger.debug(f"Profile '{profile_name}' activation deferred because its group '{group_name}' is waiting for downloads to complete.")
continue continue
# --- End new logic --- # --- End new logic ---

View File

@ -102,10 +102,23 @@ class ProfileManager:
self.redis.ping() self.redis.ping()
logger.info(f"Successfully connected to Redis.") logger.info(f"Successfully connected to Redis.")
logger.info(f"Using key prefix: {key_prefix}") logger.info(f"Using key prefix: {key_prefix}")
self._last_lock_warning = ""
except redis.exceptions.ConnectionError as e: except redis.exceptions.ConnectionError as e:
logger.error(f"Failed to connect to Redis at {redis_host}:{redis_port}: {e}") logger.error(f"Failed to connect to Redis at {redis_host}:{redis_port}: {e}")
sys.exit(1) sys.exit(1)
def _log_lock_warning(self, message: str):
"""Logs a lock-related warning, printing dots for repeated messages."""
if message == self._last_lock_warning:
# Use print to avoid logger formatting and newlines
print(".", end="", file=sys.stderr, flush=True)
else:
# If we were printing dots, start a new line before the new message
if self._last_lock_warning:
print(file=sys.stderr)
logger.warning(message)
self._last_lock_warning = message
def _profile_key(self, profile_name: str) -> str: def _profile_key(self, profile_name: str) -> str:
"""Get Redis key for a profile.""" """Get Redis key for a profile."""
return f"{self.key_prefix}profile:{profile_name}" return f"{self.key_prefix}profile:{profile_name}"
@ -288,11 +301,13 @@ class ProfileManager:
'tolerated_error_count': '0', 'tolerated_error_count': '0',
'download_count': '0', 'download_count': '0',
'download_error_count': '0', 'download_error_count': '0',
'predicted_download_count': '0',
'global_success_count': '0', 'global_success_count': '0',
'global_failure_count': '0', 'global_failure_count': '0',
'global_tolerated_error_count': '0', 'global_tolerated_error_count': '0',
'global_download_count': '0', 'global_download_count': '0',
'global_download_error_count': '0', 'global_download_error_count': '0',
'global_predicted_download_count': '0',
'lock_timestamp': '0', 'lock_timestamp': '0',
'lock_owner': '', 'lock_owner': '',
'rest_until': '0', 'rest_until': '0',
@ -328,10 +343,10 @@ class ProfileManager:
# Convert numeric fields # Convert numeric fields
numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count', numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count',
'tolerated_error_count', 'download_count', 'download_error_count', 'tolerated_error_count', 'download_count', 'download_error_count', 'predicted_download_count',
'global_success_count', 'global_failure_count', 'global_success_count', 'global_failure_count',
'global_tolerated_error_count', 'global_download_count', 'global_tolerated_error_count', 'global_download_count',
'global_download_error_count', 'global_download_error_count', 'global_predicted_download_count',
'lock_timestamp', 'rest_until', 'last_rest_timestamp', 'wait_started_at'] 'lock_timestamp', 'rest_until', 'last_rest_timestamp', 'wait_started_at']
for field in numeric_fields: for field in numeric_fields:
if field in data: if field in data:
@ -388,10 +403,10 @@ class ProfileManager:
# --- End batch fetch --- # --- End batch fetch ---
numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count', numeric_fields = ['created_at', 'last_used', 'success_count', 'failure_count',
'tolerated_error_count', 'download_count', 'download_error_count', 'tolerated_error_count', 'download_count', 'download_error_count', 'predicted_download_count',
'global_success_count', 'global_failure_count', 'global_success_count', 'global_failure_count',
'global_tolerated_error_count', 'global_download_count', 'global_tolerated_error_count', 'global_download_count',
'global_download_error_count', 'global_download_error_count', 'global_predicted_download_count',
'lock_timestamp', 'rest_until', 'last_rest_timestamp', 'wait_started_at'] 'lock_timestamp', 'rest_until', 'last_rest_timestamp', 'wait_started_at']
for i, data in enumerate(all_profile_data): for i, data in enumerate(all_profile_data):
@ -528,8 +543,12 @@ class ProfileManager:
return total_deleted return total_deleted
def record_activity(self, name: str, activity_type: str, def record_activity(self, name: str, activity_type: str,
timestamp: Optional[float] = None) -> bool: timestamp: Optional[float] = None, is_dummy: bool = False) -> bool:
"""Record activity (success/failure) for a profile.""" """
Record activity (success/failure) for a profile.
If is_dummy is True, the activity will NOT be recorded for the associated proxy,
preventing dummy failures from triggering proxy-level enforcer actions.
"""
if activity_type not in ['success', 'failure', 'tolerated_error', 'download', 'download_error']: if activity_type not in ['success', 'failure', 'tolerated_error', 'download', 'download_error']:
logger.error(f"Invalid activity type: {activity_type}") logger.error(f"Invalid activity type: {activity_type}")
return False return False
@ -558,7 +577,8 @@ class ProfileManager:
# Keep only last 1000 activities to prevent unbounded growth # Keep only last 1000 activities to prevent unbounded growth
self.redis.zremrangebyrank(activity_key, 0, -1001) self.redis.zremrangebyrank(activity_key, 0, -1001)
# Also record activity for the proxy # Also record activity for the proxy, BUT NOT for dummy activities.
if not is_dummy:
proxy_url = profile.get('proxy') proxy_url = profile.get('proxy')
if proxy_url: if proxy_url:
proxy_activity_key = self._proxy_activity_key(proxy_url, activity_type) proxy_activity_key = self._proxy_activity_key(proxy_url, activity_type)
@ -569,7 +589,36 @@ class ProfileManager:
pipe.execute() pipe.execute()
logger.debug(f"Recorded {activity_type} for proxy '{proxy_url}'") logger.debug(f"Recorded {activity_type} for proxy '{proxy_url}'")
logger.debug(f"Recorded {activity_type} for profile '{name}' at {ts}") log_msg = f"Recorded {activity_type} for profile '{name}' at {ts}"
if is_dummy:
log_msg += " (dummy, proxy activity skipped)"
logger.debug(log_msg)
return True
def record_predicted_downloads(self, name: str, count: int, is_dummy: bool = False) -> bool:
"""Records the number of download tasks predicted/created for a profile."""
if count <= 0:
return False
profile = self.get_profile(name)
if not profile:
logger.error(f"Profile '{name}' not found")
return False
ts = time.time()
# Update counters in profile
profile_key = self._profile_key(name)
self.redis.hincrby(profile_key, 'predicted_download_count', count)
self.redis.hincrby(profile_key, 'global_predicted_download_count', count)
# Update last_used
self.redis.hset(profile_key, 'last_used', str(ts))
log_msg = f"Recorded {count} predicted downloads for profile '{name}'"
if is_dummy:
log_msg += " (dummy, proxy activity skipped)"
logger.debug(log_msg)
return True return True
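A minimal sketch of how a caller might combine the two new calls. Only record_activity and record_predicted_downloads come from the class above; record_auth_result, its parameters, and the call pattern are hypothetical, and pm is assumed to be an already-constructed ProfileManager.

def record_auth_result(pm, profile_name, ok, expected_media_files=0, dummy=False):
    """Record one auth attempt and, on success, the downloads it should produce."""
    if ok:
        pm.record_activity(profile_name, 'success', is_dummy=dummy)
        if expected_media_files > 0:
            pm.record_predicted_downloads(profile_name, expected_media_files, is_dummy=dummy)
    else:
        # Dummy failures still count against the profile, but are not attributed
        # to its proxy, so proxy-level enforcer thresholds are unaffected.
        pm.record_activity(profile_name, 'failure', is_dummy=dummy)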
def get_activity_rate(self, name: str, activity_type: str, def get_activity_rate(self, name: str, activity_type: str,
@ -612,6 +661,7 @@ class ProfileManager:
'tolerated_error_count': '0', 'tolerated_error_count': '0',
'download_count': '0', 'download_count': '0',
'download_error_count': '0', 'download_error_count': '0',
'predicted_download_count': '0',
} }
self.redis.hset(profile_key, mapping=counters_to_reset) self.redis.hset(profile_key, mapping=counters_to_reset)
logger.info(f"Reset session counters for profile '{name}'.") logger.info(f"Reset session counters for profile '{name}'.")
@ -630,19 +680,21 @@ class ProfileManager:
total_tolerated_error = sum(int(p.get('global_tolerated_error_count', 0)) for p in profiles) total_tolerated_error = sum(int(p.get('global_tolerated_error_count', 0)) for p in profiles)
total_downloads = sum(int(p.get('global_download_count', 0)) for p in profiles) total_downloads = sum(int(p.get('global_download_count', 0)) for p in profiles)
total_download_errors = sum(int(p.get('global_download_error_count', 0)) for p in profiles) total_download_errors = sum(int(p.get('global_download_error_count', 0)) for p in profiles)
total_predicted_downloads = sum(int(p.get('global_predicted_download_count', 0)) for p in profiles)
return { return {
'total_success': total_success, 'total_success': total_success,
'total_failure': total_failure, 'total_failure': total_failure,
'total_tolerated_error': total_tolerated_error, 'total_tolerated_error': total_tolerated_error,
'total_downloads': total_downloads, 'total_downloads': total_downloads,
'total_download_errors': total_download_errors, 'total_download_errors': total_download_errors,
'total_predicted_downloads': total_predicted_downloads,
} }
def get_per_proxy_stats(self) -> Dict[str, Dict[str, Any]]: def get_per_proxy_stats(self) -> Dict[str, Dict[str, Any]]:
"""Get aggregated stats per proxy.""" """Get aggregated stats per proxy."""
profiles = self.list_profiles() profiles = self.list_profiles()
proxy_stats = collections.defaultdict(lambda: { proxy_stats = collections.defaultdict(lambda: {
'success': 0, 'failure': 0, 'tolerated_error': 0, 'downloads': 0, 'download_errors': 0, 'profiles': 0 'success': 0, 'failure': 0, 'tolerated_error': 0, 'downloads': 0, 'download_errors': 0, 'predicted_downloads': 0, 'profiles': 0
}) })
for p in profiles: for p in profiles:
proxy = p.get('proxy') proxy = p.get('proxy')
@ -652,6 +704,7 @@ class ProfileManager:
proxy_stats[proxy]['tolerated_error'] += int(p.get('global_tolerated_error_count', 0)) proxy_stats[proxy]['tolerated_error'] += int(p.get('global_tolerated_error_count', 0))
proxy_stats[proxy]['downloads'] += int(p.get('global_download_count', 0)) proxy_stats[proxy]['downloads'] += int(p.get('global_download_count', 0))
proxy_stats[proxy]['download_errors'] += int(p.get('global_download_error_count', 0)) proxy_stats[proxy]['download_errors'] += int(p.get('global_download_error_count', 0))
proxy_stats[proxy]['predicted_downloads'] += int(p.get('global_predicted_download_count', 0))
proxy_stats[proxy]['profiles'] += 1 proxy_stats[proxy]['profiles'] += 1
return dict(proxy_stats) return dict(proxy_stats)
@ -862,14 +915,14 @@ class ProfileManager:
# Original logic: find all active profiles, optionally filtered by prefix. # Original logic: find all active profiles, optionally filtered by prefix.
active_profiles = self.redis.zrange(self._state_key(ProfileState.ACTIVE.value), 0, -1) active_profiles = self.redis.zrange(self._state_key(ProfileState.ACTIVE.value), 0, -1)
if not active_profiles: if not active_profiles:
logger.warning("No active profiles available to lock.") self._log_lock_warning("No active profiles available to lock.")
self.redis.incr(self._failed_lock_attempts_key()) self.redis.incr(self._failed_lock_attempts_key())
return None return None
if profile_prefix: if profile_prefix:
profiles_to_check = [p for p in active_profiles if p.startswith(profile_prefix)] profiles_to_check = [p for p in active_profiles if p.startswith(profile_prefix)]
if not profiles_to_check: if not profiles_to_check:
logger.warning(f"No active profiles with prefix '{profile_prefix}' available to lock.") self._log_lock_warning(f"No active profiles with prefix '{profile_prefix}' available to lock.")
self.redis.incr(self._failed_lock_attempts_key()) self.redis.incr(self._failed_lock_attempts_key())
return None return None
else: else:
@ -883,9 +936,9 @@ class ProfileManager:
if not full_profiles: if not full_profiles:
if specific_profile_name: if specific_profile_name:
logger.warning(f"Profile '{specific_profile_name}' is not eligible for locking (e.g., not ACTIVE or missing).") self._log_lock_warning(f"Profile '{specific_profile_name}' is not eligible for locking (e.g., not ACTIVE or missing).")
else: else:
logger.warning("No active profiles available to lock after filtering.") self._log_lock_warning("No active profiles available to lock after filtering.")
self.redis.incr(self._failed_lock_attempts_key()) self.redis.incr(self._failed_lock_attempts_key())
return None return None
@ -898,7 +951,7 @@ class ProfileManager:
] ]
if not eligible_profiles: if not eligible_profiles:
logger.warning("No active profiles with an active proxy available to lock.") self._log_lock_warning("No active profiles with an active proxy available to lock.")
self.redis.incr(self._failed_lock_attempts_key()) self.redis.incr(self._failed_lock_attempts_key())
return None return None
@ -922,7 +975,7 @@ class ProfileManager:
if not current_state or current_state.upper() != ProfileState.ACTIVE.value: if not current_state or current_state.upper() != ProfileState.ACTIVE.value:
# Another process (enforcer) changed the state. Release lock and try next. # Another process (enforcer) changed the state. Release lock and try next.
self.redis.hdel(locks_key, name) self.redis.hdel(locks_key, name)
logger.warning(f"Aborted lock for '{name}'; state changed from ACTIVE to '{current_state}' during lock acquisition.") self._log_lock_warning(f"Aborted lock for '{name}'; state changed from ACTIVE to '{current_state}' during lock acquisition.")
continue continue
# State is still ACTIVE, proceed with locking. # State is still ACTIVE, proceed with locking.
@ -937,6 +990,7 @@ class ProfileManager:
sm.lock(owner=owner) sm.lock(owner=owner)
# The on_enter_locked action handles all Redis updates for the profile itself. # The on_enter_locked action handles all Redis updates for the profile itself.
# The logger messages are also in the action. # The logger messages are also in the action.
self._last_lock_warning = "" # Reset on successful lock
return self.get_profile(name) return self.get_profile(name)
except Exception as e: except Exception as e:
# This could be a TransitionNotAllowed error if the state changed, # This could be a TransitionNotAllowed error if the state changed,
@ -946,7 +1000,7 @@ class ProfileManager:
self.redis.hdel(locks_key, name) self.redis.hdel(locks_key, name)
continue continue
logger.warning("Could not lock any active profile (all may have been locked by other workers).") self._log_lock_warning("Could not lock any active profile (all may have been locked by other workers).")
self.redis.incr(self._failed_lock_attempts_key()) self.redis.incr(self._failed_lock_attempts_key())
return None return None
@ -1016,45 +1070,15 @@ class ProfileManager:
# the `on_enter` actions for the current state. We can suppress the initial transition # the `on_enter` actions for the current state. We can suppress the initial transition
# and set the state directly. # and set the state directly.
# WORKAROUND for older statemachine library: # By passing `initial_value`, we tell the state machine to start in the
# Instantiating the machine triggers an initial transition to ACTIVE, which wrongly updates Redis. # profile's actual current state from Redis, instead of executing the
# We let this happen, and then immediately correct the state if it was supposed to be something else. # default `initial=True` transition to ACTIVE. This prevents incorrect
sm = ProfileStateMachine(manager=self, profile_name=name) # state changes and logs during "hydration" of the state machine.
sm = ProfileStateMachine(
# The sm is now in ACTIVE state, and Redis has been updated. If the original state was manager=self,
# LOCKED, we must re-lock it to fix Redis and the state machine object so transitions work. profile_name=name,
if current_state_str == ProfileState.LOCKED.value: initial_value=current_state_str
lock_owner = profile.get('lock_owner', 're-lock-owner') )
try:
# This transition ensures the `on_enter_LOCKED` actions are run, making the
# state consistent in Redis and in the state machine object.
sm.lock(owner=lock_owner)
except Exception as e:
logger.error(f"Failed to re-lock profile '{name}' during state machine hydration: {e}")
# The state is now inconsistent, best to not return a broken machine.
return None
elif current_state_str != sm.current_state.value.upper():
# For any other state, we must manually fix both the state machine object and Redis,
# as the constructor wrongly transitioned to ACTIVE.
# 1. Force state on the machine object. This does not trigger actions.
target_state_obj = next((s for s in sm.states if s.value.upper() == current_state_str), None)
if not target_state_obj:
logger.error(f"Could not find state object for '{current_state_str}' during hydration of '{name}'.")
return None
sm.current_state = target_state_obj
# 2. Manually revert the state in Redis to what it should be.
profile_key = self._profile_key(name)
pipe = self.redis.pipeline()
pipe.hset(profile_key, 'state', current_state_str)
# Atomically move the profile from the incorrect ACTIVE index to the correct one.
# The constructor may have added it to ACTIVE without removing it from its original state index.
pipe.zrem(self._state_key(ProfileState.ACTIVE.value), name)
pipe.zadd(self._state_key(current_state_str), {name: profile.get('last_used', time.time())})
pipe.execute()
logger.debug(f"Corrected state for '{name}' to '{current_state_str}' in object and Redis during hydration.")
return sm return sm
def cleanup_stale_locks(self, max_lock_time_seconds: int) -> int: def cleanup_stale_locks(self, max_lock_time_seconds: int) -> int:
@ -1475,36 +1499,77 @@ def _render_profile_group_summary_table(manager, all_profiles, profile_groups_co
next_up_reason = f"least_loaded (load: {load}, {finish_str})" next_up_reason = f"least_loaded (load: {load}, {finish_str})"
elif profile_selection_strategy == 'longest_idle': elif profile_selection_strategy == 'longest_idle':
# Find the single longest idle profile across all groups # Find groups that don't have an active profile
ready_profiles = [] groups_without_active = []
for group in profile_groups_config:
profiles_in_group = group.get('profiles_in_group', [])
has_active = False
for p_name in profiles_in_group:
p = all_profiles_by_name.get(p_name)
if p and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
has_active = True
break
if not has_active:
groups_without_active.append(group)
# If all groups have active profiles, we need to find which one will be rotated next.
# For now, inspect the active profiles and find the one closest to rotation (highest request count).
if not groups_without_active:
# Find the active profile with the highest request count (closest to rotation)
active_profiles_info = []
for group in profile_groups_config: for group in profile_groups_config:
for p_name in group.get('profiles_in_group', []):
p = all_profiles_by_name.get(p_name)
if p and p['state'] in [ProfileState.ACTIVE.value, ProfileState.LOCKED.value]:
# Calculate total requests
total_reqs = (
p.get('success_count', 0) + p.get('failure_count', 0) +
p.get('tolerated_error_count', 0) +
p.get('download_count', 0) + p.get('download_error_count', 0)
)
active_profiles_info.append({
'profile': p,
'group': group,
'total_reqs': total_reqs,
'last_used': p.get('last_used', 0)
})
if active_profiles_info:
# Sort by total requests descending (highest first - closest to rotation)
# Then by last_used ascending (oldest first)
active_profiles_info.sort(key=lambda x: (-x['total_reqs'], x['last_used']))
# The first one is most likely to be rotated next. Determining which group
# would be activated after it would require re-running the activation logic,
# so the 'Next Up' indicator is intentionally left unset in this case.
pass
# Find the longest idle profile from groups without active profiles
ready_profiles = []
for group in groups_without_active:
for p_name in group.get('profiles_in_group', []): for p_name in group.get('profiles_in_group', []):
p = all_profiles_by_name.get(p_name) p = all_profiles_by_name.get(p_name)
if p and p['state'] in [ProfileState.RESTING.value, ProfileState.COOLDOWN.value] and p.get('rest_until', 0) <= now and p.get('rest_reason') != 'waiting_downloads': if p and p['state'] in [ProfileState.RESTING.value, ProfileState.COOLDOWN.value] and p.get('rest_until', 0) <= now and p.get('rest_reason') != 'waiting_downloads':
ready_profiles.append(p) ready_profiles.append((p, group))
if ready_profiles: if ready_profiles:
# Sort them according to the 'longest_idle' activation logic # Sort them according to the 'longest_idle' activation logic
unused_profiles = [p for p in ready_profiles if (p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)) == 0] unused_profiles = [(p, g) for p, g in ready_profiles if (p.get('success_count', 0) + p.get('failure_count', 0) + p.get('tolerated_error_count', 0) + p.get('download_count', 0) + p.get('download_error_count', 0)) == 0]
used_profiles = [p for p in ready_profiles if p not in unused_profiles] used_profiles = [(p, g) for p, g in ready_profiles if (p, g) not in unused_profiles]
unused_profiles.sort(key=lambda p: natural_sort_key(p.get('name', ''))) unused_profiles.sort(key=lambda x: natural_sort_key(x[0].get('name', '')))
used_profiles.sort(key=lambda p: (p.get('last_used', 0), natural_sort_key(p.get('name', '')))) used_profiles.sort(key=lambda x: (x[0].get('last_used', 0), natural_sort_key(x[0].get('name', ''))))
sorted_ready_profiles = unused_profiles + used_profiles sorted_ready_profiles = unused_profiles + used_profiles
if sorted_ready_profiles: if sorted_ready_profiles:
next_profile = sorted_ready_profiles[0] next_profile, next_group = sorted_ready_profiles[0]
# Find which group it belongs to next_up_group_name = next_group['name']
for group in profile_groups_config:
if next_profile['name'] in group.get('profiles_in_group', []):
next_up_group_name = group['name']
next_up_reason = profile_selection_strategy next_up_reason = profile_selection_strategy
if getattr(args, 'show_reasons', False): if getattr(args, 'show_reasons', False):
last_used_ts = next_profile.get('last_used', 0) last_used_ts = next_profile.get('last_used', 0)
idle_time_str = f"idle for {format_duration(time.time() - last_used_ts)}" if last_used_ts > 0 else "never used" idle_time_str = f"idle for {format_duration(time.time() - last_used_ts)}" if last_used_ts > 0 else "never used"
next_up_reason = f"longest_idle (via {next_profile['name']}, {idle_time_str})" next_up_reason = f"longest_idle (via {next_profile['name']}, {idle_time_str})"
break
# --- End new logic --- # --- End new logic ---
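A simplified, standalone sketch of the "Next Up" selection added above for the 'longest_idle' strategy (skip groups that already have an ACTIVE/LOCKED profile, then pick the never-used or longest-idle ready profile). The function name and the plain string sort are simplifications; the real code uses natural_sort_key and tuple pairs.

import time

def pick_next_group(groups, profiles_by_name, now=None):
    """Return (group_name, profile) expected to activate next, or None."""
    now = now or time.time()
    active_states = {'ACTIVE', 'LOCKED'}
    ready_states = {'RESTING', 'COOLDOWN'}

    def is_ready(p):
        return (p['state'] in ready_states
                and p.get('rest_until', 0) <= now
                and p.get('rest_reason') != 'waiting_downloads')

    def used_count(p):
        return sum(p.get(k, 0) for k in ('success_count', 'failure_count',
                                         'tolerated_error_count',
                                         'download_count', 'download_error_count'))

    candidates = []
    for group in groups:
        members = [profiles_by_name[n] for n in group.get('profiles_in_group', [])
                   if n in profiles_by_name]
        if any(p['state'] in active_states for p in members):
            continue  # group already has an active or locked profile
        candidates.extend((p, group) for p in members if is_ready(p))

    if not candidates:
        return None
    # Never-used profiles first, then by oldest last_used, then by name.
    candidates.sort(key=lambda pg: (used_count(pg[0]) > 0,
                                    pg[0].get('last_used', 0),
                                    pg[0]['name']))
    profile, group = candidates[0]
    return group['name'], profile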
for group in profile_groups_config: for group in profile_groups_config:
@ -1672,9 +1737,11 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups
if is_auth_sim: if is_auth_sim:
row.extend([ row.extend([
p.get('success_count', 0), p.get('success_count', 0),
p.get('predicted_download_count', 0),
p.get('failure_count', 0), p.get('failure_count', 0),
p.get('tolerated_error_count', 0), p.get('tolerated_error_count', 0),
p.get('global_success_count', 0), p.get('global_success_count', 0),
p.get('global_predicted_download_count', 0),
p.get('global_failure_count', 0), p.get('global_failure_count', 0),
]) ])
else: # is_download_sim or unknown else: # is_download_sim or unknown
@ -1700,7 +1767,7 @@ def _render_profile_details_table(manager, args, simulation_type, profile_groups
headers = ['Name', 'Proxy', 'State', 'Last Used'] headers = ['Name', 'Proxy', 'State', 'Last Used']
if is_auth_sim: if is_auth_sim:
headers.extend(['AuthOK', 'AuthFail', 'Skip.Err', 'Tot.AuthOK', 'Tot.AuthFail']) headers.extend(['AuthOK', 'DataPred', 'AuthFail', 'Skip.Err', 'Tot.AuthOK', 'Tot.DataPred', 'Tot.AuthFail'])
else: # is_download_sim or unknown else: # is_download_sim or unknown
headers.extend(['DataOK', 'DownFail', 'Skip.Err', 'Tot.DataOK', 'Tot.DownFail']) headers.extend(['DataOK', 'DownFail', 'Skip.Err', 'Tot.DataOK', 'Tot.DownFail'])
@ -1870,7 +1937,7 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout):
dl_success_rate = (dl_stats['total_downloads'] / total_dls * 100) if total_dls > 0 else 100 dl_success_rate = (dl_stats['total_downloads'] / total_dls * 100) if total_dls > 0 else 100
global_summary_str = ( global_summary_str = (
f"Auth: {total_reqs} reqs ({auth_stats['total_success']} OK, {auth_stats['total_failure']} Fail, {auth_stats['total_tolerated_error']} Tol.Err) | " f"Auth: {total_reqs} reqs ({auth_stats['total_success']} OK, {auth_stats.get('total_predicted_downloads', 0)} Tasks, {auth_stats['total_failure']} Fail, {auth_stats['total_tolerated_error']} Tol.Err) | "
f"OK Rate: {success_rate:.2f}% | " f"OK Rate: {success_rate:.2f}% | "
f"Failed Locks: {auth_failed_locks} || " f"Failed Locks: {auth_failed_locks} || "
f"Download: {total_dls} attempts ({dl_stats['total_downloads']} OK, {dl_stats['total_download_errors']} Fail) | " f"Download: {total_dls} attempts ({dl_stats['total_downloads']} OK, {dl_stats['total_download_errors']} Fail) | "
@ -1894,6 +1961,7 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout):
astats.get('profiles', 0), astats.get('profiles', 0),
dstats.get('profiles', 0), dstats.get('profiles', 0),
astats.get('success', 0), astats.get('success', 0),
astats.get('predicted_downloads', 0),
astats.get('failure', 0), astats.get('failure', 0),
astats.get('tolerated_error', 0), astats.get('tolerated_error', 0),
dstats.get('downloads', 0), dstats.get('downloads', 0),
@ -1916,7 +1984,7 @@ def _render_merged_view(auth_manager, download_manager, args, file=sys.stdout):
if all_proxies: if all_proxies:
print("\n--- Per-Proxy Stats (Merged) ---", file=file) print("\n--- Per-Proxy Stats (Merged) ---", file=file)
proxy_headers = ['Proxy', 'State (A/D)', 'Profiles (A)', 'Profiles (D)', 'AuthOK', 'AuthFail', 'Skip.Err(A)', 'DataOK', 'DownFail', 'Skip.Err(D)'] proxy_headers = ['Proxy', 'State (A/D)', 'Profiles (A)', 'Profiles (D)', 'AuthOK', 'DataPred', 'AuthFail', 'Skip.Err(A)', 'DataOK', 'DownFail', 'Skip.Err(D)']
print(tabulate(proxy_table_data, headers=proxy_headers, tablefmt='grid'), file=file) print(tabulate(proxy_table_data, headers=proxy_headers, tablefmt='grid'), file=file)
print(f"\n--- Auth Simulation Profile Details ({args.auth_env}) ---", file=file) print(f"\n--- Auth Simulation Profile Details ({args.auth_env}) ---", file=file)

View File

@ -51,10 +51,31 @@ class ProfileStateMachine(StateMachine):
pause = active.to(paused) | locked.to(paused) | resting.to(paused) | cooldown.to(paused) pause = active.to(paused) | locked.to(paused) | resting.to(paused) | cooldown.to(paused)
def __init__(self, manager: ProfileManager, profile_name: str, *args, **kwargs): def __init__(self, manager: ProfileManager, profile_name: str, initial_value=None):
self.manager = manager self.manager = manager
self.profile_name = profile_name self.profile_name = profile_name
super().__init__(*args, **kwargs) # Call parent constructor with model=self
super().__init__(model=self)
# If initial_value is provided, set the current state without triggering transitions
if initial_value is not None:
# Convert to uppercase to match state values
initial_value = initial_value.upper()
# Check if we're not already in this state
# Compare case-insensitively to handle any discrepancies
if self.current_state.value.upper() != initial_value:
# Find the corresponding state object
target_state = None
for state in self.states:
# Compare both .value and .id case-insensitively
if state.value.upper() == initial_value or (hasattr(state, 'id') and state.id.upper() == initial_value):
target_state = state
break
if target_state:
# Set current state without triggering transitions
self.current_state = target_state
else:
# If state not found, log a warning but don't crash
logger.warning(f"Could not find state '{initial_value}' (case-insensitive) for profile '{profile_name}'. Keeping current state '{self.current_state.value}'.")
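A library-agnostic illustration of why hydration must bypass entry actions: entering a state normally runs side effects (here, a stand-in for the Redis writes), and adopting the persisted state directly avoids them. TinyMachine is a toy class for this sketch only, not python-statemachine's API.

class TinyMachine:
    """Toy state holder whose on_enter callback stands in for Redis updates."""
    STATES = ('ACTIVE', 'LOCKED', 'RESTING', 'COOLDOWN', 'PAUSED', 'BANNED')

    def __init__(self, initial_value=None, on_enter=lambda s: None):
        self.on_enter = on_enter
        if initial_value is not None and initial_value.upper() in self.STATES:
            # Hydration: adopt the persisted state directly, run no entry actions.
            self.current_state = initial_value.upper()
        else:
            # Fresh machine: take the default initial transition, with side effects.
            self.current_state = 'ACTIVE'
            self.on_enter('ACTIVE')

    def transition(self, new_state):
        self.current_state = new_state.upper()
        self.on_enter(self.current_state)

writes = []
fresh = TinyMachine(on_enter=writes.append)                 # records 'ACTIVE'
hydrated = TinyMachine('RESTING', on_enter=writes.append)   # records nothing
assert hydrated.current_state == 'RESTING' and writes == ['ACTIVE']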
# --- Action Methods --- # --- Action Methods ---

View File

@ -70,12 +70,14 @@ class QueueManager:
"""Returns the number of items in a queue.""" """Returns the number of items in a queue."""
return self.redis.llen(queue_name) return self.redis.llen(queue_name)
def populate(self, queue_name: str, file_path: str) -> int: def push_from_file(self, queue_name: str, file_path: str, wrap_key: Optional[str] = None) -> int:
"""Populates a queue from a file (text with one item per line, or JSON with an array of strings).""" """Populates a queue from a file (text with one item per line, or JSON with an array of items)."""
count = 0 count = 0
if file_path.lower().endswith('.json'): if file_path.lower().endswith('.json'):
logger.info("Detected JSON file. Attempting to parse as an array of strings.") if wrap_key:
logger.warning("--wrap-file-line-in-json is ignored for JSON files, as they are expected to contain complete items.")
logger.info("Detected JSON file. Attempting to parse as an array of items.")
try: try:
with open(file_path, 'r', encoding='utf-8') as f: with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f) data = json.load(f)
@ -83,13 +85,21 @@ class QueueManager:
logger.error("JSON file must contain a list/array.") logger.error("JSON file must contain a list/array.")
return 0 return 0
items_to_add = [str(item).strip() for item in data if str(item).strip()] # Items can be strings or objects. If objects, they should be converted to JSON strings.
items_to_add = []
for item in data:
if isinstance(item, str):
items_to_add.append(item.strip())
else:
items_to_add.append(json.dumps(item))
items_to_add = [item for item in items_to_add if item]
pipe = self.redis.pipeline() pipe = self.redis.pipeline()
for item in items_to_add: for item in items_to_add:
pipe.rpush(queue_name, item) pipe.rpush(queue_name, item)
count += 1 count += 1
if count % 1000 == 0: if count > 0 and count % 1000 == 0:
pipe.execute() pipe.execute()
logger.info(f"Pushed {count} items...") logger.info(f"Pushed {count} items...")
pipe.execute() pipe.execute()
@ -105,9 +115,13 @@ class QueueManager:
for line in f: for line in f:
item = line.strip() item = line.strip()
if item: if item:
pipe.rpush(queue_name, item) if wrap_key:
payload = json.dumps({wrap_key: item})
else:
payload = item
pipe.rpush(queue_name, payload)
count += 1 count += 1
if count % 1000 == 0: if count > 0 and count % 1000 == 0:
pipe.execute() pipe.execute()
logger.info(f"Pushed {count} items...") logger.info(f"Pushed {count} items...")
pipe.execute() pipe.execute()
@ -118,6 +132,45 @@ class QueueManager:
logger.info(f"Finished. Pushed a total of {count} items to '{queue_name}'.") logger.info(f"Finished. Pushed a total of {count} items to '{queue_name}'.")
return count return count
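For reference, the payloads produced from a plain text file with and without a wrap key look roughly like this. This is a standalone reimplementation of the per-line behavior above, not the class itself; the 'url' key is just the example used in the parser help below.

import json

def line_to_payload(line, wrap_key=None):
    item = line.strip()
    if not item:
        return None
    return json.dumps({wrap_key: item}) if wrap_key else item

assert line_to_payload("https://example.com/v/abc\n") == "https://example.com/v/abc"
assert line_to_payload("https://example.com/v/abc\n", wrap_key="url") == '{"url": "https://example.com/v/abc"}'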
def push_generated(self, queue_name: str, prefix: str, count: int) -> int:
"""Pushes generated payloads to a queue."""
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%dt%H%M')
pipe = self.redis.pipeline()
pushed_count = 0
for i in range(count):
generated_value = f"{prefix}_{timestamp}_{i:04d}"
payload = json.dumps({"url": generated_value})
pipe.rpush(queue_name, payload)
pushed_count += 1
if pushed_count > 0 and pushed_count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {pushed_count} of {count} items...")
pipe.execute()
logger.info(f"Finished. Pushed a total of {pushed_count} items to '{queue_name}'.")
return pushed_count
def push_static(self, queue_name: str, payload: str, count: int) -> int:
"""Pushes a static payload multiple times to a queue."""
try:
json.loads(payload)
except json.JSONDecodeError:
logger.error(f"Invalid JSON in --payload-json: {payload}")
return 0
pipe = self.redis.pipeline()
pushed_count = 0
for _ in range(count):
pipe.rpush(queue_name, payload)
pushed_count += 1
if pushed_count > 0 and pushed_count % 1000 == 0:
pipe.execute()
logger.info(f"Pushed {pushed_count} of {count} items...")
pipe.execute()
logger.info(f"Finished. Pushed a total of {pushed_count} items to '{queue_name}'.")
return pushed_count
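A usage sketch for the two new push helpers, assuming qm is an already-constructed QueueManager and the queue name is a placeholder. The CLI comments use the flags defined in the 'push' sub-command below; the outer command path depends on how add_queue_manager_parser is mounted and is not shown in this diff.

# Roughly equivalent CLI calls:
#   ... push my_env_stress_inbox --generate-payload-prefix smoke --count 50
#   ... push my_env_stress_inbox --payload-json '{"url": "https://example.com/v/abc"}' --count 5
def seed_queue(qm, queue_name="my_env_stress_inbox"):
    # Pushes {"url": "smoke_<yyyymmddthhmm>_0000"} ... _0049
    qm.push_generated(queue_name, prefix="smoke", count=50)
    # Pushes the same static payload five times
    qm.push_static(queue_name, '{"url": "https://example.com/v/abc"}', count=5)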
def clear(self, queue_name: str, dump_path: Optional[str] = None) -> int: def clear(self, queue_name: str, dump_path: Optional[str] = None) -> int:
"""Clears a queue, optionally dumping its contents to a file.""" """Clears a queue, optionally dumping its contents to a file."""
size = self.redis.llen(queue_name) size = self.redis.llen(queue_name)
@ -174,10 +227,17 @@ def add_queue_manager_parser(subparsers):
peek_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '<env>_stress_inbox'.") peek_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '<env>_stress_inbox'.")
peek_parser.add_argument('--count', type=int, default=10, help='Number of items to show (default: 10)') peek_parser.add_argument('--count', type=int, default=10, help='Number of items to show (default: 10)')
# Populate command # Push command
populate_parser = subparsers.add_parser('populate', help='Populate a queue from a file (one item per line).', parents=[common_parser]) push_parser = subparsers.add_parser('push', help='Push items to a queue from a file, a generator, or a static payload.', parents=[common_parser])
populate_parser.add_argument('file_path', help='Path to the file containing items to add.') push_parser.add_argument('queue_name', nargs='?', help="Name of the queue. Defaults to '<env>_stress_inbox'.")
populate_parser.add_argument('--queue-name', help="Name of the queue to populate. Defaults to '<env>_stress_inbox'.") push_parser.add_argument('--count', type=int, default=1, help='Number of items to push (for --payload-json or --generate-payload-prefix).')
source_group = push_parser.add_mutually_exclusive_group(required=True)
source_group.add_argument('--from-file', dest='file_path', help='Path to a file containing items to add (one per line, or a JSON array).')
source_group.add_argument('--payload-json', help='A static JSON payload to push. Use with --count to push multiple times.')
source_group.add_argument('--generate-payload-prefix', help='Generate JSON payloads with a timestamp and counter. Example: {"url": "PREFIX_yyyymmddthhmm_0001"}. Use with --count.')
push_parser.add_argument('--wrap-file-line-in-json', metavar='KEY', help="For text files (--from-file), wrap each line in a JSON object with the specified key (e.g., 'url' -> {\"url\": \"line_content\"}).")
# Clear command # Clear command
clear_parser = subparsers.add_parser('clear', help='Clear a queue, optionally dumping its contents.', parents=[common_parser]) clear_parser = subparsers.add_parser('clear', help='Clear a queue, optionally dumping its contents.', parents=[common_parser])
@ -215,9 +275,9 @@ def main_queue_manager(args):
) )
# For commands that operate on a single queue, set a default name based on the environment if not provided. # For commands that operate on a single queue, set a default name based on the environment if not provided.
is_single_queue_command = args.queue_command in ['peek', 'populate', 'clear'] is_single_queue_command = args.queue_command in ['peek', 'push', 'clear']
if is_single_queue_command: if is_single_queue_command:
# `populate` uses an option (--queue-name), while `peek` and `clear` use a positional argument. # `push`, `peek` and `clear` use a positional argument for queue_name.
# We check for `queue_name` attribute and if it's falsy (None or empty string). # We check for `queue_name` attribute and if it's falsy (None or empty string).
if not getattr(args, 'queue_name', None): if not getattr(args, 'queue_name', None):
default_queue_name = f"{args.env}_stress_inbox" default_queue_name = f"{args.env}_stress_inbox"
@ -244,11 +304,21 @@ def main_queue_manager(args):
print(f"{i+1: >3}: {item}") print(f"{i+1: >3}: {item}")
return 0 return 0
elif args.queue_command == 'populate': elif args.queue_command == 'push':
if args.file_path:
if not os.path.exists(args.file_path): if not os.path.exists(args.file_path):
print(f"Error: File not found at '{args.file_path}'", file=sys.stderr) print(f"Error: File not found at '{args.file_path}'", file=sys.stderr)
return 1 return 1
manager.populate(args.queue_name, args.file_path) if args.count > 1:
logger.warning("--count is ignored when using --from-file.")
manager.push_from_file(args.queue_name, args.file_path, args.wrap_file_line_in_json)
elif args.payload_json:
manager.push_static(args.queue_name, args.payload_json, args.count)
elif args.generate_payload_prefix:
if args.count <= 0:
print("Error: --count must be 1 or greater for --generate-payload-prefix.", file=sys.stderr)
return 1
manager.push_generated(args.queue_name, args.generate_payload_prefix, args.count)
return 0 return 0
elif args.queue_command == 'clear': elif args.queue_command == 'clear':

View File

@ -11,6 +11,9 @@ aria2p
# For reading .env files for configuration # For reading .env files for configuration
python-dotenv==1.0.1 python-dotenv==1.0.1
# For 'direct_docker_cli' orchestration mode in stress-policy
docker
# For SOCKS proxy support in client tools # For SOCKS proxy support in client tools
PySocks PySocks

View File

@ -167,7 +167,7 @@ Overridable Policy Parameters via --set:
parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.') parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.')
parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.') parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.')
parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)") parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)")
parser.add_argument('--profile-prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages.") parser.add_argument('--profile-prefix', '--user-prefix', dest='profile_prefix', help="Shortcut to override the profile prefix for profile locking mode. Affects both auth and download stages. Can be a comma-separated list.")
parser.add_argument('--start-from-url-index', type=int, help='Start processing from this line number (1-based) in the urls_file. Overrides saved state.') parser.add_argument('--start-from-url-index', type=int, help='Start processing from this line number (1-based) in the urls_file. Overrides saved state.')
parser.add_argument('--expire-time-shift-minutes', type=int, help="Consider URLs expiring in N minutes as expired. Overrides policy.") parser.add_argument('--expire-time-shift-minutes', type=int, help="Consider URLs expiring in N minutes as expired. Overrides policy.")

View File

@ -53,6 +53,7 @@ def run_direct_batch_worker(worker_id, policy, state_manager, args, profile_mana
os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir, exist_ok=True)
last_used_profile_name = None last_used_profile_name = None
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set(): while not state_manager.shutdown_event.is_set():
locked_profile = None locked_profile = None
temp_batch_file = None temp_batch_file = None
@ -93,13 +94,24 @@ def run_direct_batch_worker(worker_id, policy, state_manager, args, profile_mana
if profiles_in_prefix: if profiles_in_prefix:
state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) state_counts = collections.Counter(p['state'] for p in profiles_in_prefix)
states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items()))
logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix}*): {states_summary}. Pausing for {polling_interval}s.") base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix}*): {states_summary}."
else: else:
logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix}'. Pausing for {polling_interval}s.") base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix}'."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
logger.info(f"{base_log_msg} Pausing for {polling_interval}s.")
last_no_task_log_msg = base_log_msg
# --- End diagnostic logging --- # --- End diagnostic logging ---
time.sleep(polling_interval) time.sleep(polling_interval)
continue continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
profile_name = locked_profile['name'] profile_name = locked_profile['name']
proxy_url = locked_profile['proxy'] proxy_url = locked_profile['proxy']

View File

@ -113,6 +113,7 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
return [] return []
last_used_profile_name = None last_used_profile_name = None
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set(): while not state_manager.shutdown_event.is_set():
locked_profile = None locked_profile = None
temp_task_dir_host = None temp_task_dir_host = None
@ -120,11 +121,25 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
# --- Variables for robust finalization --- # --- Variables for robust finalization ---
url_batch_len = 0 url_batch_len = 0
batch_started = False batch_started = False
downloads_per_url = 0 # Default to 0, meaning no increment unless configured num_formats_per_url = 0
downloads_to_increment = 0
# --- # ---
try: try:
# 1. Lock a profile # 1. Lock a profile, trying any of the specified prefixes
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
# Fallback for empty/no prefix, which means lock any available profile
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
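The same "try each comma-separated prefix in random order" block is repeated again a few lines below after the avoid-reuse wait. A small helper (hypothetical name, not part of this commit) would keep the two call sites identical; it only uses the lock_profile(owner=..., profile_prefix=...) call already used here.

import random

def lock_from_prefixes(pm, owner_id, profile_prefix):
    """Try each comma-separated prefix in random order; fall back to any profile."""
    prefixes = [p.strip() for p in (profile_prefix or '').split(',') if p.strip()]
    if not prefixes:
        return pm.lock_profile(owner=owner_id, profile_prefix=None)
    random.shuffle(prefixes)
    for prefix in prefixes:
        profile = pm.lock_profile(owner=owner_id, profile_prefix=prefix)
        if profile:
            return profile
    return None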
# --- New logic to avoid immediate reuse --- # --- New logic to avoid immediate reuse ---
avoid_reuse = direct_policy.get('avoid_immediate_profile_reuse', False) avoid_reuse = direct_policy.get('avoid_immediate_profile_reuse', False)
@ -135,9 +150,21 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
wait_seconds = direct_policy.get('avoid_reuse_max_wait_seconds', 5) wait_seconds = direct_policy.get('avoid_reuse_max_wait_seconds', 5)
time.sleep(wait_seconds) time.sleep(wait_seconds)
# After waiting, try to lock again. # After waiting, try to lock again, from any of the available prefixes
logger.info(f"[Worker {worker_id}] Attempting to lock a new profile after waiting.") logger.info(f"[Worker {worker_id}] Attempting to lock a new profile after waiting.")
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if locked_profile and locked_profile['name'] == last_used_profile_name: if locked_profile and locked_profile['name'] == last_used_profile_name:
logger.warning(f"[Worker {worker_id}] Still locking the same profile '{locked_profile['name']}' after waiting. Proceeding to use it to avoid getting stuck.") logger.warning(f"[Worker {worker_id}] Still locking the same profile '{locked_profile['name']}' after waiting. Proceeding to use it to avoid getting stuck.")
@ -153,13 +180,24 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
if profiles_in_prefix: if profiles_in_prefix:
state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) state_counts = collections.Counter(p['state'] for p in profiles_in_prefix)
states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items()))
logger.info(f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s.") base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. Pool status ({profile_prefix or '*'}*): {states_summary}."
else: else:
logger.info(f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s.") base_log_msg = f"[Worker {worker_id}] No auth profiles available to lock. No profiles found with prefix '{profile_prefix or '*'}'."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
logger.info(f"{base_log_msg} Pausing for {polling_interval}s.")
last_no_task_log_msg = base_log_msg
# --- End diagnostic logging --- # --- End diagnostic logging ---
time.sleep(polling_interval) time.sleep(polling_interval)
continue continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
profile_name = locked_profile['name'] profile_name = locked_profile['name']
proxy_url = locked_profile['proxy'] proxy_url = locked_profile['proxy']
@ -237,24 +275,33 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
url_batch_len = len(url_batch) url_batch_len = len(url_batch)
batch_started = True batch_started = True
# --- Calculate how many download tasks will be generated --- # --- Calculate how many download tasks will be generated to set the counter ---
# The "pending downloads" counter for an auth profile tracks the number of
# info.json files it has generated that are waiting to be processed.
# Each successful URL fetch creates one info.json file (one task).
# The number of actual media files downloaded from that info.json is
# irrelevant to the auth profile's counter.
downloads_per_url = 0 # Default to 0, meaning no increment unless configured
downloads_per_url_config = gen_policy.get('downloads_per_url') downloads_per_url_config = gen_policy.get('downloads_per_url')
if downloads_per_url_config: num_formats_per_url = 0 # Default to no increment
downloads_per_url = 1 # We just need a flag to enable the logic, the count is per-URL.
if downloads_per_url > 0: if downloads_per_url_config:
# Increment by the number of URLs in the batch. if isinstance(downloads_per_url_config, int):
downloads_to_increment = url_batch_len num_formats_per_url = downloads_per_url_config
profile_manager_instance.increment_pending_downloads(profile_name, downloads_to_increment) elif downloads_per_url_config == 'from_download_policy':
logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for the upcoming batch of {url_batch_len} URLs.") # Heuristic: count comma-separated groups in the format selector.
# This mirrors how yt-dlp processes multiple format downloads.
d_policy = policy.get('download_policy', {})
formats_str = d_policy.get('formats', '')
if formats_str:
# Each comma separates a group from which one format is downloaded.
num_formats_per_url = formats_str.count(',') + 1
else: else:
logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured. Pending downloads counter will not be incremented for this batch.") num_formats_per_url = 1 # fallback to 1 if formats is empty
elif downloads_per_url_config:
# Fallback for non-int, non-'from_download_policy' truthy values
num_formats_per_url = 1
if num_formats_per_url > 0:
downloads_to_increment = url_batch_len * num_formats_per_url
profile_manager_instance.increment_pending_downloads(profile_name, downloads_to_increment)
logger.info(f"[Worker {worker_id}] [{profile_name}] Preemptively incremented pending downloads by {downloads_to_increment} for a batch of {url_batch_len} URLs ({num_formats_per_url} format(s)/URL).")
else:
logger.warning(f"[Worker {worker_id}] [{profile_name}] 'downloads_per_url' is not configured or is zero. Pending downloads counter will not be incremented for this batch.")
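A standalone version of the per-URL format-count heuristic above. The comma rule follows the comment in this hunk (each comma-separated group yields one download); 'from_download_policy' is this repo's own keyword, and the function name and sample selector are illustrative.

def formats_per_url(downloads_per_url_config, formats_selector=""):
    """How many media files one successful info.json is expected to produce."""
    if not downloads_per_url_config:
        return 0                      # counter disabled
    if isinstance(downloads_per_url_config, int):
        return downloads_per_url_config
    if downloads_per_url_config == 'from_download_policy':
        # 'bv*+ba/b,worstaudio' -> 2 comma-separated groups -> 2 downloads per URL
        return formats_selector.count(',') + 1 if formats_selector else 1
    return 1                          # any other truthy value

assert formats_per_url(None) == 0
assert formats_per_url(3) == 3
assert formats_per_url('from_download_policy', 'bv*+ba/b,worstaudio') == 2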
end_idx = start_idx + len(url_batch) end_idx = start_idx + len(url_batch)
logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs (lines {start_idx + 1}-{end_idx} from source).") logger.info(f"[Worker {worker_id}] [{profile_name}] Processing batch of {len(url_batch)} URLs (lines {start_idx + 1}-{end_idx} from source).")
@ -480,14 +527,15 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
logger.error(f"Error during immediate post-processing from log line: {e}") logger.error(f"Error during immediate post-processing from log line: {e}")
with activity_lock: with activity_lock:
is_dummy = args.dummy or args.dummy_batch
if post_processed_successfully: if post_processed_successfully:
live_success_count += 1 live_success_count += 1
logger.info(f"[Worker {worker_id}] [{profile_name}] Live success #{live_success_count} detected and post-processed.") logger.info(f"[Worker {worker_id}] [{profile_name}] Live success #{live_success_count} detected and post-processed.")
profile_manager_instance.record_activity(profile_name, 'success') profile_manager_instance.record_activity(profile_name, 'success', is_dummy=is_dummy)
else: else:
live_failure_count += 1 live_failure_count += 1
logger.error(f"[Worker {worker_id}] [{profile_name}] Post-processing failed for a successful fetch. Recording as failure.") logger.error(f"[Worker {worker_id}] [{profile_name}] Post-processing failed for a successful fetch. Recording as failure.")
profile_manager_instance.record_activity(profile_name, 'failure') profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=is_dummy)
# --- End immediate post-processing --- # --- End immediate post-processing ---
return False return False
@ -495,9 +543,10 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
for pattern in fatal_error_patterns: for pattern in fatal_error_patterns:
if re.search(pattern, line, re.IGNORECASE): if re.search(pattern, line, re.IGNORECASE):
with activity_lock: with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_failure_count += 1 live_failure_count += 1
logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL error #{live_failure_count} detected from log: {line}") logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL error #{live_failure_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'failure') profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=is_dummy)
if direct_policy.get('ban_on_fatal_error_in_batch'): if direct_policy.get('ban_on_fatal_error_in_batch'):
logger.warning(f"Banning profile '{profile_name}' immediately due to fatal error to stop container.") logger.warning(f"Banning profile '{profile_name}' immediately due to fatal error to stop container.")
profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during batch') profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during batch')
@ -512,16 +561,18 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
for pattern in tolerated_error_patterns: for pattern in tolerated_error_patterns:
if re.search(pattern, line, re.IGNORECASE): if re.search(pattern, line, re.IGNORECASE):
with activity_lock: with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_tolerated_count += 1 live_tolerated_count += 1
logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED error #{live_tolerated_count} detected from log: {line}") logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED error #{live_tolerated_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error') profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy)
return False return False
# If it's an ERROR: line and not tolerated, it's a failure # If it's an ERROR: line and not tolerated, it's a failure
with activity_lock: with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_failure_count += 1 live_failure_count += 1
logger.warning(f"[Worker {worker_id}] [{profile_name}] Live failure #{live_failure_count} detected from log: {line}") logger.warning(f"[Worker {worker_id}] [{profile_name}] Live failure #{live_failure_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'failure') profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=is_dummy)
return False return False
@ -544,14 +595,14 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
rand_val = random.random() rand_val = random.random()
if rand_val < auth_skipped_rate: if rand_val < auth_skipped_rate:
logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating tolerated failure for {video_id}.") logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating tolerated failure for {video_id}.")
profile_manager_instance.record_activity(profile_name, 'tolerated_error') profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=True)
elif rand_val < (auth_skipped_rate + auth_failure_rate): elif rand_val < (auth_skipped_rate + auth_failure_rate):
logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal failure for {video_id}.") logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal failure for {video_id}.")
profile_manager_instance.record_activity(profile_name, 'failure') profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=True)
else: else:
# Success # Success
files_created += 1 files_created += 1
profile_manager_instance.record_activity(profile_name, 'success') profile_manager_instance.record_activity(profile_name, 'success', is_dummy=True)
# Create a dummy file in the temp output dir to simulate success # Create a dummy file in the temp output dir to simulate success
dummy_output_path = Path(temp_task_dir_host) / f"{video_id}.info.json" dummy_output_path = Path(temp_task_dir_host) / f"{video_id}.info.json"
@ -644,13 +695,14 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
finally: finally:
if locked_profile and batch_started: if locked_profile and batch_started:
# --- Reconcile pending downloads counter --- # --- Reconcile pending downloads counter ---
if downloads_per_url > 0: if downloads_to_increment > 0:
# We incremented by url_batch_len at the start. # We incremented at the start. Now we adjust for any URLs that failed to produce an info.json.
# Now we adjust for any URLs that failed to produce an info.json. initial_increment = downloads_to_increment
# The adjustment is the number of successes minus the number of attempts. actual_successes = live_success_count # This is count of successful info.jsons
initial_increment = url_batch_len
actual_successes = live_success_count expected_downloads_from_successes = actual_successes * num_formats_per_url
adjustment = actual_successes - initial_increment
adjustment = expected_downloads_from_successes - initial_increment
if adjustment != 0: if adjustment != 0:
# The adjustment will be negative, effectively decrementing the counter for each failure. # The adjustment will be negative, effectively decrementing the counter for each failure.
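A worked example of this reconciliation: 10 URLs with 2 formats each are pre-incremented as 20; if only 7 URLs produce an info.json, the counter is adjusted by 7 * 2 - 20 = -6. The helper below is a sketch of the arithmetic only, with hypothetical names.

def reconcile_adjustment(url_batch_len, num_formats_per_url, successful_infojsons):
    """Delta to apply to the pending-downloads counter after a batch finishes."""
    initial_increment = url_batch_len * num_formats_per_url
    expected_from_successes = successful_infojsons * num_formats_per_url
    return expected_from_successes - initial_increment  # <= 0; 0 means nothing to fix

assert reconcile_adjustment(10, 2, 7) == -6   # decrement pending downloads by 6
assert reconcile_adjustment(10, 2, 10) == 0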
@ -680,10 +732,21 @@ def run_direct_docker_worker(worker_id, policy, state_manager, args, profile_man
if cooldown: if cooldown:
logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.")
# For auth simulation, check if profile has pending downloads
# If so, don't apply cooldown to avoid conflicting with enforcer
profile_data = profile_manager_instance.get_profile(locked_profile['name'])
should_apply_cooldown = cooldown
if profile_data:
pending_downloads = profile_manager_instance.get_pending_downloads(locked_profile['name'])
rest_reason = profile_data.get('rest_reason')
if pending_downloads > 0 or rest_reason == 'waiting_downloads':
should_apply_cooldown = None
logger.info(f"[Worker {worker_id}] Auth profile '{locked_profile['name']}' has pending downloads or is waiting for downloads. Not applying cooldown.")
unlocked_successfully = profile_manager_instance.unlock_profile( unlocked_successfully = profile_manager_instance.unlock_profile(
locked_profile['name'], locked_profile['name'],
owner=owner_id, owner=owner_id,
rest_for_seconds=cooldown rest_for_seconds=should_apply_cooldown
) )
if not unlocked_successfully: if not unlocked_successfully:
logger.error(f"[Worker {worker_id}] FAILED to unlock profile '{locked_profile['name']}'. The profile may be stuck.") logger.error(f"[Worker {worker_id}] FAILED to unlock profile '{locked_profile['name']}'. The profile may be stuck.")
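The cooldown decision above, reduced to a pure function for clarity. Field names match the profile hash used in this diff; the function name is hypothetical and the assertions are illustrative.

def cooldown_to_apply(cooldown_seconds, pending_downloads, rest_reason):
    """Return the rest period to pass to unlock_profile, or None to skip cooldown."""
    if pending_downloads > 0 or rest_reason == 'waiting_downloads':
        # The enforcer owns this profile's resting state until its downloads
        # finish; applying a cooldown here would fight with it.
        return None
    return cooldown_seconds

assert cooldown_to_apply(300, 0, None) == 300
assert cooldown_to_apply(300, 4, None) is None
assert cooldown_to_apply(300, 0, 'waiting_downloads') is None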
@ -751,7 +814,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
return [] return []
no_task_streak = 0 no_task_streak = 0
last_used_profile_name = None last_no_task_log_msg = ""
task_counter = 0 task_counter = 0
while not state_manager.shutdown_event.is_set(): while not state_manager.shutdown_event.is_set():
locked_profile = None locked_profile = None
@ -760,6 +823,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
was_banned_by_parser = False was_banned_by_parser = False
task = None task = None
task_id = None task_id = None
downloads_processed_in_task = 0
try: try:
if no_task_streak > 0 and not queue_policy: # Polling only makes sense for file mode if no_task_streak > 0 and not queue_policy: # Polling only makes sense for file mode
polling_interval = exec_control.get('worker_polling_interval_seconds', 1) polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
@ -769,9 +833,18 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
if profiles_in_prefix: if profiles_in_prefix:
state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) state_counts = collections.Counter(p['state'] for p in profiles_in_prefix)
states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items()))
logger.info(f"[Worker {worker_id}] No tasks found or profiles available. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})") base_log_msg = f"[Worker {worker_id}] No tasks found or profiles available. Pool status ({profile_prefix or '*'}*): {states_summary}."
else: else:
logger.info(f"[Worker {worker_id}] No tasks found or profiles available. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})") base_log_msg = f"[Worker {worker_id}] No tasks found or profiles available. No profiles found with prefix '{profile_prefix or '*'}'."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stdout, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
full_log_msg = f"{base_log_msg} Pausing for {polling_interval}s. (Streak: {no_task_streak})"
logger.info(full_log_msg)
last_no_task_log_msg = base_log_msg
# --- End diagnostic logging --- # --- End diagnostic logging ---
time.sleep(polling_interval) time.sleep(polling_interval)
if state_manager.shutdown_event.is_set(): continue if state_manager.shutdown_event.is_set(): continue
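The repeat-suppressed idle logging used here (and again in the other workers further down) follows one pattern: print a dot while the same idle message repeats, and a newline plus the full message when it changes. A sketch of that pattern as a small helper; the choice of print/stderr mirrors the workers and is an implementation assumption:

    import sys

    class IdleLogSquelcher:
        def __init__(self, stream=sys.stderr):
            self.stream = stream
            self.last_msg = ""

        def log(self, base_msg, suffix=""):
            if base_msg == self.last_msg:
                print(".", end="", file=self.stream, flush=True)  # same situation as last time
                return
            if self.last_msg:
                print(file=self.stream)  # terminate the previous run of dots
            print(f"{base_msg}{suffix}", file=self.stream)
            self.last_msg = base_msg

        def reset(self):
            # Called once work is found again, so the next message starts on a clean line.
            if self.last_msg:
                print(file=self.stream)
                self.last_msg = ""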
@ -844,17 +917,27 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
if claimed_task_path_host: if claimed_task_path_host:
no_task_streak = 0 no_task_streak = 0
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
auth_profile_name, auth_env = None, None auth_profile_name, auth_env = None, None
info_data = None info_data = None
# --- Read info.json content and metadata first --- # In queue mode, the task object is the primary source of truth for auth metadata.
# This check is conditional because in file-based mode, `task` will be None.
if task:
auth_profile_name = task.get('auth_profile_name')
auth_env = task.get('auth_env')
# --- Read info.json content and use its metadata as a fallback ---
try: try:
with open(claimed_task_path_host, 'r', encoding='utf-8') as f: with open(claimed_task_path_host, 'r', encoding='utf-8') as f:
info_data = json.load(f) info_data = json.load(f)
# This is critical for decrementing the counter in the finally block # Fallback to info.json metadata if not present in the task object.
if not auth_profile_name or not auth_env:
metadata = info_data.get('_ytops_metadata', {}) metadata = info_data.get('_ytops_metadata', {})
auth_profile_name = metadata.get('profile_name') auth_profile_name = auth_profile_name or metadata.get('profile_name')
auth_env = metadata.get('auth_env') auth_env = auth_env or metadata.get('auth_env')
except (IOError, json.JSONDecodeError) as e: except (IOError, json.JSONDecodeError) as e:
logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path_host.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.") logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path_host.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.")
continue # Skip to finally block to unlock profile continue # Skip to finally block to unlock profile
@ -874,7 +957,12 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
source_of_format = "task file" source_of_format = "task file"
if not format_selection: if not format_selection:
format_selection = d_policy.get('formats', '') format_selection = d_policy.get('formats', '')
source_of_format = "policy" source_of_format = "policy (download_policy.formats)"
if not format_selection:
ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {})
format_selection = ytdlp_config_overrides.get('format', '')
source_of_format = "policy (ytdlp_config_overrides.format)"
if not format_selection: if not format_selection:
logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download.") logger.warning(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download.")
@ -907,17 +995,18 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
details = f"Dummy skipped failure for format {format_id}" details = f"Dummy skipped failure for format {format_id}"
error_type = "DummySkippedFailure" error_type = "DummySkippedFailure"
is_tolerated_error = True is_tolerated_error = True
profile_manager_instance.record_activity(profile_name, 'tolerated_error') profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=True)
elif should_fail_fatal: elif should_fail_fatal:
logger.warning(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure for format '{format_id}'.") logger.warning(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure for format '{format_id}'.")
details = f"Dummy fatal failure for format {format_id}" details = f"Dummy fatal failure for format {format_id}"
error_type = "DummyFailure" error_type = "DummyFailure"
profile_manager_instance.record_activity(profile_name, 'download_error') profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=True)
else: else:
logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success for format '{format_id}'.") logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success for format '{format_id}'.")
success = True success = True
details = f"Dummy success for format {format_id}" details = f"Dummy success for format {format_id}"
profile_manager_instance.record_activity(profile_name, 'download') profile_manager_instance.record_activity(profile_name, 'download', is_dummy=True)
downloads_processed_in_task += 1
event = { event = {
'type': 'direct_docker_download', 'type': 'direct_docker_download',
@ -963,7 +1052,7 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
else: else:
logger.warning(f"[Worker {worker_id}] [{profile_name}] Skipping task '{claimed_task_path_host.name}' because its URL is expired.") logger.warning(f"[Worker {worker_id}] [{profile_name}] Skipping task '{claimed_task_path_host.name}' because its URL is expired.")
profile_manager_instance.record_activity(profile_name, 'tolerated_error') profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch))
event = { event = {
'type': 'direct_docker_download', 'profile': profile_name, 'type': 'direct_docker_download', 'profile': profile_name,
@ -1066,23 +1155,23 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
def log_parser_callback(line): def log_parser_callback(line):
nonlocal live_success_count, live_failure_count, live_tolerated_count, was_banned_by_parser nonlocal live_success_count, live_failure_count, live_tolerated_count, was_banned_by_parser
-                    # Success is a high-priority check. Only record one success per task.
-                    if '[download] 100% of' in line or 'has already been downloaded' in line:
-                        with activity_lock:
-                            # Only count one success per task
-                            if live_success_count == 0:
-                                live_success_count += 1
-                                logger.info(f"[Worker {worker_id}] [{profile_name}] Live download success detected from log.")
-                                profile_manager_instance.record_activity(profile_name, 'download')
+                    # Success is a high-priority check.
+                    if '[download] 100% of' in line or 'has already been downloaded' in line:
+                        with activity_lock:
+                            is_dummy = args.dummy or args.dummy_batch
+                            live_success_count += 1
+                            logger.info(f"[Worker {worker_id}] [{profile_name}] Live download success #{live_success_count} detected from log.")
+                            profile_manager_instance.record_activity(profile_name, 'download', is_dummy=is_dummy)
return False return False
# Check for fatal patterns # Check for fatal patterns
for pattern in fatal_error_patterns: for pattern in fatal_error_patterns:
if re.search(pattern, line, re.IGNORECASE): if re.search(pattern, line, re.IGNORECASE):
with activity_lock: with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_failure_count += 1 live_failure_count += 1
logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL download error #{live_failure_count} detected from log: {line}") logger.error(f"[Worker {worker_id}] [{profile_name}] Live FATAL download error #{live_failure_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'download_error') profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=is_dummy)
if direct_policy.get('ban_on_fatal_error_in_batch'): if direct_policy.get('ban_on_fatal_error_in_batch'):
logger.warning(f"Banning profile '{profile_name}' immediately due to fatal download error to stop container.") logger.warning(f"Banning profile '{profile_name}' immediately due to fatal download error to stop container.")
profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during download') profile_manager_instance.update_profile_state(profile_name, 'BANNED', 'Fatal error during download')
@ -1098,16 +1187,18 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
for pattern in tolerated_error_patterns: for pattern in tolerated_error_patterns:
if re.search(pattern, line, re.IGNORECASE): if re.search(pattern, line, re.IGNORECASE):
with activity_lock: with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_tolerated_count += 1 live_tolerated_count += 1
logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED download error #{live_tolerated_count} detected from log: {line}") logger.warning(f"[Worker {worker_id}] [{profile_name}] Live TOLERATED download error #{live_tolerated_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error') profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy)
return False return False
# If it's an ERROR: line and not tolerated, it's a failure # If it's an ERROR: line and not tolerated, it's a failure
with activity_lock: with activity_lock:
is_dummy = args.dummy or args.dummy_batch
live_failure_count += 1 live_failure_count += 1
logger.warning(f"[Worker {worker_id}] [{profile_name}] Live download failure #{live_failure_count} detected from log: {line}") logger.warning(f"[Worker {worker_id}] [{profile_name}] Live download failure #{live_failure_count} detected from log: {line}")
profile_manager_instance.record_activity(profile_name, 'download_error') profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=is_dummy)
return False return False
@ -1149,11 +1240,11 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
final_outcome = "download" final_outcome = "download"
logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific success/error log line matched, but exit code is 0. Assuming success, but this may indicate a parsing issue.") logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific success/error log line matched, but exit code is 0. Assuming success, but this may indicate a parsing issue.")
# We record a success here as a fallback, in case the log parser missed it. # We record a success here as a fallback, in case the log parser missed it.
profile_manager_instance.record_activity(profile_name, 'download') profile_manager_instance.record_activity(profile_name, 'download', is_dummy=(args.dummy or args.dummy_batch))
else: else:
final_outcome = "download_error" final_outcome = "download_error"
logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific error log line matched, but exit code was {retcode}. Recording a generic download_error.") logger.warning(f"[Worker {worker_id}] [{profile_name}] No specific error log line matched, but exit code was {retcode}. Recording a generic download_error.")
profile_manager_instance.record_activity(profile_name, 'download_error') profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=(args.dummy or args.dummy_batch))
# --- Airflow Directory Logic --- # --- Airflow Directory Logic ---
if success and d_policy.get('output_to_airflow_ready_dir'): if success and d_policy.get('output_to_airflow_ready_dir'):
@ -1231,6 +1322,9 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
event = { 'type': 'direct_docker_download', 'profile': profile_name, 'proxy_url': locked_profile['proxy'], 'success': success, 'details': event_details } event = { 'type': 'direct_docker_download', 'profile': profile_name, 'proxy_url': locked_profile['proxy'], 'success': success, 'details': event_details }
state_manager.log_event(event) state_manager.log_event(event)
# Store the number of detected successful downloads to use for decrementing the counter.
downloads_processed_in_task = live_success_count
logger.info(f"[Worker {worker_id}] [{profile_name}] Task processing complete. Worker will now unlock profile and attempt next task.") logger.info(f"[Worker {worker_id}] [{profile_name}] Task processing complete. Worker will now unlock profile and attempt next task.")
# 6. Clean up task file # 6. Clean up task file
@ -1274,25 +1368,34 @@ def run_direct_docker_download_worker(worker_id, policy, state_manager, args, pr
auth_manager = get_auth_manager(profile_manager_instance, auth_env) auth_manager = get_auth_manager(profile_manager_instance, auth_env)
if auth_manager: if auth_manager:
try: try:
auth_manager.decrement_pending_downloads(auth_profile_name) # Fallback: if no downloads were counted (e.g., due to an early exit or parsing issue),
logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}'.") # but the task is being finalized, we must assume all potential downloads for this task
# are "processed" to prevent the auth profile from getting stuck.
if downloads_processed_in_task == 0:
logger.warning(f"[Worker {worker_id}] No downloads were counted for this task. Using policy to determine decrement count to avoid stuck profile.")
ytdlp_config_overrides = direct_policy.get('ytdlp_config_overrides', {})
formats_str = ytdlp_config_overrides.get('format', d_policy.get('formats', ''))
num_formats = formats_str.count(',') + 1 if formats_str else 1
downloads_processed_in_task = num_formats
logger.warning(f"[Worker {worker_id}] Decrementing by fallback count: {downloads_processed_in_task}")
if downloads_processed_in_task > 0:
new_count = auth_manager.increment_pending_downloads(auth_profile_name, -downloads_processed_in_task)
logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' by {downloads_processed_in_task} in env '{auth_env}'. New count: {new_count}")
except Exception as e: except Exception as e:
logger.error(f"[Worker {worker_id}] Failed to decrement pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}': {e}", exc_info=True) logger.error(f"[Worker {worker_id}] Failed to decrement pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}': {e}", exc_info=True)
else: else:
logger.error(f"[Worker {worker_id}] Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") logger.error(f"[Worker {worker_id}] Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.")
else: else:
logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented. (Profile: {auth_profile_name}, Env: {auth_env})") logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env. Pending downloads counter will not be decremented. (Profile: {auth_profile_name}, Env: {auth_env}) Task had auth_profile_name: {task.get('auth_profile_name')}, auth_env: {task.get('auth_env')}")
if was_banned_by_parser: if was_banned_by_parser:
logger.info(f"[Worker {worker_id}] Profile '{locked_profile['name']}' was already banned by the log parser. Skipping unlock/cooldown.") logger.info(f"[Worker {worker_id}] Profile '{locked_profile['name']}' was already banned by the log parser. Skipping unlock/cooldown.")
else: else:
last_used_profile_name = locked_profile['name']
cooldown = None cooldown = None
# Only apply cooldown if a task was actually claimed and processed. # Only apply cooldown if a task was actually claimed and processed.
if claimed_task_path_host: if claimed_task_path_host:
# The enforcer is the single place where cooldown/policy changes are configured, because the
# enforcer can be restarted cheaply, while the stress-policy processes that handle auth and
# downloads simultaneously should not be restarted. In effect, a policy change propagates to
# every worker/machine without restarting each of them.
# DESIGN: The cooldown duration is not configured in the worker's policy. # DESIGN: The cooldown duration is not configured in the worker's policy.
# Instead, it is read from a central Redis key. This key is set by the # Instead, it is read from a central Redis key. This key is set by the
# policy-enforcer, making the enforcer the single source of truth for # policy-enforcer, making the enforcer the single source of truth for
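The DESIGN note above means the workers do not read a cooldown from their own policy file but from a value the policy-enforcer publishes centrally. A sketch of such a read path, assuming a redis-py style client and a purely illustrative key name (the real key is owned by the enforcer and is not shown in this hunk):

    import json

    def read_central_cooldown(redis_client, key="ytops:policy:unlock_cooldown_seconds"):
        """Return the enforcer-published cooldown in seconds, or None if unset.
        The key name and value format here are assumptions for illustration."""
        raw = redis_client.get(key)
        if raw is None:
            return None
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8", errors="replace")
        try:
            value = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return int(raw) if raw.isdigit() else None
        if isinstance(value, dict):
            return int(value.get("seconds", 0)) or None
        return int(value) or None

Because every worker re-reads the central value at unlock time, the enforcer can retune cooldowns without restarting the long-running auth and download workers.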

View File

@ -46,11 +46,14 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
no_task_streak = 0 no_task_streak = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set(): while not state_manager.shutdown_event.is_set():
locked_profile = None locked_profile = None
claimed_task_path = None claimed_task_path = None
auth_profile_name, auth_env = None, None # For finally block auth_profile_name, auth_env = None, None # For finally block
downloads_to_decrement = 0
formats_from_metadata, granularity_from_metadata = [], 'per_format'
try: try:
# 0. If no tasks were found, pause briefly. # 0. If no tasks were found, pause briefly.
if no_task_streak > 0: if no_task_streak > 0:
@ -61,9 +64,19 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
if profiles_in_prefix: if profiles_in_prefix:
state_counts = collections.Counter(p['state'] for p in profiles_in_prefix) state_counts = collections.Counter(p['state'] for p in profiles_in_prefix)
states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items())) states_summary = ', '.join(f"{count} {state}" for state, count in sorted(state_counts.items()))
logger.info(f"[Worker {worker_id}] No tasks found for available profiles. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s. (Streak: {no_task_streak})") base_log_msg = f"[Worker {worker_id}] No tasks found for available profiles. Pool status ({profile_prefix or '*'}*): {states_summary}. Pausing for {polling_interval}s."
else: else:
logger.info(f"[Worker {worker_id}] No tasks found for available profiles. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s. (Streak: {no_task_streak})") base_log_msg = f"[Worker {worker_id}] No tasks found for available profiles. No profiles found with prefix '{profile_prefix or '*'}'. Pausing for {polling_interval}s."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr) # Newline to clean up after dots
full_log_msg = f"{base_log_msg} (Streak: {no_task_streak})"
logger.info(full_log_msg)
last_no_task_log_msg = base_log_msg
# --- End diagnostic logging --- # --- End diagnostic logging ---
time.sleep(polling_interval) time.sleep(polling_interval)
if state_manager.shutdown_event.is_set(): continue if state_manager.shutdown_event.is_set(): continue
@ -83,6 +96,9 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
if claimed_task_path: if claimed_task_path:
no_task_streak = 0 # Reset streak no_task_streak = 0 # Reset streak
if last_no_task_log_msg:
print(file=sys.stderr) # Newline to clean up after dots
last_no_task_log_msg = ""
# --- Read metadata before processing/deleting file --- # --- Read metadata before processing/deleting file ---
try: try:
@ -91,6 +107,18 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
metadata = info_data.get('_ytops_metadata', {}) metadata = info_data.get('_ytops_metadata', {})
auth_profile_name = metadata.get('profile_name') auth_profile_name = metadata.get('profile_name')
auth_env = metadata.get('auth_env') auth_env = metadata.get('auth_env')
# Use .get() without a default to distinguish "not present" (None) from "present and empty" ([]).
raw_formats_from_metadata = metadata.get('formats_requested')
granularity_from_metadata = metadata.get('download_task_granularity', 'per_format')
# Normalize to a list for simulation logic, while preserving the original raw state for decrement logic.
formats_from_metadata = raw_formats_from_metadata
if formats_from_metadata is None:
formats_from_metadata = []
elif isinstance(formats_from_metadata, str):
formats_from_metadata = [f.strip() for f in formats_from_metadata.split(',')]
elif not isinstance(formats_from_metadata, list):
formats_from_metadata = []
except (IOError, json.JSONDecodeError) as e: except (IOError, json.JSONDecodeError) as e:
logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.") logger.error(f"CRITICAL: Could not read or parse task file '{claimed_task_path.name}': {e}. This task will be skipped, but the pending downloads counter CANNOT be decremented.")
continue # Skip to finally block to unlock profile continue # Skip to finally block to unlock profile
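The metadata handling above deliberately distinguishes a missing formats_requested key from an explicit empty list. A sketch of that normalization, reusing the same names:

    def normalize_formats_requested(raw_formats_from_metadata):
        """Return (raw_value, normalized_list); raw_value is None only when the
        key was absent, which later decides the fallback decrement count."""
        raw = raw_formats_from_metadata
        if raw is None:
            return None, []
        if isinstance(raw, str):
            return raw, [f.strip() for f in raw.split(',')]
        if isinstance(raw, list):
            return raw, raw
        return raw, []

    assert normalize_formats_requested(None) == (None, [])
    assert normalize_formats_requested("18, 140")[1] == ["18", "140"]
    assert normalize_formats_requested([]) == ([], [])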
@ -152,10 +180,43 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
logger.info(f"[Worker {worker_id}] [{profile_name}] Processing task '{claimed_task_path.name}'...") logger.info(f"[Worker {worker_id}] [{profile_name}] Processing task '{claimed_task_path.name}'...")
if args.dummy or args.dummy_batch: if args.dummy or args.dummy_batch:
logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DIRECT DOWNLOAD ==========") logger.info(f"========== [Worker {worker_id}] BEGIN DUMMY DOCKER DOWNLOAD PER-FORMAT SIMULATION ==========")
logger.info(f"[Worker {worker_id}] Profile: {profile_name} | Task: {claimed_task_path.name}") logger.info(f"[Worker {worker_id}] Profile: {profile_name} | Task: {claimed_task_path.name}")
logger.info(f"[Worker {worker_id}] Would run command: {' '.join(shlex.quote(s) for s in cmd)}")
logger.info(f"[Worker {worker_id}] With environment: {custom_env}") formats_to_simulate = []
# Prioritize formats from the info.json metadata if the key was present
if raw_formats_from_metadata is not None:
formats_to_simulate = formats_from_metadata
logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for {len(formats_to_simulate)} formats from info.json: {formats_to_simulate}")
else:
# Fallback to policy file if metadata key is absent
formats_config = d_policy.get('formats')
if isinstance(formats_config, str):
formats_to_simulate = [f.strip() for f in formats_config.split(',')]
elif isinstance(formats_config, list):
formats_to_simulate = formats_config
if not formats_to_simulate and raw_formats_from_metadata is None:
logger.info(f"[Worker {worker_id}] DUMMY: No format specified in task file or policy. Simulating a single download for backward compatibility.")
formats_to_simulate = ['dummy_format']
# For counting purposes, we should still decrement appropriately.
# Both granularities treat this backward-compatibility case as a single download.
downloads_to_decrement = 1
# If granularity was 'per_url', we only decrement by 1, even if multiple
# format selectors were passed (e.g., as one string).
if granularity_from_metadata == 'per_url':
downloads_to_decrement = 1
else: # per_format or not specified
if raw_formats_from_metadata is not None:
# If key was present, trust the count from metadata. Can be 0.
downloads_to_decrement = len(formats_from_metadata)
else:
# Key was absent. Decrement by the number of formats we are simulating
# from policy, or 1 as a final fallback for old files.
downloads_to_decrement = len(formats_to_simulate) if formats_to_simulate else 1
dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {}) dummy_settings = policy.get('settings', {}).get('dummy_simulation_settings', {})
min_seconds = dummy_settings.get('download_min_seconds', 0.5) min_seconds = dummy_settings.get('download_min_seconds', 0.5)
@ -163,46 +224,39 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
failure_rate = dummy_settings.get('download_failure_rate', 0.0) failure_rate = dummy_settings.get('download_failure_rate', 0.0)
skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0) skipped_rate = dummy_settings.get('download_skipped_failure_rate', 0.0)
-    time.sleep(random.uniform(min_seconds, max_seconds))
-
-    rand_val = random.random()
-    should_fail_skipped = rand_val < skipped_rate
-    should_fail_fatal = not should_fail_skipped and rand_val < (skipped_rate + failure_rate)
-    success = False
-    details = ""
-    error_type = None
-    is_tolerated_error = False
-
-    if should_fail_skipped:
-        logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating skipped download failure for task '{claimed_task_path.name}'.")
-        details = "Dummy skipped failure"
-        error_type = "DummySkippedFailure"
-        is_tolerated_error = True
-        profile_manager_instance.record_activity(profile_name, 'tolerated_error')
-    elif should_fail_fatal:
-        logger.warning(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating fatal download failure for task '{claimed_task_path.name}'.")
-        details = "Dummy fatal failure"
-        error_type = "DummyFailure"
-        profile_manager_instance.record_activity(profile_name, 'download_error')
-    else:
-        logger.info(f"[Worker {worker_id}] [{profile_name}] DUMMY: Simulating download success for task '{claimed_task_path.name}'.")
-        success = True
-        details = "Dummy success"
-        profile_manager_instance.record_activity(profile_name, 'download')
-
-    event = {
-        'type': 'direct_download',
-        'profile': profile_name,
-        'proxy_url': locked_profile['proxy'],
-        'success': success,
-        'details': details,
-        'error_type': error_type,
-        'is_tolerated_error': is_tolerated_error
-    }
-    state_manager.log_event(event)
-    logger.info(f"========== [Worker {worker_id}] END DUMMY DIRECT DOWNLOAD ==========")
+    for i, format_id in enumerate(formats_to_simulate):
+        logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for format '{format_id}'...")
+        sim_duration = random.uniform(min_seconds, max_seconds)
+        logger.info(f"[Worker {worker_id}] DUMMY: Simulating download for {sim_duration:.2f}s (from policy range {min_seconds}-{max_seconds}s).")
+        time.sleep(sim_duration)
+
+        rand_val = random.random()
+        is_skipped = rand_val < skipped_rate
+        is_fatal = not is_skipped and rand_val < (skipped_rate + failure_rate)
+
+        if is_skipped:
+            logger.warning(f"[Worker {worker_id}] DUMMY: Simulating skipped download for format '{format_id}'.")
+            profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=True)
+        elif is_fatal:
+            logger.error(f"[Worker {worker_id}] DUMMY: Simulating fatal download failure for format '{format_id}'.")
+            profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=True)
+        else: # success
+            logger.info(f"[Worker {worker_id}] DUMMY: Simulating download success for format '{format_id}'.")
+            profile_manager_instance.record_activity(profile_name, 'download', is_dummy=True)
+
+    logger.info(f"========== [Worker {worker_id}] END DUMMY DOCKER DOWNLOAD SIMULATION ==========")
+
+    # Determine downloads_to_decrement based on metadata
+    if granularity_from_metadata == 'per_url':
+        downloads_to_decrement = 1
+    else: # per_format or not specified
+        if raw_formats_from_metadata is not None:
+            # If key was present, trust the count. It can be 0.
+            downloads_to_decrement = len(formats_from_metadata)
+        else:
+            # Key was absent, fall back to 1 for old info.jsons
+            downloads_to_decrement = 1
 else:
     # --- Real execution ---
     logger.info(f"[Worker {worker_id}] [{profile_name}] Running command: {' '.join(shlex.quote(s) for s in cmd)}")
     retcode, stdout, stderr = run_command(
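Each simulated per-format attempt in the hunk above is one weighted draw against the dummy settings. A sketch of just that sampling step, with the rates named as they appear in dummy_simulation_settings:

    import random

    def simulate_dummy_outcome(skipped_rate, failure_rate, rng=random):
        """Map a uniform draw onto the three outcomes used above:
        [0, skipped) -> tolerated_error, [skipped, skipped+failure) -> download_error, else download."""
        rand_val = rng.random()
        if rand_val < skipped_rate:
            return 'tolerated_error'
        if rand_val < skipped_rate + failure_rate:
            return 'download_error'
        return 'download'

    # Roughly 10% skipped, 20% fatal, 70% success over many draws.
    sample = [simulate_dummy_outcome(0.1, 0.2) for _ in range(1000)]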
@ -268,7 +322,13 @@ def run_direct_download_worker(worker_id, policy, state_manager, args, profile_m
if claimed_task_path and auth_profile_name and auth_env: if claimed_task_path and auth_profile_name and auth_env:
auth_manager = get_auth_manager(profile_manager_instance, auth_env) auth_manager = get_auth_manager(profile_manager_instance, auth_env)
if auth_manager: if auth_manager:
auth_manager.decrement_pending_downloads(auth_profile_name) if downloads_to_decrement > 0:
auth_manager.increment_pending_downloads(auth_profile_name, count=-downloads_to_decrement)
elif downloads_to_decrement == 0:
logger.warning(f"[Worker {worker_id}] `downloads_to_decrement` was zero. Pending downloads counter for '{auth_profile_name}' will not be changed.")
else:
# This shouldn't happen, but log it
logger.error(f"[Worker {worker_id}] `downloads_to_decrement` was negative: {downloads_to_decrement}")
else: else:
logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.") logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.")
elif claimed_task_path: elif claimed_task_path:

View File

@ -20,6 +20,7 @@ from copy import deepcopy
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple, Union from typing import Dict, List, Optional, Any, Tuple, Union
from urllib.parse import urlparse, parse_qs
from . import utils as sp_utils from . import utils as sp_utils
from .process_runners import run_command, run_docker_container, get_worker_id from .process_runners import run_command, run_docker_container, get_worker_id
@ -36,6 +37,7 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
exec_control = policy.get('execution_control', {}) exec_control = policy.get('execution_control', {})
gen_policy = policy.get('info_json_generation_policy', {}) gen_policy = policy.get('info_json_generation_policy', {})
queue_policy = policy.get('queue_policy', {}) queue_policy = policy.get('queue_policy', {})
task_granularity = gen_policy.get('download_task_granularity', 'per_format')
profile_prefix = gen_policy.get('profile_prefix') profile_prefix = gen_policy.get('profile_prefix')
if not profile_prefix: if not profile_prefix:
@ -54,35 +56,58 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
batch_size = queue_policy.get('batch_size', 1) batch_size = queue_policy.get('batch_size', 1)
logger.info(f"[Worker {worker_id}] Auth worker configured to process tasks in batches of {batch_size}.") logger.info(f"[Worker {worker_id}] Auth worker configured to process tasks in batches of {batch_size}.")
task_counter = 0 task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set(): while not state_manager.shutdown_event.is_set():
locked_profile = None locked_profile = None
tasks = [] tasks = []
try: try:
# 1. Lock a profile FIRST # 1. Get a batch of tasks from the queue FIRST
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) tasks = state_manager.get_auth_tasks_batch(batch_size)
if not tasks:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
base_log_msg = f"[Worker {worker_id}] No tasks available in queue, polling."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stdout, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
logger.info(base_log_msg)
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval)
continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
# 2. Lock a profile, trying any of the specified prefixes
locked_profile = None
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
else:
# Fallback for empty/no prefix, which means lock any available profile
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if not locked_profile: if not locked_profile:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1) polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.debug(f"[Worker {worker_id}] No profiles available to lock. Sleeping for {polling_interval}s.") logger.warning(f"[Worker {worker_id}] No profiles available for {len(tasks)} task(s). Re-queueing and sleeping for {polling_interval}s.")
state_manager.add_auth_tasks_batch(tasks)
time.sleep(polling_interval) time.sleep(polling_interval)
continue continue
profile_name = locked_profile['name'] profile_name = locked_profile['name']
proxy_url = locked_profile['proxy'] proxy_url = locked_profile['proxy']
# 2. Get a batch of tasks from the queue
tasks = state_manager.get_auth_tasks_batch(batch_size)
if not tasks:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.debug(f"[Worker {worker_id}] No tasks available for profile '{profile_name}'. Unlocking and sleeping for {polling_interval}s.")
# Unlock immediately since we have no work to do.
# No cooldown is applied here to make the profile available again quickly.
profile_manager_instance.unlock_profile(profile_name, owner=owner_id)
locked_profile = None # To prevent double-unlock in finally
time.sleep(polling_interval)
continue
logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' to process a batch of {len(tasks)} tasks.") logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' to process a batch of {len(tasks)} tasks.")
# 3. Process each task in the batch # 3. Process each task in the batch
@ -129,13 +154,34 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
stderr = "Dummy fatal failure" stderr = "Dummy fatal failure"
else: else:
success = True success = True
video_id = sp_utils.get_video_id(url) or f"dummy_{random.randint(1000, 9999)}" # In dummy mode, robustly extract ID from URL to avoid 'unknown_video_id'
try:
# First, try the robust library function
video_id = sp_utils.get_video_id(url)
if not video_id and url:
# Fallback parsing for dummy URLs like "...?v=dummy_0099"
parsed_url = urlparse(url)
video_id = parse_qs(parsed_url.query).get('v', [None])[0]
# Additional fallback for URLs like "youtube.com/watch?v=dummy_0028"
if not video_id and 'dummy_' in url:
dummy_match = re.search(r'dummy_\d+', url)
if dummy_match:
video_id = dummy_match.group(0)
except Exception:
video_id = None # Ensure video_id is None on parsing failure
# If all extraction methods fail, use the task_id as a reliable fallback.
if not video_id:
logger.debug(f"Could not extract video ID from URL '{url}', using task ID '{task_id}' for dummy data.")
video_id = task_id
info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]} info_data = {'id': video_id, 'title': f'Dummy Video {video_id}', '_dummy': True, 'formats': [{'format_id': '18'}, {'format_id': '140'}]}
else: else:
client, req_params = state_manager.get_client_for_request(profile_name, gen_policy) client, req_params = state_manager.get_client_for_request(profile_name, gen_policy)
cmd = [ cmd = [
sys.executable, '-m', 'ytops_client.cli', 'get-info', sys.executable, '-m', 'ytops_client.cli', 'get-info',
'--client', client, '--profile', profile_name '--client', client or '', '--profile', profile_name
] ]
if proxy_url: if proxy_url:
cmd.extend(['--proxy', proxy_url]) cmd.extend(['--proxy', proxy_url])
@ -173,61 +219,143 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
if success and info_data: if success and info_data:
try: try:
auth_env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '') auth_env_name = profile_manager_instance.key_prefix.replace('_profile_mgmt_', '')
info_data['_ytops_metadata'] = { is_dummy_run = args.dummy or args.dummy_batch
# --- 1. Determine which formats will be requested ---
formats_to_download = queue_policy.get('formats_to_download')
requested_formats_list = []
if formats_to_download:
if task_granularity == 'per_url':
format_selector = formats_to_download
if isinstance(formats_to_download, list):
format_selector = ",".join(formats_to_download)
requested_formats_list = [format_selector]
else: # per_format
formats_source = formats_to_download
if formats_source == 'from_download_policy':
formats_source = policy.get('download_policy', {}).get('formats')
if formats_source == 'all':
requested_formats_list = [f['format_id'] for f in info_data.get('formats', [])]
elif isinstance(formats_source, list):
requested_formats_list = formats_source
elif isinstance(formats_source, str):
# A comma-separated string of format groups
requested_formats_list = [f.strip() for f in formats_source.split(',')]
elif formats_source:
# Handles integer format IDs or other non-string/list values
requested_formats_list = [str(formats_source)]
else:
# formats_source is None or empty
requested_formats_list = []
# --- 2. Create download tasks and save info.json(s) based on granularity ---
download_tasks_to_create = []
created_file_paths = []
if task_granularity == 'per_format' and requested_formats_list:
for i, format_id in enumerate(requested_formats_list):
# Deepcopy to avoid modifying the original info_data in the loop
task_info_data = deepcopy(info_data)
metadata = {
'profile_name': profile_name, 'proxy_url': proxy_url, 'profile_name': profile_name, 'proxy_url': proxy_url,
'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(), 'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(),
'task_id': task_id, 'url': url, 'auth_env': auth_env_name 'task_id': task_id, 'url': url, 'auth_env': auth_env_name,
'formats_requested': [format_id], # This task is for ONE format
'download_task_granularity': task_granularity
} }
if is_dummy_run: metadata['_dummy'] = True
task_info_data['_ytops_metadata'] = metadata
final_path = None final_path = None
if save_dir: if save_dir:
video_id = info_data.get('id', 'unknown') video_id = task_info_data.get('id') or task_id or 'unknown_video_id'
sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy' sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy'
new_name = f"{video_id}-{profile_name}-{sanitized_proxy}.json" # Sanitize format_id for use in filename
sanitized_format = re.sub(r'[^a-zA-Z0-9_-]', '_', format_id)
# Add index `i` to guarantee uniqueness even if sanitized format IDs clash
new_name = f"{video_id}-{profile_name}-{sanitized_proxy}-fmt{i}_{sanitized_format}.info.json"
final_path = os.path.join(save_dir, new_name)
with open(final_path, 'w', encoding='utf-8') as f:
json.dump(task_info_data, f, indent=2)
logger.info(f"[Worker {worker_id}] [{profile_name}] Saved format-specific info.json to '{final_path}'")
created_file_paths.append(final_path)
download_tasks_to_create.append({
'info_json_path': final_path, 'format_id': format_id,
'video_id': task_info_data.get('id'), 'url': url,
'auth_profile_name': profile_name, 'proxy_url': proxy_url,
'auth_env': auth_env_name,
'original_task': task
})
else: # per_url or default (including per_format with no formats)
metadata = {
'profile_name': profile_name, 'proxy_url': proxy_url,
'generation_timestamp_utc': datetime.now(timezone.utc).isoformat(),
'task_id': task_id, 'url': url, 'auth_env': auth_env_name,
'formats_requested': requested_formats_list,
'download_task_granularity': task_granularity
}
if is_dummy_run: metadata['_dummy'] = True
info_data['_ytops_metadata'] = metadata
final_path = None
if save_dir:
video_id = info_data.get('id') or task_id or 'unknown_video_id'
sanitized_proxy = re.sub(r'[:/]', '_', proxy_url) if proxy_url else 'noproxy'
new_name = f"{video_id}-{profile_name}-{sanitized_proxy}.info.json"
final_path = os.path.join(save_dir, new_name) final_path = os.path.join(save_dir, new_name)
with open(final_path, 'w', encoding='utf-8') as f: with open(final_path, 'w', encoding='utf-8') as f:
json.dump(info_data, f, indent=2) json.dump(info_data, f, indent=2)
logger.info(f"[Worker {worker_id}] [{profile_name}] Saved info.json to '{final_path}'") logger.info(f"[Worker {worker_id}] [{profile_name}] Saved info.json to '{final_path}'")
-else:
-    # This case means auth-only (no downloads) and no save_dir specified.
-    # The info.json is not persisted.
-    logger.debug(f"[Worker {worker_id}] [{profile_name}] Auth-only task succeeded. No save_dir, so info.json is not saved.")
-
-profile_manager_instance.record_activity(profile_name, 'success')
-
-formats_to_download = queue_policy.get('formats_to_download')
-download_tasks = []
-if formats_to_download:
-    task_formats = []
-    if formats_to_download == 'all':
-        task_formats = [f['format_id'] for f in info_data.get('formats', [])]
-    elif isinstance(formats_to_download, list):
-        task_formats = formats_to_download
-    else:
-        task_formats = [str(formats_to_download)]
-    for format_id in task_formats:
-        download_tasks.append({
-            'info_json_path': final_path, 'format_id': format_id,
-            'video_id': info_data.get('id'), 'url': url,
-            'auth_profile_name': profile_name, 'proxy_url': proxy_url,
-            'auth_env': auth_env_name,
-            'original_task': task
-        })
-
-if download_tasks:
-    added_count = state_manager.add_download_tasks_batch(download_tasks)
-    logger.info(f"[Worker {worker_id}] [{profile_name}] Added {added_count} download tasks to queue")
-    profile_manager_instance.increment_pending_downloads(profile_name, count=added_count)
-
-state_manager.report_auth_success(task_id, {
-    "url": url, "video_id": info_data.get('id'), "profile_name": profile_name,
-    "proxy_url": proxy_url, "info_json_path": final_path,
-    "download_tasks_created": len(download_tasks)
-})
+        created_file_paths.append(final_path)
+
+    # For per_url, requested_formats_list should contain a single format selector string.
+    # The loop will run once, creating one task.
+    if requested_formats_list:
+        for format_selector in requested_formats_list:
+            download_tasks_to_create.append({
+                'info_json_path': final_path, 'format_id': format_selector,
+                'video_id': info_data.get('id'), 'url': url,
+                'auth_profile_name': profile_name, 'proxy_url': proxy_url,
+                'auth_env': auth_env_name,
+                'original_task': task
+            })
+    else: # Handle the case where requested_formats_list is empty
+        logger.debug(f"[Worker {worker_id}] [{profile_name}] No formats requested, no download tasks created.")
+
+profile_manager_instance.record_activity(profile_name, 'success', is_dummy=(args.dummy or args.dummy_batch))
+
+# --- 3. Dispatch download tasks ---
+create_download_tasks = queue_policy.get('create_download_tasks', True)
+if download_tasks_to_create:
+    num_tasks_to_process = 0
+    if create_download_tasks:
+        added_count = state_manager.add_download_tasks_batch(download_tasks_to_create)
+        logger.info(f"[Worker {worker_id}] [{profile_name}] Added {added_count} download tasks to queue")
+        num_tasks_to_process = added_count
+    else:
+        num_tasks = len(download_tasks_to_create)
+        logger.info(f"[Worker {worker_id}] [{profile_name}] File-based workflow: created {num_tasks} tasks (no queue tasks created)")
+        num_tasks_to_process = num_tasks
+
+    if num_tasks_to_process > 0:
+        profile_manager_instance.increment_pending_downloads(profile_name, count=num_tasks_to_process)
+        profile_manager_instance.record_predicted_downloads(profile_name, count=num_tasks_to_process, is_dummy=(args.dummy or args.dummy_batch))
+
+report_payload = {
+    "url": url, "video_id": info_data.get('id'), "profile_name": profile_name,
+    "proxy_url": proxy_url, "info_json_path": created_file_paths[0] if len(created_file_paths) == 1 else created_file_paths,
+    "download_tasks_created": len(download_tasks_to_create)
+}
+if is_dummy_run:
+    report_payload['_dummy'] = True
+state_manager.report_auth_success(task_id, report_payload)
except Exception as e: except Exception as e:
logger.error(f"[Worker {worker_id}] [{profile_name}] Error processing successful auth result: {e}", exc_info=True) logger.error(f"[Worker {worker_id}] [{profile_name}] Error processing successful auth result: {e}", exc_info=True)
profile_manager_instance.record_activity(profile_name, 'failure') profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_failure(task_id, {"error": f"Error processing info.json: {str(e)}", "url": url}) state_manager.report_auth_failure(task_id, {"error": f"Error processing info.json: {str(e)}", "url": url})
else: else:
is_bot_error = "Sign in to confirm you're not a bot" in stderr is_bot_error = "Sign in to confirm you're not a bot" in stderr
@ -240,19 +368,19 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
if is_unavailable or is_private or is_deleted or is_dummy_skipped: if is_unavailable or is_private or is_deleted or is_dummy_skipped:
reason = "Video unavailable" if is_unavailable else "Private video" if is_private else "Video removed" if is_deleted else "Dummy skipped" reason = "Video unavailable" if is_unavailable else "Private video" if is_private else "Video removed" if is_deleted else "Dummy skipped"
logger.warning(f"[Worker {worker_id}] [{profile_name}] Auth skipped for {url}: {reason}") logger.warning(f"[Worker {worker_id}] [{profile_name}] Auth skipped for {url}: {reason}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error') profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_skipped(task_id, {"url": url, "reason": reason, "stderr": stderr}) state_manager.report_auth_skipped(task_id, {"url": url, "reason": reason, "stderr": stderr})
else: else:
error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else "Dummy fatal failure" if "Dummy fatal failure" in stderr else f"Exit code {retcode}" error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else "Dummy fatal failure" if "Dummy fatal failure" in stderr else f"Exit code {retcode}"
logger.error(f"[Worker {worker_id}] [{profile_name}] Authentication failed ({error_type}): {url}") logger.error(f"[Worker {worker_id}] [{profile_name}] Authentication failed ({error_type}): {url}")
profile_manager_instance.record_activity(profile_name, 'failure') profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_auth_failure(task_id, {"url": url, "error_type": error_type, "stderr": stderr, "exit_code": retcode}) state_manager.report_auth_failure(task_id, {"url": url, "error_type": error_type, "stderr": stderr, "exit_code": retcode})
except Exception as e: except Exception as e:
logger.error(f"[Worker {worker_id}] [{profile_name}] Unexpected error processing task {task_id}: {e}", exc_info=True) logger.error(f"[Worker {worker_id}] [{profile_name}] Unexpected error processing task {task_id}: {e}", exc_info=True)
if task_id: if task_id:
state_manager.report_auth_failure(task_id, {"error": f"Unexpected error: {str(e)}", "url": url or "unknown"}) state_manager.report_auth_failure(task_id, {"error": f"Unexpected error: {str(e)}", "url": url or "unknown"})
profile_manager_instance.record_activity(profile_name, 'failure') profile_manager_instance.record_activity(profile_name, 'failure', is_dummy=(args.dummy or args.dummy_batch))
finally: finally:
if temp_task_dir and os.path.exists(temp_task_dir): if temp_task_dir and os.path.exists(temp_task_dir):
shutil.rmtree(temp_task_dir) shutil.rmtree(temp_task_dir)
@ -262,7 +390,7 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
except Exception as e: except Exception as e:
logger.error(f"[Worker {worker_id}] Unexpected error in outer worker loop: {e}", exc_info=True) logger.error(f"[Worker {worker_id}] Unexpected error in outer worker loop: {e}", exc_info=True)
if locked_profile: if locked_profile:
profile_manager_instance.record_activity(locked_profile['name'], 'failure') profile_manager_instance.record_activity(locked_profile['name'], 'failure', is_dummy=(args.dummy or args.dummy_batch))
finally: finally:
if locked_profile: if locked_profile:
@ -276,16 +404,27 @@ def run_queue_auth_worker(worker_id, policy, state_manager, args, profile_manage
elif isinstance(val, int): elif isinstance(val, int):
cooldown = val cooldown = val
except (json.JSONDecodeError, TypeError): except (json.JSONDecodeError, TypeError):
if cooldown_config.isdigit(): if isinstance(cooldown_config, str) and cooldown_config.isdigit():
cooldown = int(cooldown_config) cooldown = int(cooldown_config)
if cooldown: # For auth simulation, check if profile has pending downloads
logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {cooldown}s.") # If so, don't apply cooldown to avoid conflicting with enforcer
profile_data = profile_manager_instance.get_profile(locked_profile['name'])
should_apply_cooldown = cooldown
if profile_data:
pending_downloads = profile_manager_instance.get_pending_downloads(locked_profile['name'])
rest_reason = profile_data.get('rest_reason')
if pending_downloads > 0 or rest_reason == 'waiting_downloads':
should_apply_cooldown = None
logger.info(f"[Worker {worker_id}] Auth profile '{locked_profile['name']}' has pending downloads or is waiting for downloads. Not applying cooldown.")
if should_apply_cooldown:
logger.info(f"[Worker {worker_id}] Putting profile '{locked_profile['name']}' into COOLDOWN for {should_apply_cooldown}s.")
profile_manager_instance.unlock_profile( profile_manager_instance.unlock_profile(
locked_profile['name'], locked_profile['name'],
owner=owner_id, owner=owner_id,
rest_for_seconds=cooldown rest_for_seconds=should_apply_cooldown
) )
logger.info(f"[Worker {worker_id}] Queue auth worker exiting.") logger.info(f"[Worker {worker_id}] Queue auth worker exiting.")
@ -311,6 +450,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
logger.info(f"[Worker {worker_id}] Will save downloads to '{output_dir}'") logger.info(f"[Worker {worker_id}] Will save downloads to '{output_dir}'")
task_counter = 0 task_counter = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set(): while not state_manager.shutdown_event.is_set():
locked_profile = None locked_profile = None
@ -322,7 +462,14 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
task = state_manager.get_download_task() task = state_manager.get_download_task()
if not task: if not task:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1) polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.debug(f"[Worker {worker_id}] No download tasks available in queue. Sleeping for {polling_interval}s.") base_log_msg = f"[Worker {worker_id}] No download tasks available in queue."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
logger.debug(f"{base_log_msg} Sleeping for {polling_interval}s.")
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval) time.sleep(polling_interval)
continue continue
@ -377,20 +524,45 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
if specific_profile: if specific_profile:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, specific_profile_name=specific_profile) locked_profile = profile_manager_instance.lock_profile(owner=owner_id, specific_profile_name=specific_profile)
if not locked_profile: if not locked_profile:
logger.warning(f"[Worker {worker_id}] Could not lock specific profile '{specific_profile}'. Trying any profile with prefix.") logger.warning(f"[Worker {worker_id}] Could not lock specific profile '{specific_profile}'. Trying any profile from configured prefixes.")
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix)
else: if not locked_profile:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=profile_prefix) # Lock from any of the specified prefixes
prefixes_to_try = []
if profile_prefix:
prefixes_to_try = [p.strip() for p in profile_prefix.split(',') if p.strip()]
if prefixes_to_try:
random.shuffle(prefixes_to_try)
for prefix in prefixes_to_try:
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=prefix)
if locked_profile:
break
if not locked_profile:
# Fallback for empty/no prefix, or if no profile from list was lockable
locked_profile = profile_manager_instance.lock_profile(owner=owner_id, profile_prefix=None)
if not locked_profile: if not locked_profile:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1) polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.warning(f"[Worker {worker_id}] No profiles available for task {task_id}. Re-queueing and sleeping for {polling_interval}s.") base_log_msg = f"[Worker {worker_id}] No profiles available, polling."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
logger.info(f"{base_log_msg} Re-queueing task {task_id} and sleeping for {polling_interval}s.")
last_no_task_log_msg = base_log_msg
# Re-queue the task by adding it back to the inbox. # Re-queue the task by adding it back to the inbox.
state_manager.add_download_tasks_batch([task]) state_manager.add_download_tasks_batch([task])
# The 'in_progress' marker for this attempt will be cleaned up by the finally block. # The 'in_progress' marker for this attempt will be cleaned up by the finally block.
time.sleep(polling_interval) time.sleep(polling_interval)
continue continue
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
profile_name = locked_profile['name'] profile_name = locked_profile['name']
proxy_url = locked_profile['proxy'] proxy_url = locked_profile['proxy']
logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' with proxy '{proxy_url}'") logger.info(f"[Worker {worker_id}] Locked profile '{profile_name}' with proxy '{proxy_url}'")
@@ -480,7 +652,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
except Exception as e:
logger.error(f"[Worker {worker_id}] Failed to move files to Airflow-ready directory: {e}", exc_info=True)
profile_manager_instance.record_activity(profile_name, 'download_success')
profile_manager_instance.record_activity(profile_name, 'download', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_download_success(task_id, {
"video_id": video_id, "url": url, "format_id": format_id,
"profile_name": profile_name, "proxy_url": proxy_url,
@@ -508,7 +680,7 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
if is_unavailable or is_format_error:
logger.warning(f"[Worker {worker_id}] Download skipped: {video_id or url} format {format_id}")
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_download_skipped(task_id, {
"video_id": video_id, "url": url, "format_id": format_id,
"reason": "Video unavailable" if is_unavailable else "Format not available", "stderr": stderr
@@ -516,23 +688,13 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
else:
error_type = "Bot detection" if is_bot_error else "Timeout" if is_timeout_error else f"Exit code {retcode}"
logger.error(f"[Worker {worker_id}] Download failed ({error_type}): {video_id or url} format {format_id}")
profile_manager_instance.record_activity(profile_name, 'download_error')
profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=(args.dummy or args.dummy_batch))
state_manager.report_download_failure(task_id, {
"video_id": video_id, "url": url, "format_id": format_id,
"error_type": error_type, "stderr": stderr, "exit_code": retcode,
"original_task": task
})
# Decrement pending downloads counter on the original auth profile, regardless of outcome
if auth_profile_name and auth_env:
auth_manager = get_auth_manager(profile_manager_instance, auth_env)
if auth_manager:
auth_manager.decrement_pending_downloads(auth_profile_name)
else:
logger.error(f"Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.")
elif task: # Only warn if we had a task but couldn't get metadata
logger.warning(f"Could not find auth profile name and/or auth_env in info.json metadata. Pending downloads counter will not be decremented.")
except Exception as e:
logger.error(f"[Worker {worker_id}] Unexpected error: {e}", exc_info=True)
if task and task_id:
@@ -543,9 +705,22 @@ def run_queue_download_worker(worker_id, policy, state_manager, args, profile_ma
"error": f"Unexpected error: {str(e)}", "original_task": task
})
if locked_profile:
profile_manager_instance.record_activity(locked_profile['name'], 'download_error')
profile_manager_instance.record_activity(locked_profile['name'], 'download_error', is_dummy=(args.dummy or args.dummy_batch))
finally:
# Decrement pending downloads counter on the original auth profile, regardless of outcome.
# This is critical for releasing the auth profile from the 'waiting_downloads' state.
if task: # Only attempt to decrement if a task was actually pulled.
if auth_profile_name and auth_env:
auth_manager = get_auth_manager(profile_manager_instance, auth_env)
if auth_manager:
new_count = auth_manager.decrement_pending_downloads(auth_profile_name)
logger.info(f"[Worker {worker_id}] Decremented pending downloads for auth profile '{auth_profile_name}' in env '{auth_env}'. New count: {new_count}")
else:
logger.error(f"[Worker {worker_id}] Could not get auth profile manager for env '{auth_env}'. Pending downloads counter will not be decremented.")
else:
logger.warning(f"[Worker {worker_id}] Could not find auth profile name and/or auth_env in task metadata. Pending downloads counter will not be decremented.")
if locked_profile:
cooldown = None
cooldown_config = profile_manager_instance.get_config('unlock_cooldown_seconds')
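The decrement now lives in the worker's finally block so the auth profile's pending-downloads counter is released even when the download path raises. A minimal sketch of the try/finally shape, with a hypothetical do_download callable and the auth manager's decrement_pending_downloads call standing in for the full worker:

import logging

logger = logging.getLogger(__name__)

def process_task(task, auth_manager, do_download):
    """Run a download and always release the pending-downloads slot,
    mirroring the try/finally structure of the queue download worker."""
    try:
        do_download(task)
    finally:
        # Runs on success, failure, or an unexpected exception, so the auth
        # profile cannot get stuck in the 'waiting_downloads' state.
        new_count = auth_manager.decrement_pending_downloads(task["auth_profile_name"])
        logger.info("Released pending download slot, new count: %s", new_count)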

View File

@@ -904,6 +904,22 @@ class StateManager:
return self.queue_provider.remove_in_progress(self.queue_provider.AUTH_PROGRESS, task_id)
def add_auth_task(self, task: Dict) -> bool:
"""Add an authentication task to the queue."""
if not self.queue_provider:
logger.error("Queue provider not initialized")
return False
return self.queue_provider.add_task(self.queue_provider.AUTH_INBOX, task)
def add_auth_tasks_batch(self, tasks: List[Dict]) -> int:
"""Add a batch of authentication tasks to the queue."""
if not self.queue_provider:
logger.error("Queue provider not initialized")
return 0
return self.queue_provider.add_tasks_batch(self.queue_provider.AUTH_INBOX, tasks)
def add_download_task(self, task: Dict) -> bool:
"""Add a download task to the queue."""
if not self.queue_provider:
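The new add_auth_task/add_auth_tasks_batch helpers mirror the existing download-task methods, so callers can seed the auth inbox directly. A hedged usage sketch, assuming an initialized StateManager and an illustrative task payload shape:

from typing import Dict, List

def seed_auth_inbox(state_manager, urls: List[str], prefix: str = "user") -> int:
    """Push one auth task per URL onto the auth inbox via the new batch helper.
    Returns the number of tasks actually enqueued (0 if the queue provider is missing)."""
    tasks: List[Dict] = [{"url": u, "profile_prefix": prefix} for u in urls]
    return state_manager.add_auth_tasks_batch(tasks)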

View File

@@ -40,6 +40,7 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage
return []
no_task_streak = 0
last_no_task_log_msg = ""
while not state_manager.shutdown_event.is_set():
locked_profile = None
@@ -48,7 +49,15 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage
# 0. If no tasks were found previously, pause briefly.
if no_task_streak > 0:
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.info(f"[Worker {worker_id}] No tasks found in previous attempt(s). Pausing for {polling_interval}s. (Streak: {no_task_streak})")
base_log_msg = f"[Worker {worker_id}] No available tasks found for any active profiles."
if base_log_msg == last_no_task_log_msg:
print(".", end="", file=sys.stderr, flush=True)
else:
if last_no_task_log_msg:
print(file=sys.stderr)
full_log_msg = f"{base_log_msg} Pausing for {polling_interval}s. (Streak: {no_task_streak})"
logger.info(full_log_msg)
last_no_task_log_msg = base_log_msg
time.sleep(polling_interval)
if state_manager.shutdown_event.is_set(): continue
@@ -60,9 +69,6 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage
if not locked_profile:
# No task/profile combo was available.
no_task_streak += 1
polling_interval = exec_control.get('worker_polling_interval_seconds', 1)
logger.info(f"[Worker {worker_id}] No available tasks found for any active profiles. Pausing for {polling_interval}s.")
time.sleep(polling_interval)
continue
profile_name = locked_profile['name']
@@ -70,6 +76,9 @@ def run_throughput_worker(worker_id, policy, state_manager, args, profile_manage
# We have a task and a lock.
if claimed_task_path:
no_task_streak = 0 # Reset streak
if last_no_task_log_msg:
print(file=sys.stderr)
last_no_task_log_msg = ""
# 3. Process the task
try:
with open(claimed_task_path, 'r', encoding='utf-8') as f:

View File

@@ -319,7 +319,7 @@ def apply_overrides(policy, overrides):
return policy
def display_effective_policy(policy, name, sources=None, profile_names=None, original_workers_setting=None):
def display_effective_policy(policy, name, args, sources=None, profile_names=None, original_workers_setting=None):
"""Prints a human-readable summary of the effective policy."""
logger = logging.getLogger(__name__)
logger.info(f"--- Effective Policy: {name} ---")
@@ -328,6 +328,8 @@ def display_effective_policy(policy, name, sources=None, profile_names=None, ori
orchestration_mode = settings.get('orchestration_mode')
logger.info(f"Mode: {settings.get('mode', 'full_stack')}")
if args and args.profile_prefix:
logger.info(f"Profile Prefix (from CLI): {args.profile_prefix}")
if profile_names:
num_profiles = len(profile_names)
logger.info(f"Profiles found: {num_profiles}")

View File

@@ -188,7 +188,7 @@ def find_task_and_lock_profile(profile_manager, owner_id, profile_prefix, policy
if not info_json_dir:
return None, None
logger.info(f"[Worker {worker_id}] Scanning for tasks in '{info_json_dir}'...")
logger.debug(f"[Worker {worker_id}] Scanning for tasks in '{info_json_dir}'...")
# 1. Get all available task files and group by profile
try:
@@ -223,7 +223,21 @@ def find_task_and_lock_profile(profile_manager, owner_id, profile_prefix, policy
# 2. Get ACTIVE profiles from Redis.
active_profiles = profile_manager.list_profiles(state_filter='ACTIVE')
active_profile_names = {p['name'] for p in active_profiles if p['name'].startswith(profile_prefix or '')}
prefixes_to_check = []
if profile_prefix:
prefixes_to_check = [p.strip() for p in profile_prefix.split(',') if p.strip()]
active_profile_names = set()
if prefixes_to_check:
for p in active_profiles:
for prefix in prefixes_to_check:
if p['name'].startswith(prefix):
active_profile_names.add(p['name'])
break
else:
# If no prefixes are specified, consider all active profiles.
active_profile_names = {p['name'] for p in active_profiles}
# 3. Find profiles that are both ACTIVE and have tasks.
candidate_profiles = list(active_profile_names.intersection(tasks_by_profile.keys()))
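The prefix filter now accepts a comma-separated list instead of a single prefix, and an empty value matches all active profiles. A standalone sketch of that matching logic (function name is hypothetical):

from typing import Iterable, Optional, Set

def filter_by_prefixes(names: Iterable[str], profile_prefix: Optional[str]) -> Set[str]:
    """Return the names matching any comma-separated prefix; no prefix means match all."""
    prefixes = [p.strip() for p in (profile_prefix or "").split(",") if p.strip()]
    if not prefixes:
        return set(names)
    return {n for n in names if any(n.startswith(p) for p in prefixes)}

# e.g. filter_by_prefixes(["user1", "user2", "dl_a"], "user,dl_") keeps all three names.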
@@ -918,7 +932,8 @@ def process_info_json_cycle(path, content, policy, state_manager, args, running_
result['proxy_url'] = proxy_url
if profile_manager_instance and profile_name:
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
is_dummy = args.dummy or args.dummy_batch
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy)
state_manager.log_event(result)
results.append(result)
@@ -960,14 +975,15 @@ def process_info_json_cycle(path, content, policy, state_manager, args, running_
# --- Record activity on the DOWNLOAD profile that performed the work ---
if profile_manager_instance and profile_name:
is_dummy = args.dummy or args.dummy_batch
if result.get('success'):
profile_manager_instance.record_activity(profile_name, 'download')
profile_manager_instance.record_activity(profile_name, 'download', is_dummy=is_dummy)
elif result.get('error_type') == 'Cancelled':
pass # Do not record cancellations
elif result.get('is_tolerated_error'):
profile_manager_instance.record_activity(profile_name, 'tolerated_error')
profile_manager_instance.record_activity(profile_name, 'tolerated_error', is_dummy=is_dummy)
else:
profile_manager_instance.record_activity(profile_name, 'download_error')
profile_manager_instance.record_activity(profile_name, 'download_error', is_dummy=is_dummy)
state_manager.log_event(result)
results.append(result)
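Every record_activity call in this commit now forwards an is_dummy flag derived from --dummy/--dummy-batch, so dummy traffic can be separated from real traffic in profile statistics. A compact sketch of the dispatch shown in the hunk above, factored into a helper (the helper name is an assumption, the event names come from the diff):

def record_result_activity(profile_manager, profile_name, result, is_dummy):
    """Map a worker result onto a profile activity event, tagging dummy traffic."""
    if result.get("error_type") == "Cancelled":
        return  # cancellations are not recorded
    if result.get("success"):
        event = "download"
    elif result.get("is_tolerated_error"):
        event = "tolerated_error"
    else:
        event = "download_error"
    profile_manager.record_activity(profile_name, event, is_dummy=is_dummy)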

View File

@@ -495,7 +495,7 @@ def main_stress_policy(args):
exec_control['workers'] = calculated_workers
logger.info(f"Calculated 'auto' workers for throughput mode: {calculated_workers} (based on {len(matching_profiles)} profiles with prefix '{profile_prefix}').")
sp_utils.display_effective_policy(policy, policy_name, sources=[], original_workers_setting=original_workers_setting)
sp_utils.display_effective_policy(policy, policy_name, args, sources=[], original_workers_setting=original_workers_setting)
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
@@ -578,15 +578,45 @@ def main_stress_policy(args):
logger.info(f"Starting/resuming from URL index {start_index + 1}.")
# The worker's get_next_url_batch will respect this starting index.
sp_utils.display_effective_policy(policy, policy_name, sources=urls_list)
sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list)
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
worker_pools = exec_control.get('worker_pools', [])
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
if not worker_pools and exec_control.get('workers'):
futures = [
prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix')
executor.submit(run_direct_batch_worker, i, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
for i in range(workers)
]
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
for i in range(pool_workers):
assigned_prefix = prefixes[i % len(prefixes)]
worker_policy = deepcopy(policy)
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = assigned_prefix
worker_specs.append({
'func': run_direct_batch_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
# Wait for all workers to complete. They will exit their loops when no URLs are left.
concurrent.futures.wait(futures)
if shutdown_event.is_set():
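The single workers count is replaced by execution_control.worker_pools, where each pool carries its own worker count and a comma-separated profile_prefix; in the direct-batch path the prefixes are assigned round-robin across the pool's workers. A condensed sketch of that expansion under the pool shape shown above (run_worker and build_worker_specs are placeholders, not project functions):

from copy import deepcopy

def build_worker_specs(policy, exec_control, run_worker, cli_prefix=None):
    """Expand execution_control.worker_pools into per-worker (func, id, policy) specs."""
    pools = list(exec_control.get("worker_pools", []))
    if not pools and exec_control.get("workers"):
        # Backwards compatibility: a bare 'workers' count becomes one implicit pool.
        pools = [{"workers": exec_control["workers"], "profile_prefix": "user"}]
    if cli_prefix:
        for pool in pools:
            pool["profile_prefix"] = cli_prefix  # CLI --profile-prefix overrides pool prefixes
    specs = []
    for pool in pools:
        prefixes = [p.strip() for p in pool.get("profile_prefix", "").split(",") if p.strip()]
        if not prefixes:
            continue  # a pool without prefixes is skipped with a warning in the real code
        for i in range(pool.get("workers", 1)):
            worker_policy = deepcopy(policy)
            # Round-robin one prefix per worker, as in the direct-batch mode; the other
            # modes instead hand each worker the full comma-separated list.
            worker_policy.setdefault("info_json_generation_policy", {})["profile_prefix"] = prefixes[i % len(prefixes)]
            specs.append((run_worker, len(specs), worker_policy))
    return specs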
@@ -662,14 +692,46 @@ def main_stress_policy(args):
if start_index > 0:
logger.info(f"Starting/resuming from URL index {start_index + 1}.")
sp_utils.display_effective_policy(policy, policy_name, sources=urls_list)
sp_utils.display_effective_policy(policy, policy_name, args, sources=urls_list)
if args.dry_run: return 0
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
worker_pools = exec_control.get('worker_pools', [])
futures = [
if not worker_pools and exec_control.get('workers'):
executor.submit(run_direct_docker_worker, i, policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix')
for i in range(workers)
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
]
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_direct_docker_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, urls_list, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
# This worker runs until shutdown, like the download worker
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for direct docker workers to finish...")
@@ -686,14 +748,46 @@ def main_stress_policy(args):
logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}")
return 1
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
worker_pools = exec_control.get('worker_pools', [])
futures = [
if not worker_pools and exec_control.get('workers'):
executor.submit(run_direct_docker_download_worker, i, policy, state_manager, args, profile_manager_instance, running_processes, process_lock)
prefix = policy.get('download_policy', {}).get('profile_prefix')
for i in range(workers)
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
]
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_direct_docker_download_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, profile_manager_instance, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
# This worker runs until shutdown
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for direct docker download workers to finish...")
@@ -726,15 +820,46 @@ def main_stress_policy(args):
logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}")
return 1
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
worker_pools = exec_control.get('worker_pools', [])
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
if not worker_pools and exec_control.get('workers'):
futures = [
prefix = policy.get('download_policy', {}).get('profile_prefix')
executor.submit(run_direct_download_worker, i, policy, state_manager, args, download_manager, running_processes, process_lock)
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
for i in range(workers)
]
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_direct_download_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, download_manager, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for direct download workers to finish...")
concurrent.futures.wait(futures)
@@ -786,6 +911,15 @@ def main_stress_policy(args):
logger.error(f"Failed to create save directory '{save_dir}': {e}")
return 1
# In dummy mode, do not populate the queue. Warn the user instead.
if args.dummy or args.dummy_batch:
input_queue = queue_policy.get('input_queue', 'queue2_auth_inbox')
logger.warning("--- Dummy Mode Notice ---")
logger.warning(f"Dummy mode is enabled. The tool will NOT automatically populate the queue.")
logger.warning(f"Please ensure the input queue '{input_queue}' contains tasks for the workers to process.")
logger.warning(f"You can populate it using another tool, for example: ./bin/ytops-client queue push {input_queue} --payload-json '{{\"url\":\"https://youtu.be/dQw4w9WgXcQ\"}}' --count 100")
logger.warning("-------------------------")
# Requeue failed tasks if requested
if args.requeue_failed:
requeued = state_manager.requeue_failed_auth_tasks(
@@ -793,15 +927,46 @@ def main_stress_policy(args):
)
logger.info(f"Requeued {requeued} failed authentication tasks.")
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
worker_pools = exec_control.get('worker_pools', [])
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
if not worker_pools and exec_control.get('workers'):
futures = [
prefix = policy.get('info_json_generation_policy', {}).get('profile_prefix')
executor.submit(run_queue_auth_worker, i, policy, state_manager, args, auth_manager, running_processes, process_lock)
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
for i in range(workers)
]
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('info_json_generation_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_queue_auth_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, auth_manager, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for queue auth workers to finish...")
concurrent.futures.wait(futures)
@@ -843,6 +1008,17 @@ def main_stress_policy(args):
env_prefix=env_prefix
)
# In dummy mode, do not populate the queue. Warn the user instead.
if args.dummy or args.dummy_batch:
input_queue = queue_policy.get('input_queue', 'queue2_dl_inbox')
logger.warning("--- Dummy Mode Notice ---")
logger.warning(f"Dummy mode is enabled. The tool will NOT automatically populate the queue.")
logger.warning(f"Please ensure the input queue '{input_queue}' contains tasks for the workers to process.")
logger.warning("-------------------------")
# Get download policy
d_policy = policy.get('download_policy', {})
# Create output directory if specified
output_dir = d_policy.get('output_dir')
if output_dir:
@@ -860,15 +1036,46 @@ def main_stress_policy(args):
)
logger.info(f"Requeued {requeued} failed download tasks.")
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
workers = exec_control.get('workers', 1)
worker_pools = exec_control.get('worker_pools', [])
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
if not worker_pools and exec_control.get('workers'):
futures = [
prefix = policy.get('download_policy', {}).get('profile_prefix')
executor.submit(run_queue_download_worker, i, policy, state_manager, args, download_manager, running_processes, process_lock)
worker_pools.append({'workers': exec_control.get('workers'), 'profile_prefix': prefix or 'user'})
for i in range(workers)
]
if not worker_pools:
logger.error("No workers configured in policy (execution_control.workers or execution_control.worker_pools).")
return 1
if args.profile_prefix:
logger.info(f"CLI --profile-prefix '{args.profile_prefix}' is set, it will override any prefixes defined in worker_pools.")
for pool in worker_pools:
pool['profile_prefix'] = args.profile_prefix
worker_specs = []
worker_id_counter = 0
for pool in worker_pools:
pool_workers = pool.get('workers', 1)
prefix_str = pool.get('profile_prefix', '')
prefixes = [p.strip() for p in prefix_str.split(',') if p.strip()]
if not prefixes:
logger.warning(f"Worker pool has no profile_prefix. Skipping: {pool}")
continue
# Each worker in the pool gets the full list of prefixes from the pool configuration.
for i in range(pool_workers):
worker_policy = deepcopy(policy)
# The worker functions will now handle a comma-separated list of prefixes.
worker_policy.setdefault('download_policy', {})['profile_prefix'] = prefix_str
worker_specs.append({
'func': run_queue_download_worker,
'args': (worker_id_counter, worker_policy, state_manager, args, download_manager, running_processes, process_lock)
})
worker_id_counter += 1
with concurrent.futures.ThreadPoolExecutor(max_workers=len(worker_specs)) as executor:
futures = [executor.submit(spec['func'], *spec['args']) for spec in worker_specs]
shutdown_event.wait()
logger.info("Shutdown signal received, waiting for queue download workers to finish...")
concurrent.futures.wait(futures)
@@ -949,7 +1156,7 @@ def main_stress_policy(args):
)
logger.info(f"Requeued {requeued_auth} failed authentication tasks and {requeued_dl} failed download tasks.")
sp_utils.display_effective_policy(policy, policy_name, sources=[])
sp_utils.display_effective_policy(policy, policy_name, args, sources=[])
if args.dry_run: return 0
# Start both auth and download workers
@@ -1127,6 +1334,7 @@ def main_stress_policy(args):
sp_utils.display_effective_policy(
policy,
policy_name,
args,
sources=sources,
profile_names=None, # Profile grouping is removed
original_workers_setting=original_workers_setting
@@ -1254,12 +1462,22 @@ def main_stress_policy(args):
# If marking by rename is on, do that.
if settings.get('mark_processed_files'):
try:
processed_dir = settings.get('processed_files_dir')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
new_path = source.parent / f"{source.name}.{timestamp}.processed"
new_filename = f"{source.name}.{timestamp}.processed"
if processed_dir:
dest_path = Path(processed_dir) / new_filename
dest_path.parent.mkdir(parents=True, exist_ok=True)
shutil.move(str(source), str(dest_path))
logger.info(f"Marked '{source.name}' as processed by moving to '{dest_path}'")
else:
# Fallback to old behavior: rename in place
new_path = source.parent / new_filename
source.rename(new_path)
logger.info(f"Marked '{source.name}' as processed by renaming to '{new_path.name}'")
except (IOError, OSError) as e:
logger.error(f"Failed to rename processed file '{source.name}': {e}")
logger.error(f"Failed to mark processed file '{source.name}': {e}")
# When using profile-aware mode, the file processing (including marking as
# processed) is handled inside process_profile_task.
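With processed_files_dir set, processed source files are moved into a dedicated directory; without it, the previous rename-in-place behaviour is kept as a fallback. A self-contained sketch of that logic as a helper (the function name is hypothetical):

import shutil
from datetime import datetime
from pathlib import Path
from typing import Optional

def mark_processed(source: Path, processed_dir: Optional[str] = None) -> Path:
    """Mark a source file as processed: move it into processed_dir if configured,
    otherwise rename it in place with a timestamped '.processed' suffix."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    new_filename = f"{source.name}.{timestamp}.processed"
    if processed_dir:
        dest = Path(processed_dir) / new_filename
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(source), str(dest))
        return dest
    new_path = source.parent / new_filename
    source.rename(new_path)
    return new_path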