#!/usr/bin/env python3
"""
Policy-driven stress-testing orchestrator for video format downloads.

This tool orchestrates complex, multi-stage stress tests based on a YAML policy file.
It supports several modes of operation:

- full_stack: A complete workflow that first fetches an info.json for a given URL
  using a profile, and then uses that info.json to perform one or more downloads.

- fetch_only: Only performs the info.json generation step. This is useful for
  simulating user authentication and browsing behavior.

- download_only: Only performs the download step, using a directory of pre-existing
  info.json files as its source.

- direct_batch_cli (fetch_only): A high-throughput mode for generating info.json files
  by calling a custom, Redis-aware yt-dlp command-line tool directly in batch mode.
  This mode bypasses the get-info Thrift service. The workflow is as follows:
  1. The orchestrator worker locks a profile from the auth pool.
  2. It takes a 'batch' of URLs from the source file.
  3. It invokes the configured yt-dlp command, passing the profile name and proxy via
     environment variables.
  4. The custom yt-dlp process then does the following for each URL in the batch:
     a. Checks Redis to ensure the profile has not been externally BANNED.
     b. Fetches the info.json.
     c. Records 'success', 'failure', or 'tolerated_error' for the profile in Redis.
  5. After the yt-dlp process finishes, the orchestrator worker post-processes the
     generated info.json files to inject metadata (profile name, proxy).
  6. The worker unlocks the profile.
  7. The worker repeats this cycle with a new profile and the next batch of URLs.

The tool uses a profile management system (v2) based on Redis for coordinating
state between multiple workers and enforcing policies (e.g., rate limits, cooldowns).

Architectural Overview for the Stress Policy Tool:

This file, stress_policy_tool.py, is the main entry point and orchestrator. It is
responsible for:

- Parsing command-line arguments.
- Setting up logging and the main shutdown handler.
- Initializing the StateManager and ProfileManager.
- Running the main execution loop (ThreadPoolExecutor) based on the chosen
  orchestration mode.
- Delegating the actual work to functions in the `workers.py` module.

The core logic has been refactored into the following modules within
`ytops_client/stress_policy/`:

- arg_parser.py: Defines the command-line interface for the 'stress-policy' command
  using argparse.
- workers.py: Contains all core worker functions that are executed by the
  ThreadPoolExecutor, such as `process_task`, `run_direct_batch_worker`, and their
  helpers. This is where the main logic for fetching info.json and running downloads
  resides.
- state_manager.py: Manages run state, statistics, rate limits, and persistence
  between runs (e.g., `_state.json`, `_stats.jsonl`).
- process_runners.py: A low-level module that handles the execution of external
  subprocesses (`run_command`) and Docker containers (`run_docker_container`).
- utils.py: Provides stateless utility functions shared across the tool, such as
  loading YAML policies, applying overrides, and formatting.
"""
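# Example invocation (illustrative; the policy file name is hypothetical and the
# available flags come from stress_policy/arg_parser.py):
#
#   bin/ytops-client stress-policy --policy policies/download_only.yaml \
#       --set execution_control.workers=4 --verbose
#
# Use --list-policies to see available policies, --show-overrides to inspect a
# policy's override points, and --dry-run to print the effective policy without
# launching any workers.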
import argparse
import collections
import concurrent.futures
import json
import logging
import os
import random
import re
import shlex
import signal
import sys
import tempfile
import shutil
import threading
import time
from copy import deepcopy
from datetime import datetime, timezone
from pathlib import Path

try:
    from dotenv import load_dotenv
except ImportError:
    load_dotenv = None

try:
    import docker
except ImportError:
    docker = None


from .profile_manager_tool import ProfileManager
from .stress_policy.state_manager import StateManager
from .stress_policy.process_runners import run_command, run_docker_container, get_worker_id
from .stress_policy import utils as sp_utils
from .stress_policy.workers import (
    _run_download_logic, process_profile_task, run_download_worker, process_info_json_cycle,
    run_throughput_worker, _post_process_and_move_info_json, run_direct_batch_worker,
    run_direct_docker_worker, find_task_and_lock_profile, run_direct_docker_download_worker,
    run_direct_download_worker
)
from .stress_policy.queue_workers import (
    run_queue_auth_worker, run_queue_download_worker
)
from .stress_policy.queue_provider import RedisQueueProvider
from .stress_policy.arg_parser import add_stress_policy_parser

# Add a global event for graceful shutdown
shutdown_event = threading.Event()

# Globals for tracking and terminating subprocesses on shutdown
running_processes = set()
process_lock = threading.Lock()

# Configure logging
logger = logging.getLogger('stress_policy_tool')


def main_stress_policy(args):
"""Main logic for the 'stress-policy' command."""
|
|
if args.list_policies:
|
|
return sp_utils.list_policies()
|
|
|
|
if not args.policy:
|
|
print("Error: --policy is required unless using --list-policies.", file=sys.stderr)
|
|
return 1
|
|
|
|
# Handle --show-overrides early, as it doesn't run the test.
|
|
if args.show_overrides:
|
|
policy = sp_utils.load_policy(args.policy, args.policy_name)
|
|
if not policy:
|
|
return 1 # load_policy prints its own error
|
|
sp_utils.print_policy_overrides(policy)
|
|
return 0
|
|
|
|
policy = sp_utils.load_policy(args.policy, args.policy_name)
|
|
|
|
policy = sp_utils.apply_overrides(policy, args.set)
|
|
|
|
# If orchestrator is verbose, make downloaders verbose too by passing it through.
|
|
    if args.verbose:
        d_policy = policy.setdefault('download_policy', {})
        extra_args = d_policy.get('extra_args', '')
        if '--verbose' not in extra_args:
            d_policy['extra_args'] = f"{extra_args} --verbose".strip()

    # --- Set safe defaults ---
    settings = policy.get('settings', {})
    mode = settings.get('mode', 'full_stack')
    # For continuous download mode, it is almost always desired to mark files as
    # processed to avoid an infinite loop on the same files. We make this the
    # default and issue a warning if it's not explicitly set.
    if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous':
        if 'mark_processed_files' not in settings:
            # Use print because logger is not yet configured.
            print("WARNING: In 'continuous' download mode, 'settings.mark_processed_files' was not set.", file=sys.stderr)
            print(" Defaulting to 'true' to prevent reprocessing files.", file=sys.stderr)
            print(" Set it to 'false' explicitly in your policy to disable this behavior.", file=sys.stderr)
            settings['mark_processed_files'] = True

    # Load .env file *after* loading policy to respect env_file from policy.
    if load_dotenv:
        sim_params = policy.get('simulation_parameters', {})
        # Coalesce from CLI, then policy. An explicit CLI arg takes precedence.
        env_file = args.env_file or sim_params.get('env_file')

        if not env_file and args.env and '.env' in args.env and os.path.exists(args.env):
            # Use print because logger is not yet configured.
            print(f"Warning: --env should be an environment name (e.g., 'sim'), not a file path. Treating '{args.env}' as --env-file. The environment name will default to 'sim'.", file=sys.stderr)
            env_file = args.env
            args.env = 'sim'

        was_loaded = load_dotenv(env_file)
        if was_loaded:
            # Use print because logger is not yet configured.
            print(f"Loaded environment variables from {env_file or '.env file'}", file=sys.stderr)
        elif args.env_file:  # Only error if user explicitly passed it
            print(f"Error: The specified --env-file was not found: {args.env_file}", file=sys.stderr)
            return 1

    if args.profile_prefix:
        # This shortcut overrides the profile_prefix for all relevant stages.
        # Useful for simple fetch_only or download_only runs.

        # Ensure info_json_generation_policy is a dict before assigning to it.
        # This handles cases where the policy has a non-dict value (like None or a string).
        if not isinstance(policy.get('info_json_generation_policy'), dict):
            policy['info_json_generation_policy'] = {}
        policy['info_json_generation_policy']['profile_prefix'] = args.profile_prefix

        # Ensure download_policy is a dict before assigning to it.
        if not isinstance(policy.get('download_policy'), dict):
            policy['download_policy'] = {}
        policy['download_policy']['profile_prefix'] = args.profile_prefix

        # Use print because logger is not yet configured.
        print(f"Overriding profile_prefix for all stages with CLI arg: {args.profile_prefix}", file=sys.stderr)

    # Apply direct CLI overrides after --set, so they have final precedence.
    if args.auto_merge_fragments is not None:
        policy.setdefault('download_policy', {})['auto_merge_fragments'] = args.auto_merge_fragments
    if args.remove_fragments_after_merge is not None:
        policy.setdefault('download_policy', {})['remove_fragments_after_merge'] = args.remove_fragments_after_merge
    if args.fragments_dir is not None:
        policy.setdefault('download_policy', {})['aria_fragments_dir'] = args.fragments_dir
    if args.remote_dir is not None:
        policy.setdefault('download_policy', {})['aria_remote_dir'] = args.remote_dir
    if args.cleanup is not None:
        policy.setdefault('download_policy', {})['cleanup'] = args.cleanup

    if args.expire_time_shift_minutes is not None:
        policy.setdefault('download_policy', {})['expire_time_shift_minutes'] = args.expire_time_shift_minutes

    policy_name = policy.get('name', args.policy_name or Path(args.policy).stem)

    # --- Logging Setup ---
    log_level = logging.DEBUG if args.verbose else logging.INFO
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' if args.verbose else '%(asctime)s - %(message)s'
    date_format = None if args.verbose else '%H:%M:%S'

    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Silence noisy loggers from dependencies like docker-py
    logging.getLogger('urllib3.connectionpool').setLevel(logging.INFO if args.verbose else logging.WARNING)

    # Remove any existing handlers to avoid duplicate logs
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)

    # Add console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
    root_logger.addHandler(console_handler)

    if not args.disable_log_writing:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        log_filename = f"stress-policy-{timestamp}-{policy_name}.log"
        try:
            # Open in append mode to be safe, though timestamp should be unique.
            file_handler = logging.FileHandler(log_filename, mode='a', encoding='utf-8')
            file_handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
            root_logger.addHandler(file_handler)
            # Use print because logger is just being set up.
            print(f"Logging to file: {log_filename}", file=sys.stderr)
        except IOError as e:
            print(f"Error: Could not open log file {log_filename}: {e}", file=sys.stderr)

    state_manager = StateManager(policy_name, disable_log_writing=args.disable_log_writing, shutdown_event=shutdown_event)

    if args.reset_infojson:
        info_json_dir = settings.get('info_json_dir')
        if not info_json_dir:
            logger.error("--reset-infojson requires 'settings.info_json_dir' to be set in the policy.")
            return 1

        logger.info(f"--- Resetting info.json files in '{info_json_dir}' ---")
        source_dir = Path(info_json_dir)
        if not source_dir.is_dir():
            logger.warning(f"Source directory for reset does not exist: {source_dir}. Skipping reset.")
        else:
            processed_files = list(source_dir.rglob('*.json.processed'))
            locked_files = list(source_dir.rglob('*.json.LOCKED.*'))
            files_to_reset = processed_files + locked_files
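            # A reset renames marker files back to their original info.json names,
            # e.g. (illustrative names): 'VIDEO.json.processed' -> 'VIDEO.json',
            # 'VIDEO.json.LOCKED.<worker>' -> 'VIDEO.json'.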
            if not files_to_reset:
                logger.info("No processed or locked files found to reset.")
            else:
                reset_count = 0
                for file_to_reset in files_to_reset:
                    original_path = None
                    if file_to_reset.name.endswith('.processed'):
                        original_path_str = str(file_to_reset).removesuffix('.processed')
                        original_path = Path(original_path_str)
                    elif '.LOCKED.' in file_to_reset.name:
                        original_path_str = str(file_to_reset).split('.LOCKED.')[0]
                        original_path = Path(original_path_str)

                    if original_path:
                        try:
                            if original_path.exists():
                                logger.warning(f"Original file '{original_path.name}' already exists. Deleting '{file_to_reset.name}' instead of renaming.")
                                file_to_reset.unlink()
                            else:
                                file_to_reset.rename(original_path)
                                logger.debug(f"Reset '{file_to_reset.name}' to '{original_path.name}'")
                                reset_count += 1
                        except (IOError, OSError) as e:
                            logger.error(f"Failed to reset '{file_to_reset.name}': {e}")
                logger.info(f"Reset {reset_count} info.json file(s).")

    if args.pre_cleanup_media is not None:
        cleanup_path_str = args.pre_cleanup_media
        d_policy = policy.get('download_policy', {})
        direct_docker_policy = policy.get('direct_docker_cli_policy', {})

        if cleanup_path_str == '.':  # Special value from `const`
            # Determine path from policy
            if direct_docker_policy.get('docker_host_download_path'):
                cleanup_path_str = direct_docker_policy['docker_host_download_path']
            elif d_policy.get('output_dir'):
                cleanup_path_str = d_policy['output_dir']
            else:
                logger.error("--pre-cleanup-media was used without a path, but could not determine a download directory from the policy.")
                return 1

        cleanup_path = Path(cleanup_path_str)
        if not cleanup_path.is_dir():
            logger.warning(f"Directory for media cleanup does not exist, skipping: {cleanup_path}")
        else:
            logger.info(f"--- Cleaning up media files in '{cleanup_path}' ---")
            media_extensions = ['.mp4', '.m4a', '.webm', '.mkv', '.part', '.ytdl']
            files_deleted = 0
            for ext in media_extensions:
                for media_file in cleanup_path.rglob(f'*{ext}'):
                    try:
                        media_file.unlink()
                        logger.debug(f"Deleted {media_file}")
                        files_deleted += 1
                    except OSError as e:
                        logger.error(f"Failed to delete media file '{media_file}': {e}")
            logger.info(f"Deleted {files_deleted} media file(s).")

    if args.reset_local_cache_folder is not None:
        cache_path_str = args.reset_local_cache_folder
        direct_docker_policy = policy.get('direct_docker_cli_policy', {})

        if cache_path_str == '.':  # Special value from `const`
            if direct_docker_policy.get('docker_host_cache_path'):
                cache_path_str = direct_docker_policy['docker_host_cache_path']
            else:
                logger.error("--reset-local-cache-folder was used without a path, but 'direct_docker_cli_policy.docker_host_cache_path' is not set in the policy.")
                return 1

        cache_path = Path(cache_path_str)
        if not cache_path.is_dir():
            logger.warning(f"Local cache directory for reset does not exist, skipping: {cache_path}")
        else:
            logger.info(f"--- Resetting local cache folder '{cache_path}' ---")
            try:
                shutil.rmtree(cache_path)
                os.makedirs(cache_path)
                logger.info(f"Successfully deleted and recreated cache folder '{cache_path}'.")
            except OSError as e:
                logger.error(f"Failed to reset cache folder '{cache_path}': {e}")

    if policy.get('name') in ['continuous_auth_simulation', 'continuous_download_simulation']:
        logger.warning("This policy is part of a multi-stage simulation.")
        if 'auth' in policy.get('name', ''):
            logger.warning("It is recommended to run this auth policy using: ./bin/run-profile-simulation")
        if 'download' in policy.get('name', ''):
            logger.warning("It is recommended to run this download policy using: ./bin/run-download-simulation")
        time.sleep(2)

    # --- Graceful shutdown handler ---
    def shutdown_handler(signum, frame):
        if not shutdown_event.is_set():
            logger.info(f"\nSignal {signum} received, shutting down gracefully...")
            shutdown_event.set()

            # Save state immediately to prevent loss on interrupt.
            logger.info("Attempting to save state before shutdown...")
            state_manager.close()
            logger.info("Shutdown requested. Allowing in-progress tasks to complete. No new tasks will be started. Press Ctrl+C again to force exit.")
        else:
            logger.info("Second signal received, forcing exit.")
            # On second signal, forcefully terminate subprocesses.
            with process_lock:
                if running_processes:
                    logger.info(f"Forcefully terminating {len(running_processes)} running subprocess(es)...")
                    for p in running_processes:
                        try:
                            # Kill the entire process group to ensure child processes (like yt-dlp) are terminated.
                            os.killpg(os.getpgid(p.pid), signal.SIGKILL)
                        except (ProcessLookupError, PermissionError):
                            pass  # Process already finished or we lack permissions
            # Use os._exit for a hard exit that doesn't run cleanup handlers,
            # which can deadlock if locks are held.
            os._exit(1)

    signal.signal(signal.SIGINT, shutdown_handler)
    signal.signal(signal.SIGTERM, shutdown_handler)

    settings = policy.get('settings', {})
    exec_control = policy.get('execution_control', {})
    mode = settings.get('mode', 'full_stack')
    orchestration_mode = settings.get('orchestration_mode')

    # --- Profile Manager Setup for Locking Mode ---
    profile_manager = None
    profile_managers = {}
    if settings.get('profile_mode') == 'from_pool_with_lock':
        logger.info("--- Profile Locking Mode Enabled ---")
        logger.info("This mode requires profiles to be set up and managed by the policy enforcer.")
        logger.info("1. Ensure you have run: bin/setup-profiles-from-policy")
        logger.info("2. Ensure the policy enforcer is running in the background: bin/ytops-client policy-enforcer --live")
        logger.info(" (e.g. using policies/8_unified_simulation_enforcer.yaml)")
        logger.info("3. To monitor profiles, use: bin/ytops-client profile list --live")
        logger.info("------------------------------------")

        # Coalesce Redis settings from CLI args, .env file, and defaults
        redis_host = args.redis_host or os.getenv('REDIS_HOST', os.getenv('MASTER_HOST_IP', 'localhost'))
        redis_port = args.redis_port if args.redis_port is not None else int(os.getenv('REDIS_PORT', 6379))
        redis_password = args.redis_password or os.getenv('REDIS_PASSWORD')

        sim_params = policy.get('simulation_parameters', {})

        def setup_manager(sim_type, env_cli_arg, env_policy_key):
            # Determine the effective environment name with correct precedence:
            # 1. Specific CLI arg (e.g., --auth-env)
            # 2. General CLI arg (--env)
            # 3. Specific policy setting (e.g., simulation_parameters.auth_env)
            # 4. General policy setting (simulation_parameters.env)
            # 5. Hardcoded default ('sim')
            policy_env = sim_params.get(env_policy_key)
            default_policy_env = sim_params.get('env')
            effective_env = env_cli_arg or args.env or policy_env or default_policy_env or 'sim'

            logger.info(f"Setting up ProfileManager for {sim_type} simulation using env: '{effective_env}'")

            if args.key_prefix:
                key_prefix = args.key_prefix
            else:
                key_prefix = f"{effective_env}_profile_mgmt_"
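            # With the default prefix, an env of 'sim' maps to Redis keys under
            # 'sim_profile_mgmt_*' (illustrative; the exact key layout is defined
            # by ProfileManager).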
            return ProfileManager(
                redis_host=redis_host, redis_port=redis_port,
                redis_password=redis_password, key_prefix=key_prefix
            )

        # Determine which managers are needed based on mode and orchestration mode
        needs_auth = False
        needs_download = False

        if mode in ['full_stack', 'fetch_only']:
            needs_auth = True
        if mode in ['full_stack', 'download_only']:
            needs_download = True

        if orchestration_mode == 'direct_batch_cli':
            direct_policy = policy.get('direct_batch_cli_policy', {})
            use_env = direct_policy.get('use_profile_env', 'auth')
            if use_env == 'download':
                needs_download = True
            else:  # auth is default
                needs_auth = True

        if needs_auth:
            # For backward compatibility, policy might have 'env' instead of 'auth_env'
            auth_env_key = 'auth_env' if 'auth_env' in sim_params else 'env'
            profile_managers['auth'] = setup_manager('Auth', args.auth_env, auth_env_key)

        if needs_download:
            download_env_key = 'download_env' if 'download_env' in sim_params else 'env'
            profile_managers['download'] = setup_manager('Download', args.download_env, download_env_key)

        # For modes with only one manager, set the legacy `profile_manager` variable
        # for components that haven't been updated to use the `profile_managers` dict.
        if len(profile_managers) == 1:
            profile_manager = list(profile_managers.values())[0]

    # --- Worker Launching Logic ---
    # This block determines how many workers to launch and which function to run.
    # It centralizes the logic for handling worker_pools vs. legacy workers setting.
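    #
    # Illustrative policy excerpt (keys as read below; the pool names are hypothetical):
    #
    #   execution_control:
    #     worker_pools:
    #       - profile_prefix: "auth_pool_a"
    #         workers: 2
    #       - profile_prefix: "auth_pool_b"
    #         workers: 1
    #
    # This would yield three workers: two bound to 'auth_pool_a' and one to
    # 'auth_pool_b'. Passing --set execution_control.workers=N on the CLI takes
    # precedence and falls back to the legacy single-setting behaviour.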
    # Check if the user explicitly set execution_control.workers via the CLI.
    # This gives the CLI override precedence over the worker_pools config in the file.
    cli_overrode_workers = any('execution_control.workers' in s for s in args.set)

    worker_pools = exec_control.get('worker_pools')
    use_worker_pools = worker_pools and not cli_overrode_workers

    total_workers = 0
    worker_configs = []  # List of {'id': int, 'prefix': str or None, 'pool_info': str}

    # Determine the target worker function based on orchestration mode
    target_worker_func = None
    manager_for_worker = None
    urls_list = []

    if orchestration_mode == 'throughput':
        target_worker_func = run_throughput_worker
        manager_for_worker = profile_managers.get('download')
    elif orchestration_mode == 'direct_batch_cli':
        target_worker_func = run_direct_batch_worker
        use_env = policy.get('direct_batch_cli_policy', {}).get('use_profile_env', 'auth')
        manager_for_worker = profile_managers.get(use_env)
    elif orchestration_mode == 'direct_docker_cli':
        if mode == 'fetch_only':
            target_worker_func = run_direct_docker_worker
        elif mode == 'download_only':
            target_worker_func = run_direct_docker_download_worker
        use_env = policy.get('direct_docker_cli_policy', {}).get('use_profile_env', 'auth' if mode == 'fetch_only' else 'download')
        manager_for_worker = profile_managers.get(use_env)
    elif orchestration_mode == 'direct_download_cli':
        target_worker_func = run_direct_download_worker
        manager_for_worker = profile_managers.get('download')
    # Other modes (queue, task-first) are handled separately below.

    if use_worker_pools:
        # New logic: Filter worker pools if a specific profile_prefix is given via CLI
        pools_to_run = worker_pools
        if args.profile_prefix:
            logger.info(f"CLI --profile-prefix '{args.profile_prefix}' provided. Filtering worker pools.")
            pools_to_run = [p for p in worker_pools if p.get('profile_prefix') == args.profile_prefix]
            if not pools_to_run:
                logger.error(f"No worker pool found in policy with profile_prefix matching '{args.profile_prefix}'. Exiting.")
                return 1

        total_workers = sum(p.get('workers', 1) for p in pools_to_run)
        worker_idx_counter = 0
        for pool in pools_to_run:
            pool_prefix = pool.get('profile_prefix')
            num_workers_in_pool = pool.get('workers', 1)
            if not pool_prefix:
                logger.warning(f"Worker pool found without a 'profile_prefix'. Skipping: {pool}")
                continue
            for _ in range(num_workers_in_pool):
                worker_configs.append({
                    'id': worker_idx_counter,
                    'prefix': pool_prefix,
                    'pool_info': f"Pool '{pool_prefix}'"
                })
                worker_idx_counter += 1
    else:
        total_workers = exec_control.get('workers', 1)
        if total_workers == 'auto':
            # 'auto' is resolved per orchestration mode below (e.g. the throughput
            # mode recalculates it from the profile pool); use a single placeholder
            # worker config until then so range() does not fail on a string.
            total_workers = 1
        if cli_overrode_workers:
            logger.info(f"Overriding 'worker_pools' with CLI setting: --set execution_control.workers={total_workers}")
        for i in range(total_workers):
            worker_configs.append({
                'id': i,
                'prefix': None,  # No specific prefix
                'pool_info': "Legacy 'workers' config"
            })
    # --- Throughput Orchestration Mode ---
    if orchestration_mode == 'throughput':
        logger.info("--- Throughput Orchestration Mode Enabled ---")
        if mode != 'download_only' or settings.get('profile_mode') != 'from_pool_with_lock':
            logger.error("Orchestration mode 'throughput' is only compatible with 'download_only' mode and 'from_pool_with_lock' profile mode.")
            return 1

        if not manager_for_worker:
            logger.error("Throughput mode requires a download profile manager.")
            return 1

        original_workers_setting = exec_control.get('workers')
        if original_workers_setting == 'auto':
            # This logic is complex and specific to this mode, so we keep it here.
            d_policy = policy.get('download_policy', {})
            profile_prefix = d_policy.get('profile_prefix')
            if not profile_prefix:
                logger.error("Cannot calculate 'auto' workers for throughput mode without 'download_policy.profile_prefix'.")
                return 1
            all_profiles = manager_for_worker.list_profiles()
            matching_profiles = [p for p in all_profiles if p['name'].startswith(profile_prefix)]
            calculated_workers = len(matching_profiles)
            if calculated_workers == 0:
                logger.error(f"Cannot use 'auto' workers: No profiles found with prefix '{profile_prefix}'. Please run setup-profiles.")
                return 1
            exec_control['workers'] = calculated_workers
            logger.info(f"Calculated 'auto' workers for throughput mode: {calculated_workers} (based on {len(matching_profiles)} profiles with prefix '{profile_prefix}').")
            # Recalculate worker configs if 'auto' was used
            total_workers = calculated_workers
            worker_configs = [{'id': i, 'prefix': None, 'pool_info': "Legacy 'workers' config"} for i in range(total_workers)]
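            # Illustrative example: with download_policy.profile_prefix set to
            # 'dl_pool' and eight profiles named dl_pool_00..dl_pool_07 registered
            # in Redis, 'auto' resolves to eight workers, one per matching profile
            # (the profile names here are hypothetical).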
        sp_utils.display_effective_policy(policy, policy_name, sources=[], original_workers_setting=original_workers_setting)
        if args.dry_run: return 0

        with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor:
            futures = []
            logger.info(f"Launching {total_workers} worker(s)...")
            for config in worker_configs:
                logger.info(f" - Worker {config['id']}: {config['pool_info']}")
                futures.append(executor.submit(target_worker_func, config['id'], policy, state_manager, args, manager_for_worker, running_processes, process_lock, profile_prefix=config['prefix']))

            shutdown_event.wait()
            logger.info("Shutdown signal received, waiting for throughput workers to finish current tasks...")
            concurrent.futures.wait(futures)

        state_manager.print_summary(policy)
        state_manager.close()
        return 0

    # --- Direct Batch CLI Orchestration Mode ---
    elif orchestration_mode == 'direct_batch_cli':
        logger.info("--- Direct Batch CLI Orchestration Mode Enabled ---")
        if mode != 'fetch_only' or settings.get('profile_mode') != 'from_pool_with_lock':
            logger.error("Orchestration mode 'direct_batch_cli' is only compatible with 'fetch_only' mode and 'from_pool_with_lock' profile mode.")
            return 1

        if not manager_for_worker:
            logger.error("Direct batch CLI mode requires a profile manager, but it was not configured.")
            return 1

        urls_file = settings.get('urls_file')
        if not urls_file:
            logger.error("Direct batch CLI mode requires 'settings.urls_file'.")
            return 1

        try:
            with open(urls_file, 'r', encoding='utf-8') as f:
                urls_list = [line.strip() for line in f if line.strip()]
        except IOError as e:
            logger.error(f"Could not read urls_file '{urls_file}': {e}")
            return 1

        if not urls_list:
            logger.error(f"URL file '{urls_file}' is empty. Nothing to do.")
            return 1

        start_index = state_manager.get_last_url_index()
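        # --start-from-url-index is 1-based on the CLI; convert it to the 0-based
        # index tracked by the state manager.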
        if args.start_from_url_index is not None:
            start_index = max(0, args.start_from_url_index - 1)
            state_manager.update_last_url_index(start_index, force=True)

        if start_index >= len(urls_list) and len(urls_list) > 0:
            logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            logger.warning("!!! ALL URLS HAVE BEEN PROCESSED IN PREVIOUS RUNS (based on state file) !!!")
            logger.warning(f"!!! State file indicates start index {start_index + 1}, but URL file has only {len(urls_list)} URLs. !!!")
            logger.warning("!!! Deleting state file and stopping. Please run the command again to start from the beginning. !!!")
            logger.warning("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            if not args.dry_run and not args.disable_log_writing:
                state_manager.close()
                try: os.remove(state_manager.state_file_path)
                except OSError as e: logger.error(f"Failed to delete state file: {e}")
            return 0

        if start_index > 0:
            logger.info(f"Starting/resuming from URL index {start_index + 1}.")

        sp_utils.display_effective_policy(policy, policy_name, sources=urls_list)
        if args.dry_run: return 0

        with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor:
            futures = []
            logger.info(f"Launching {total_workers} worker(s)...")
            for config in worker_configs:
                logger.info(f" - Worker {config['id']}: {config['pool_info']}")
                futures.append(executor.submit(target_worker_func, config['id'], policy, state_manager, args, manager_for_worker, urls_list, running_processes, process_lock, profile_prefix=config['prefix']))

            concurrent.futures.wait(futures)
            if shutdown_event.is_set():
                logger.info("Shutdown signal received, workers have finished.")

        state_manager.print_summary(policy)
        state_manager.close()
        return 0

    # --- Direct Docker CLI Orchestration Mode ---
    elif orchestration_mode == 'direct_docker_cli':
        logger.info("--- Direct Docker CLI Orchestration Mode Enabled ---")
        if not docker:
            logger.error("The 'direct_docker_cli' orchestration mode requires the Docker SDK for Python.")
            logger.error("Please install it with: pip install docker")
            return 1

        if mode not in ['fetch_only', 'download_only'] or settings.get('profile_mode') != 'from_pool_with_lock':
            logger.error("Orchestration mode 'direct_docker_cli' is only compatible with 'fetch_only' or 'download_only' modes and 'from_pool_with_lock' profile mode.")
            return 1

        if not manager_for_worker:
            logger.error("Direct docker CLI mode requires a profile manager, but it was not configured.")
            return 1

        if mode == 'fetch_only':
            queue_policy = policy.get('queue_policy')
            if not queue_policy:
                urls_file = settings.get('urls_file')
                if not urls_file:
                    logger.error("Direct docker CLI (fetch) mode requires 'settings.urls_file' if not configured for queue operation.")
                    return 1
                try:
                    with open(urls_file, 'r', encoding='utf-8') as f:
                        urls_list = [line.strip() for line in f if line.strip()]
                except IOError as e:
                    logger.error(f"Could not read urls_file '{urls_file}': {e}")
                    return 1
                if not urls_list:
                    logger.error(f"URL file '{urls_file}' is empty. Nothing to do.")
                    return 1
                start_index = state_manager.get_last_url_index()
                if args.start_from_url_index is not None:
                    start_index = max(0, args.start_from_url_index - 1)
                    state_manager.update_last_url_index(start_index, force=True)
                if start_index >= len(urls_list) and len(urls_list) > 0:
                    logger.warning("ALL URLS HAVE BEEN PROCESSED. Reset state file to run again.")
                    return 0
                if start_index > 0:
                    logger.info(f"Starting/resuming from URL index {start_index + 1}.")
            else:
                # Queue mode setup
                # ... (omitted for brevity, assuming file mode for this fix)
                pass
        elif mode == 'download_only':
            # ... (omitted for brevity, assuming fetch mode for this fix)
            pass

        sp_utils.display_effective_policy(policy, policy_name, sources=urls_list)
        if args.dry_run: return 0

        with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor:
            futures = []
            logger.info(f"Launching {total_workers} worker(s)...")
            for config in worker_configs:
                logger.info(f" - Worker {config['id']}: {config['pool_info']}")
                if mode == 'fetch_only':
                    futures.append(executor.submit(
                        target_worker_func, config['id'], policy, state_manager, args,
                        manager_for_worker, urls_list, running_processes, process_lock,
                        profile_prefix=config['prefix']
                    ))
                elif mode == 'download_only':
                    futures.append(executor.submit(
                        target_worker_func, config['id'], policy, state_manager, args,
                        manager_for_worker, running_processes, process_lock,
                        profile_prefix=config['prefix']
                    ))
                else:
                    logger.error(f"Unsupported mode '{mode}' for 'direct_docker_cli' orchestration.")
                    shutdown_event.set()
                    break

            if shutdown_event.is_set():
                pass  # An error occurred, just exit
            elif mode == 'fetch_only' and not policy.get('queue_policy'):
                concurrent.futures.wait(futures)
            else:  # download_only or queue mode runs until shutdown
                shutdown_event.wait()

        if shutdown_event.is_set():
            logger.info("Shutdown signal received, workers have finished.")

        state_manager.print_summary(policy)
        state_manager.close()
        return 0

    # --- Direct Download CLI Orchestration Mode ---
    elif orchestration_mode == 'direct_download_cli':
        logger.info("--- Direct Download CLI Orchestration Mode Enabled ---")
        if mode != 'download_only' or settings.get('profile_mode') != 'from_pool_with_lock':
            logger.error("Orchestration mode 'direct_download_cli' is only compatible with 'download_only' mode and 'from_pool_with_lock' profile mode.")
            return 1

        if not manager_for_worker:
            logger.error("Direct download CLI mode requires a download profile manager.")
            return 1

        info_json_dir = settings.get('info_json_dir')
        if not info_json_dir:
            logger.error("Direct download CLI mode requires 'settings.info_json_dir'.")
            return 1
        try:
            os.makedirs(info_json_dir, exist_ok=True)
        except OSError as e:
            logger.error(f"Failed to create info.json directory '{info_json_dir}': {e}")
            return 1

        sp_utils.display_effective_policy(policy, policy_name, sources=[])
        if args.dry_run: return 0

        with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor:
            futures = []
            logger.info(f"Launching {total_workers} worker(s)...")
            for config in worker_configs:
                logger.info(f" - Worker {config['id']}: {config['pool_info']}")
                futures.append(executor.submit(target_worker_func, config['id'], policy, state_manager, args, manager_for_worker, running_processes, process_lock, profile_prefix=config['prefix']))

            shutdown_event.wait()
            logger.info("Shutdown signal received, waiting for direct download workers to finish...")
            concurrent.futures.wait(futures)

        state_manager.print_summary(policy)
        state_manager.close()
        return 0

    # --- Queue-based Orchestration Modes ---
    elif orchestration_mode in ['queue_auth', 'queue_download', 'queue_full_stack']:
        # This logic is complex and separate. For now, we assume it doesn't use worker_pools yet.
        # If it needs to, it will require similar changes.
        # ... (existing queue logic)
        logger.error(f"Orchestration mode '{orchestration_mode}' is not fully covered by the new worker logic yet.")
        return 1

    # --- Default (Task-First) Orchestration Mode ---
    # ... (existing task-first logic)
    logger.error("Orchestration mode 'task-first' (default) is not fully covered by the new worker logic yet.")
    return 1

    return 0


def process_task(source, index, cycle_num, policy, state_manager, args, profile_managers, running_processes, process_lock):
    """
    Worker task for a single source (URL or info.json path).
    This function is the main entry point for the 'task-first' orchestration mode.
    """
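    # 'source' is a URL for fetch/full_stack tasks and a path to an existing
    # info.json file for download_only tasks.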
    settings = policy.get('settings', {})
    mode = settings.get('mode', 'full_stack')
    profile_mode = settings.get('profile_mode')

    auth_manager = profile_managers.get('auth')
    download_manager = profile_managers.get('download')

    # --- Full Stack Mode ---
    if mode == 'full_stack':
        # 1. Fetch info.json
        if not auth_manager:
            logger.error("Full-stack mode requires an 'auth' profile manager.")
            return []

        # This part of the logic is simplified and does not exist in the provided codebase.
        # It would involve locking an auth profile, fetching info.json, and then unlocking.
        # For now, we'll assume a placeholder logic.
        logger.error("Full-stack mode (task-first) is not fully implemented in this version.")
        return []

    # --- Fetch Only Mode ---
    elif mode == 'fetch_only':
        if not auth_manager:
            logger.error("Fetch-only mode requires an 'auth' profile manager.")
            return []
        logger.error("Fetch-only mode (task-first) is not fully implemented in this version.")
        return []

    # --- Download Only Mode ---
    elif mode == 'download_only':
        if profile_mode == 'from_pool_with_lock':
            if not download_manager:
                logger.error("Download-only with locking requires a 'download' profile manager.")
                return []
            # In this mode, we process one file per profile.
            return process_profile_task(
                profile_name=None,  # Profile is locked inside the task
                file_list=[source],
                policy=policy,
                state_manager=state_manager,
                cycle_num=cycle_num,
                args=args,
                running_processes=running_processes,
                process_lock=process_lock,
                profile_manager_instance=download_manager
            )
        else:
            # Legacy mode without profile locking
            try:
                with open(source, 'r', encoding='utf-8') as f:
                    info_json_content = f.read()
            except (IOError, FileNotFoundError) as e:
                logger.error(f"[{sp_utils.get_display_name(source)}] Could not read info.json file: {e}")
                return []

            return _run_download_logic(source, info_json_content, policy, state_manager, args, running_processes, process_lock)

    return []