yt-dlp-dags/ytops_client/stress_policy_tool.py

#!/usr/bin/env python3
"""
Policy-driven stress-testing orchestrator for video format downloads.
"""
import argparse
import collections
import collections.abc
import concurrent.futures
import json
import logging
import os
import random
import re
import shlex
import signal
import subprocess
import sys
import threading
import time
from copy import deepcopy
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse, parse_qs
try:
import yaml
except ImportError:
print("PyYAML is not installed. Please install it with: pip install PyYAML", file=sys.stderr)
sys.exit(1)
# Global event used to signal a graceful shutdown to all workers
shutdown_event = threading.Event()
# Globals for tracking and terminating subprocesses on shutdown
running_processes = set()
process_lock = threading.Lock()
# Globals for assigning a stable ID to each worker thread
worker_id_map = {}
worker_id_counter = 0
worker_id_lock = threading.Lock()
# Configure logging
logger = logging.getLogger('stress_policy_tool')
def get_worker_id():
"""Assigns a stable, sequential ID to each worker thread."""
global worker_id_counter
thread_id = threading.get_ident()
with worker_id_lock:
if thread_id not in worker_id_map:
worker_id_map[thread_id] = worker_id_counter
worker_id_counter += 1
return worker_id_map[thread_id]
def get_video_id(url: str) -> str:
"""Extracts a YouTube video ID from a URL."""
match = re.search(r"v=([0-9A-Za-z_-]{11})", url)
if match:
return match.group(1)
match = re.search(r"youtu\.be\/([0-9A-Za-z_-]{11})", url)
if match:
return match.group(1)
if re.fullmatch(r'[0-9A-Za-z_-]{11}', url):
return url
return "unknown_video_id"
def get_display_name(path_or_url):
"""Returns a clean name for logging, either a filename or a video ID."""
if isinstance(path_or_url, Path):
return path_or_url.name
path_str = str(path_or_url)
video_id = get_video_id(path_str)
if video_id != "unknown_video_id":
return video_id
return Path(path_str).name
def format_size(b):
"""Format size in bytes to human-readable string."""
if b is None:
return 'N/A'
if b < 1024:
return f"{b}B"
elif b < 1024**2:
return f"{b/1024:.2f}KiB"
elif b < 1024**3:
return f"{b/1024**2:.2f}MiB"
else:
return f"{b/1024**3:.2f}GiB"
def flatten_dict(d, parent_key='', sep='.'):
"""Flattens a nested dictionary."""
items = {}
for k, v in d.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, collections.abc.MutableMapping):
items.update(flatten_dict(v, new_key, sep=sep))
else:
items[new_key] = v
return items
def print_policy_overrides(policy):
"""Prints all policy values as a single-line of --set arguments."""
# We don't want to include the 'name' key in the overrides.
policy_copy = deepcopy(policy)
policy_copy.pop('name', None)
flat_policy = flatten_dict(policy_copy)
set_args = []
for key, value in sorted(flat_policy.items()):
if value is None:
value_str = 'null'
elif isinstance(value, bool):
value_str = str(value).lower()
elif isinstance(value, (list, dict)):
# Use compact JSON for lists/dicts
value_str = json.dumps(value, separators=(',', ':'))
else:
value_str = str(value)
# Use shlex.quote to handle spaces and special characters safely
set_args.append(f"--set {shlex.quote(f'{key}={value_str}')}")
print(' '.join(set_args))
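# Illustrative use of the two helpers above (the policy values here are made up):
#   flatten_dict({'settings': {'mode': 'fetch_only'}}) -> {'settings.mode': 'fetch_only'}
#   print_policy_overrides({'settings': {'mode': 'fetch_only'}}) prints: --set settings.mode=fetch_only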
def get_profile_from_filename(path, regex_pattern):
"""Extracts a profile name from a filename using a regex."""
if not regex_pattern:
return None
match = re.search(regex_pattern, path.name)
if match:
# Assume the first capturing group is the profile name
if match.groups():
return match.group(1)
return None
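# Illustrative call (the filename and regex are assumptions, not shipped defaults):
#   get_profile_from_filename(Path('20240101-VIDEOID-tv_user_1.json'), r'-([^-]+)\.json$') -> 'tv_user_1'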
class StateManager:
"""Tracks statistics, manages rate limits, and persists state across runs."""
def __init__(self, policy_name):
self.state_file_path = Path(f"{policy_name}_state.json")
self.stats_file_path = Path(f"{policy_name}_stats.jsonl")
self.lock = threading.RLock()
self.start_time = time.time()
self.events = []
self.state = {
'global_request_count': 0,
'rate_limit_trackers': {}, # e.g., {'per_ip': [ts1, ts2], 'profile_foo': [ts3, ts4]}
'profile_request_counts': {}, # for client rotation
'profile_last_refresh_time': {}, # for client rotation
'proxy_last_finish_time': {}, # for per-proxy sleep
'processed_files': [], # For continuous download_only mode
# For dynamic profile cooldown strategy
'profile_cooldown_counts': {},
'profile_cooldown_sleep_until': {},
'profile_pool_size': 0,
'profile_run_suffix': None,
'worker_profile_generations': {}
}
self.stats_file_handle = None
self._load_state()
self.print_historical_summary()
self._open_stats_log()
def _load_state(self):
if not self.state_file_path.exists():
logger.info(f"State file not found at '{self.state_file_path}', starting fresh.")
return
try:
with open(self.state_file_path, 'r', encoding='utf-8') as f:
self.state = json.load(f)
# Ensure keys exist
self.state.setdefault('global_request_count', 0)
self.state.setdefault('rate_limit_trackers', {})
self.state.setdefault('profile_request_counts', {})
self.state.setdefault('profile_last_refresh_time', {})
self.state.setdefault('proxy_last_finish_time', {})
self.state.setdefault('processed_files', [])
# For dynamic profile cooldown strategy
self.state.setdefault('profile_cooldown_counts', {})
self.state.setdefault('profile_cooldown_sleep_until', {})
self.state.setdefault('profile_pool_size', 0)
self.state.setdefault('profile_run_suffix', None)
self.state.setdefault('worker_profile_generations', {})
logger.info(f"Loaded state from {self.state_file_path}")
except (IOError, json.JSONDecodeError) as e:
logger.error(f"Could not load or parse state file {self.state_file_path}: {e}. Starting fresh.")
def _save_state(self):
with self.lock:
try:
with open(self.state_file_path, 'w', encoding='utf-8') as f:
json.dump(self.state, f, indent=2)
logger.info(f"Saved state to {self.state_file_path}")
except IOError as e:
logger.error(f"Could not save state to {self.state_file_path}: {e}")
def _open_stats_log(self):
try:
self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8')
except IOError as e:
logger.error(f"Could not open stats file {self.stats_file_path}: {e}")
def close(self):
"""Saves state and closes file handles."""
self._save_state()
if self.stats_file_handle:
self.stats_file_handle.close()
self.stats_file_handle = None
def mark_file_as_processed(self, file_path):
"""Adds a file path to the list of processed files in the state."""
with self.lock:
# Using a list and checking for existence is fine for moderate numbers of files.
# A set isn't JSON serializable.
processed = self.state.setdefault('processed_files', [])
file_str = str(file_path)
if file_str not in processed:
processed.append(file_str)
def get_processed_files(self):
"""Returns a set of file paths that have been processed."""
with self.lock:
return set(self.state.get('processed_files', []))
def print_historical_summary(self):
"""Prints a summary based on the state loaded from disk, before new events."""
with self.lock:
now = time.time()
rate_trackers = self.state.get('rate_limit_trackers', {})
total_requests = self.state.get('global_request_count', 0)
if not rate_trackers and not total_requests:
logger.info("No historical data found in state file.")
return
logger.info("\n--- Summary From Previous Runs ---")
logger.info(f"Total info.json requests (all previous runs): {total_requests}")
if rate_trackers:
for key, timestamps in sorted(rate_trackers.items()):
# Time windows in seconds
windows = {
'last 10 min': 600,
'last 60 min': 3600,
'last 6 hours': 21600,
'last 24 hours': 86400
}
rates_str_parts = []
for name, seconds in windows.items():
count = sum(1 for ts in timestamps if now - ts <= seconds)
# Calculate rate in requests per minute
rate_rpm = (count / seconds) * 60 if seconds > 0 else 0
rates_str_parts.append(f"{count} req in {name} ({rate_rpm:.2f} rpm)")
logger.info(f"Tracker '{key}': " + ", ".join(rates_str_parts))
logger.info("------------------------------------")
def log_event(self, event_data):
with self.lock:
event_data['timestamp'] = datetime.now().isoformat()
self.events.append(event_data)
if self.stats_file_handle:
self.stats_file_handle.write(json.dumps(event_data) + '\n')
self.stats_file_handle.flush()
def get_request_count(self):
with self.lock:
return self.state.get('global_request_count', 0)
def increment_request_count(self):
with self.lock:
self.state['global_request_count'] = self.state.get('global_request_count', 0) + 1
def check_cumulative_error_rate(self, max_errors, per_minutes, error_type=None):
"""
Checks if a cumulative error rate has been exceeded.
If error_type is None, checks for any failure.
Returns the number of errors found if the threshold is met, otherwise 0.
"""
with self.lock:
now = time.time()
window_seconds = per_minutes * 60
if error_type:
recent_errors = [
e for e in self.events
if e.get('error_type') == error_type and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds
]
else: # Generic failure check
recent_errors = [
e for e in self.events
if not e.get('success') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds
]
if len(recent_errors) >= max_errors:
return len(recent_errors)
return 0
def check_quality_degradation_rate(self, max_triggers, per_minutes):
"""
Checks if the quality degradation trigger rate has been exceeded.
Returns the number of triggers found if the threshold is met, otherwise 0.
"""
with self.lock:
now = time.time()
window_seconds = per_minutes * 60
recent_triggers = [
e for e in self.events
if e.get('quality_degradation_trigger') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds
]
if len(recent_triggers) >= max_triggers:
return len(recent_triggers)
return 0
def check_and_update_rate_limit(self, profile_name, policy):
"""
Checks if a request is allowed based on policy rate limits.
If allowed, updates the internal state. Returns True if allowed, False otherwise.
"""
with self.lock:
now = time.time()
gen_policy = policy.get('info_json_generation_policy', {})
rate_limits = gen_policy.get('rate_limits', {})
# Check per-IP limit
ip_limit = rate_limits.get('per_ip')
if ip_limit:
tracker_key = 'per_ip'
max_req = ip_limit.get('max_requests')
period_min = ip_limit.get('per_minutes')
if max_req and period_min:
timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
# Filter out old timestamps
timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
if len(timestamps) >= max_req:
logger.warning("Per-IP rate limit reached. Skipping task.")
return False
self.state['rate_limit_trackers'][tracker_key] = timestamps
# Check per-profile limit
profile_limit = rate_limits.get('per_profile')
if profile_limit and profile_name:
tracker_key = f"profile_{profile_name}"
max_req = profile_limit.get('max_requests')
period_min = profile_limit.get('per_minutes')
if max_req and period_min:
timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
if len(timestamps) >= max_req:
logger.warning(f"Per-profile rate limit for '{profile_name}' reached. Skipping task.")
return False
self.state['rate_limit_trackers'][tracker_key] = timestamps
# If all checks pass, record the new request timestamp for all relevant trackers
if ip_limit and ip_limit.get('max_requests'):
self.state['rate_limit_trackers'].setdefault('per_ip', []).append(now)
if profile_limit and profile_limit.get('max_requests') and profile_name:
self.state['rate_limit_trackers'].setdefault(f"profile_{profile_name}", []).append(now)
return True
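    # Illustrative policy fragment read by check_and_update_rate_limit() above
    # (key names follow the lookups in the code; the numbers are placeholders):
    #
    #   info_json_generation_policy:
    #     rate_limits:
    #       per_ip: {max_requests: 450, per_minutes: 60}
    #       per_profile: {max_requests: 60, per_minutes: 60}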
def get_client_for_request(self, profile_name, gen_policy):
"""
Determines which client to use based on the client_rotation_policy.
Returns a tuple: (client_name, request_params_dict).
"""
with self.lock:
rotation_policy = gen_policy.get('client_rotation_policy')
# If no rotation policy, use the simple 'client' key.
if not rotation_policy:
client = gen_policy.get('client')
logger.info(f"Using client '{client}' for profile '{profile_name}'.")
req_params = gen_policy.get('request_params')
return client, req_params
# --- Rotation logic ---
now = time.time()
major_client = rotation_policy.get('major_client')
refresh_client = rotation_policy.get('refresh_client')
refresh_every = rotation_policy.get('refresh_every', {})
if not refresh_client or not refresh_every:
return major_client, rotation_policy.get('major_client_params')
should_refresh = False
# Check time-based refresh
refresh_minutes = refresh_every.get('minutes')
last_refresh_time = self.state['profile_last_refresh_time'].get(profile_name, 0)
if refresh_minutes and (now - last_refresh_time) > (refresh_minutes * 60):
should_refresh = True
# Check request-count-based refresh
refresh_requests = refresh_every.get('requests')
request_count = self.state['profile_request_counts'].get(profile_name, 0)
if refresh_requests and request_count >= refresh_requests:
should_refresh = True
if should_refresh:
logger.info(f"Profile '{profile_name}' is due for a refresh. Using refresh client '{refresh_client}'.")
self.state['profile_last_refresh_time'][profile_name] = now
self.state['profile_request_counts'][profile_name] = 0 # Reset counter
return refresh_client, rotation_policy.get('refresh_client_params')
else:
# Not refreshing, so increment request count for this profile
self.state['profile_request_counts'][profile_name] = request_count + 1
return major_client, rotation_policy.get('major_client_params')
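    # Illustrative client_rotation_policy consumed by get_client_for_request()
    # (client names and numbers are placeholders):
    #
    #   client_rotation_policy:
    #     major_client: tv_downgraded
    #     major_client_params: {}           # optional request params passed through
    #     refresh_client: mweb
    #     refresh_client_params: {}
    #     refresh_every: {minutes: 30, requests: 50}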
def get_next_available_profile(self, policy):
"""
Finds or creates an available profile based on the dynamic cooldown policy.
Returns a profile name, or None if no profile is available.
"""
with self.lock:
now = time.time()
settings = policy.get('settings', {})
pm_policy = settings.get('profile_management')
if not pm_policy:
return None
prefix = pm_policy.get('prefix')
if not prefix:
logger.error("Profile management policy requires 'prefix'.")
return None
# Determine and persist the suffix for this run to ensure profile names are stable
run_suffix = self.state.get('profile_run_suffix')
if not run_suffix:
suffix_config = pm_policy.get('suffix')
if suffix_config == 'auto':
run_suffix = datetime.now().strftime('%Y%m%d%H%M')
else:
run_suffix = suffix_config or ''
self.state['profile_run_suffix'] = run_suffix
# Initialize pool size from policy if not already in state
if self.state.get('profile_pool_size', 0) == 0:
self.state['profile_pool_size'] = pm_policy.get('initial_pool_size', 1)
max_reqs = pm_policy.get('max_requests_per_profile')
sleep_mins = pm_policy.get('sleep_minutes_on_exhaustion')
# Loop until a profile is found or we decide we can't find one
while True:
# Try to find an existing, available profile
for i in range(self.state['profile_pool_size']):
profile_name = f"{prefix}_{run_suffix}_{i}" if run_suffix else f"{prefix}_{i}"
# Check if sleeping
sleep_until = self.state['profile_cooldown_sleep_until'].get(profile_name, 0)
if now < sleep_until:
continue # Still sleeping
# Check if it needs to be put to sleep
req_count = self.state['profile_cooldown_counts'].get(profile_name, 0)
if max_reqs and req_count >= max_reqs:
sleep_duration_seconds = (sleep_mins or 0) * 60
self.state['profile_cooldown_sleep_until'][profile_name] = now + sleep_duration_seconds
self.state['profile_cooldown_counts'][profile_name] = 0 # Reset count for next time
logger.info(f"Profile '{profile_name}' reached request limit ({req_count}/{max_reqs}). Putting to sleep for {sleep_mins} minutes.")
continue # Now sleeping, try next profile
# This profile is available
logger.info(f"Selected available profile '{profile_name}' (request count: {req_count}/{max_reqs if max_reqs else 'unlimited'}).")
return profile_name
# If we get here, no existing profile was available
if pm_policy.get('auto_expand_pool'):
new_profile_index = self.state['profile_pool_size']
self.state['profile_pool_size'] += 1
profile_name = f"{prefix}_{run_suffix}_{new_profile_index}" if run_suffix else f"{prefix}_{new_profile_index}"
logger.info(f"Profile pool exhausted. Expanding pool to size {self.state['profile_pool_size']}. New profile: '{profile_name}'")
return profile_name
else:
# No available profiles and pool expansion is disabled
return None
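    # Illustrative settings.profile_management block used by the dynamic cooldown
    # strategy above (the numbers are placeholders):
    #
    #   settings:
    #     profile_management:
    #       prefix: dyn_user
    #       suffix: auto                    # 'auto' -> timestamped run suffix
    #       initial_pool_size: 4
    #       auto_expand_pool: true
    #       max_requests_per_profile: 200
    #       sleep_minutes_on_exhaustion: 60
    #
    # Resulting profile names look like 'dyn_user_<suffix>_<index>'.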
def get_or_rotate_worker_profile(self, worker_id, policy):
"""
Gets the current profile for a worker, rotating to a new generation if the lifetime limit is met.
This is used by the 'per_worker_with_rotation' profile mode.
"""
with self.lock:
pm_policy = policy.get('settings', {}).get('profile_management', {})
if not pm_policy:
logger.error("Profile mode 'per_worker_with_rotation' requires 'settings.profile_management' configuration in the policy.")
return f"error_profile_{worker_id}"
prefix = pm_policy.get('prefix')
if not prefix:
logger.error("Profile management for 'per_worker_with_rotation' requires a 'prefix'.")
return f"error_profile_{worker_id}"
max_reqs = pm_policy.get('max_requests_per_profile')
generations = self.state.setdefault('worker_profile_generations', {})
# worker_id is an int, but JSON keys must be strings
worker_id_str = str(worker_id)
current_gen = generations.get(worker_id_str, 0)
profile_name = f"{prefix}_{worker_id}_{current_gen}"
if not max_reqs: # No lifetime limit defined, so never rotate.
return profile_name
req_count = self.state.get('profile_cooldown_counts', {}).get(profile_name, 0)
if req_count >= max_reqs:
logger.info(f"Profile '{profile_name}' reached lifetime request limit ({req_count}/{max_reqs}). Rotating to new generation for worker {worker_id}.")
new_gen = current_gen + 1
generations[worker_id_str] = new_gen
# The request counts for the old profile are implicitly left behind.
# The new profile will start with a count of 0.
profile_name = f"{prefix}_{worker_id}_{new_gen}"
return profile_name
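    # Illustrative naming for 'per_worker_with_rotation' (prefix taken from the epilog
    # example below): worker 1 starts on 'tv_simply_user_1_0' and, after
    # max_requests_per_profile requests, rotates to 'tv_simply_user_1_1'.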
def record_profile_request(self, profile_name):
"""Increments the request counter for a profile for the cooldown policy."""
with self.lock:
if not profile_name:
return
counts = self.state.setdefault('profile_cooldown_counts', {})
counts[profile_name] = counts.get(profile_name, 0) + 1
def record_proxy_usage(self, proxy_url):
"""Records a request timestamp for a given proxy URL for statistical purposes."""
if not proxy_url:
return
with self.lock:
now = time.time()
# Use a prefix to avoid collisions with profile names or other keys
tracker_key = f"proxy_{proxy_url}"
self.state['rate_limit_trackers'].setdefault(tracker_key, []).append(now)
def check_and_update_download_rate_limit(self, proxy_url, policy):
"""Checks download rate limits. Returns True if allowed, False otherwise."""
with self.lock:
now = time.time()
d_policy = policy.get('download_policy', {})
rate_limits = d_policy.get('rate_limits', {})
# Check per-IP limit
ip_limit = rate_limits.get('per_ip')
if ip_limit:
tracker_key = 'download_per_ip' # Use a distinct key
max_req = ip_limit.get('max_requests')
period_min = ip_limit.get('per_minutes')
if max_req and period_min:
timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
if len(timestamps) >= max_req:
logger.warning("Per-IP download rate limit reached. Skipping task.")
return False
self.state['rate_limit_trackers'][tracker_key] = timestamps
# Check per-proxy limit
proxy_limit = rate_limits.get('per_proxy')
if proxy_limit and proxy_url:
tracker_key = f"download_proxy_{proxy_url}"
max_req = proxy_limit.get('max_requests')
period_min = proxy_limit.get('per_minutes')
if max_req and period_min:
timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
if len(timestamps) >= max_req:
logger.warning(f"Per-proxy download rate limit for '{proxy_url}' reached. Skipping task.")
return False
self.state['rate_limit_trackers'][tracker_key] = timestamps
# If all checks pass, record the new request timestamp for all relevant trackers
if ip_limit and ip_limit.get('max_requests'):
self.state['rate_limit_trackers'].setdefault('download_per_ip', []).append(now)
if proxy_limit and proxy_limit.get('max_requests') and proxy_url:
self.state['rate_limit_trackers'].setdefault(f"download_proxy_{proxy_url}", []).append(now)
return True
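    # Illustrative download_policy.rate_limits fragment checked above (placeholder numbers):
    #
    #   download_policy:
    #     rate_limits:
    #       per_ip: {max_requests: 120, per_minutes: 60}
    #       per_proxy: {max_requests: 30, per_minutes: 60}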
def wait_for_proxy_cooldown(self, proxy_url, policy):
"""If a per-proxy sleep is defined, wait until the cooldown period has passed."""
with self.lock:
d_policy = policy.get('download_policy', {})
sleep_duration = d_policy.get('sleep_per_proxy_seconds', 0)
            if not proxy_url or sleep_duration <= 0:
return
last_finish = self.state.setdefault('proxy_last_finish_time', {}).get(proxy_url, 0)
elapsed = time.time() - last_finish
if elapsed < sleep_duration:
time_to_sleep = sleep_duration - elapsed
logger.info(f"Proxy '{proxy_url}' was used recently. Sleeping for {time_to_sleep:.2f}s.")
# Interruptible sleep
sleep_end_time = time.time() + time_to_sleep
while time.time() < sleep_end_time:
if shutdown_event.is_set():
logger.info("Shutdown requested during proxy cooldown sleep.")
break
time.sleep(0.2)
def update_proxy_finish_time(self, proxy_url):
"""Updates the last finish time for a proxy."""
with self.lock:
if not proxy_url:
return
self.state.setdefault('proxy_last_finish_time', {})[proxy_url] = time.time()
def print_summary(self, policy=None):
"""Print a summary of the test run."""
with self.lock:
# --- Cumulative Stats from State ---
now = time.time()
rate_trackers = self.state.get('rate_limit_trackers', {})
if rate_trackers:
logger.info("\n--- Cumulative Rate Summary (All Runs, updated at end of run) ---")
logger.info("This shows the total number of requests/downloads over various time windows, including previous runs.")
fetch_trackers = {k: v for k, v in rate_trackers.items() if not k.startswith('download_')}
download_trackers = {k: v for k, v in rate_trackers.items() if k.startswith('download_')}
def print_tracker_stats(trackers, tracker_type):
if not trackers:
logger.info(f"No historical {tracker_type} trackers found.")
return
logger.info(f"Historical {tracker_type} Trackers:")
for key, timestamps in sorted(trackers.items()):
windows = {
'last 10 min': 600, 'last 60 min': 3600,
'last 6 hours': 21600, 'last 24 hours': 86400
}
rates_str_parts = []
for name, seconds in windows.items():
count = sum(1 for ts in timestamps if now - ts <= seconds)
rate_rpm = (count / seconds) * 60 if seconds > 0 else 0
rates_str_parts.append(f"{count} in {name} ({rate_rpm:.2f}/min)")
# Clean up key for display
display_key = key.replace('download_', '').replace('per_ip', 'all_proxies/ips')
logger.info(f" - Tracker '{display_key}': " + ", ".join(rates_str_parts))
print_tracker_stats(fetch_trackers, "Fetch Request")
print_tracker_stats(download_trackers, "Download Attempt")
if not self.events:
logger.info("\nNo new events were recorded in this session.")
return
duration = time.time() - self.start_time
fetch_events = [e for e in self.events if e.get('type') == 'fetch']
download_events = [e for e in self.events if e.get('type') != 'fetch']
logger.info("\n--- Test Summary (This Run) ---")
logger.info(f"Total duration: {duration:.2f} seconds")
logger.info(f"Total info.json requests (cumulative): {self.get_request_count()}")
if policy:
logger.info("\n--- Test Configuration ---")
settings = policy.get('settings', {})
d_policy = policy.get('download_policy', {})
if settings.get('urls_file'):
logger.info(f"URL source file: {settings['urls_file']}")
if settings.get('info_json_dir'):
logger.info(f"Info.json source dir: {settings['info_json_dir']}")
if d_policy:
logger.info(f"Download formats: {d_policy.get('formats', 'N/A')}")
if d_policy.get('downloader'):
logger.info(f"Downloader: {d_policy.get('downloader')}")
if d_policy.get('downloader_args'):
logger.info(f"Downloader args: {d_policy.get('downloader_args')}")
if d_policy.get('pause_before_download_seconds'):
logger.info(f"Pause before download: {d_policy.get('pause_before_download_seconds')}s")
if d_policy.get('sleep_between_formats'):
sleep_cfg = d_policy.get('sleep_between_formats')
logger.info(f"Sleep between formats: {sleep_cfg.get('min_seconds', 0)}-{sleep_cfg.get('max_seconds', 0)}s")
if fetch_events:
total_fetches = len(fetch_events)
successful_fetches = sum(1 for e in fetch_events if e['success'])
cancelled_fetches = sum(1 for e in fetch_events if e.get('error_type') == 'Cancelled')
failed_fetches = total_fetches - successful_fetches - cancelled_fetches
logger.info("\n--- Fetch Summary (This Run) ---")
logger.info(f"Total info.json fetch attempts: {total_fetches}")
logger.info(f" - Successful: {successful_fetches}")
logger.info(f" - Failed: {failed_fetches}")
if cancelled_fetches > 0:
logger.info(f" - Cancelled: {cancelled_fetches}")
completed_fetches = successful_fetches + failed_fetches
if completed_fetches > 0:
success_rate = (successful_fetches / completed_fetches) * 100
logger.info(f"Success rate (of completed): {success_rate:.2f}%")
elif total_fetches > 0:
logger.info("Success rate: N/A (no tasks completed)")
if duration > 1 and total_fetches > 0:
rpm = (total_fetches / duration) * 60
logger.info(f"Actual fetch rate: {rpm:.2f} requests/minute")
if failed_fetches > 0:
error_counts = collections.Counter(
e.get('error_type', 'Unknown')
for e in fetch_events if not e['success'] and e.get('error_type') != 'Cancelled'
)
logger.info("Failure breakdown:")
for error_type, count in sorted(error_counts.items()):
logger.info(f" - {error_type}: {count}")
profile_counts = collections.Counter(e.get('profile') for e in fetch_events if e.get('profile'))
if profile_counts:
logger.info("Requests per profile:")
for profile, count in sorted(profile_counts.items()):
logger.info(f" - {profile}: {count}")
proxy_counts = collections.Counter(e.get('proxy_url') for e in fetch_events if e.get('proxy_url'))
if proxy_counts:
logger.info("Requests per proxy:")
for proxy, count in sorted(proxy_counts.items()):
logger.info(f" - {proxy}: {count}")
if download_events:
total_attempts = len(download_events)
successes = sum(1 for e in download_events if e['success'])
cancelled = sum(1 for e in download_events if e.get('error_type') == 'Cancelled')
failures = total_attempts - successes - cancelled
# --- Profile Association for Download Events ---
download_profiles = [e.get('profile') for e in download_events]
# For download_only mode, we might need to fall back to regex extraction
# if the profile wasn't passed down (e.g., no profile grouping).
profile_regex = None
if policy:
settings = policy.get('settings', {})
if settings.get('mode') == 'download_only':
profile_regex = settings.get('profile_extraction_regex')
if profile_regex:
for i, e in enumerate(download_events):
if not download_profiles[i]: # If profile wasn't set in the event
path = Path(e.get('path', ''))
match = re.search(profile_regex, path.name)
if match and match.groups():
download_profiles[i] = match.group(1)
# Replace any remaining Nones with 'unknown_profile'
download_profiles = [p or 'unknown_profile' for p in download_profiles]
num_profiles_used = len(set(p for p in download_profiles if p != 'unknown_profile'))
logger.info("\n--- Download Summary (This Run) ---")
if policy:
workers = policy.get('execution_control', {}).get('workers', 'N/A')
logger.info(f"Workers configured: {workers}")
logger.info(f"Profiles utilized for downloads: {num_profiles_used}")
logger.info(f"Total download attempts: {total_attempts}")
logger.info(f" - Successful: {successes}")
logger.info(f" - Failed: {failures}")
if cancelled > 0:
logger.info(f" - Cancelled: {cancelled}")
completed_downloads = successes + failures
if completed_downloads > 0:
success_rate = (successes / completed_downloads) * 100
logger.info(f"Success rate (of completed): {success_rate:.2f}%")
elif total_attempts > 0:
logger.info("Success rate: N/A (no tasks completed)")
duration_hours = duration / 3600.0
if duration > 1 and total_attempts > 0:
dpm = (total_attempts / duration) * 60
logger.info(f"Actual overall download rate: {dpm:.2f} attempts/minute")
total_bytes = sum(e.get('downloaded_bytes', 0) for e in download_events if e['success'])
if total_bytes > 0:
logger.info(f"Total data downloaded: {format_size(total_bytes)}")
if failures > 0:
error_counts = collections.Counter(
e.get('error_type', 'Unknown')
for e in download_events if not e['success'] and e.get('error_type') != 'Cancelled'
)
logger.info("Failure breakdown:")
for error_type, count in sorted(error_counts.items()):
logger.info(f" - {error_type}: {count}")
# Add profile to each download event for easier counting
for i, e in enumerate(download_events):
e['profile'] = download_profiles[i]
profile_counts = collections.Counter(e.get('profile') for e in download_events if e.get('profile'))
if profile_counts:
logger.info("Downloads per profile:")
for profile, count in sorted(profile_counts.items()):
rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0
logger.info(f" - {profile}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)")
proxy_counts = collections.Counter(e.get('proxy_url') for e in download_events if e.get('proxy_url'))
if proxy_counts:
logger.info("Downloads per proxy:")
for proxy, count in sorted(proxy_counts.items()):
rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0
logger.info(f" - {proxy}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)")
logger.info("--------------------")
def _run_download_logic(source, info_json_content, policy, state_manager, profile_name=None):
"""Shared download logic for a single info.json."""
proxy_url = None
if info_json_content:
try:
info_data = json.loads(info_json_content)
proxy_url = info_data.get('_proxy_url')
except (json.JSONDecodeError, AttributeError):
logger.warning(f"[{get_display_name(source)}] Could not parse info.json to get proxy for download controls.")
if not state_manager.check_and_update_download_rate_limit(proxy_url, policy):
return []
state_manager.wait_for_proxy_cooldown(proxy_url, policy)
results = process_info_json_cycle(source, info_json_content, policy, state_manager, proxy_url=proxy_url, profile_name=profile_name)
state_manager.update_proxy_finish_time(proxy_url)
return results
def process_profile_task(profile_name, file_list, policy, state_manager, cycle_num):
"""Worker task for a profile, processing its files sequentially."""
logger.info(f"Worker {get_worker_id()} starting task for profile '{profile_name}' with {len(file_list)} files.")
all_results = []
for i, file_path in enumerate(file_list):
if shutdown_event.is_set():
logger.info(f"Shutdown requested, stopping task for profile '{profile_name}'.")
break
try:
with open(file_path, 'r', encoding='utf-8') as f:
info_json_content = f.read()
except (IOError, FileNotFoundError) as e:
logger.error(f"[{get_display_name(file_path)}] Could not read info.json file: {e}")
continue # Skip this file
results_for_file = _run_download_logic(file_path, info_json_content, policy, state_manager, profile_name=profile_name)
all_results.extend(results_for_file)
# Check for stop conditions after processing each file
should_stop_profile = False
for result in results_for_file:
if not result['success']:
s_conditions = policy.get('stop_conditions', {})
if s_conditions.get('on_failure') or \
(s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \
(s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'):
logger.info(f"Stopping further processing for profile '{profile_name}' due to failure.")
should_stop_profile = True
break
if should_stop_profile:
break
# Apply sleep between tasks for this profile
if i < len(file_list) - 1:
exec_control = policy.get('execution_control', {})
sleep_cfg = exec_control.get('sleep_between_tasks', {})
sleep_min = sleep_cfg.get('min_seconds', 0)
if sleep_min > 0:
sleep_max = sleep_cfg.get('max_seconds') or sleep_min
sleep_duration = random.uniform(sleep_min, sleep_max) if sleep_max > sleep_min else sleep_min
logger.debug(f"Profile '{profile_name}' sleeping for {sleep_duration:.2f}s before next file.")
# Interruptible sleep
sleep_end_time = time.time() + sleep_duration
while time.time() < sleep_end_time:
if shutdown_event.is_set():
break
time.sleep(0.2)
return all_results
def run_command(cmd, input_data=None, binary_stdout=False):
"""
Runs a command, captures its output, and returns status.
If binary_stdout is True, stdout is returned as bytes. Otherwise, both are decoded strings.
"""
logger.debug(f"Running command: {' '.join(cmd)}")
process = None
try:
# Always open in binary mode to handle both cases. We will decode later.
process = subprocess.Popen(
cmd,
stdin=subprocess.PIPE if input_data else None,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
preexec_fn=os.setsid # Start in a new process group to isolate from terminal signals
)
with process_lock:
running_processes.add(process)
stdout_capture = []
stderr_capture = []
def read_pipe(pipe, capture_list, display_pipe=None):
"""Reads a pipe line by line (as bytes), appending to a list and optionally displaying."""
for line in iter(pipe.readline, b''):
capture_list.append(line)
if display_pipe:
# Decode for display
display_line = line.decode('utf-8', errors='replace')
display_pipe.write(display_line)
display_pipe.flush()
# We must read stdout and stderr in parallel to prevent deadlocks.
stdout_thread = threading.Thread(target=read_pipe, args=(process.stdout, stdout_capture))
# Display stderr in real-time as it often contains progress info.
stderr_thread = threading.Thread(target=read_pipe, args=(process.stderr, stderr_capture, sys.stderr))
stdout_thread.start()
stderr_thread.start()
# Handle stdin after starting to read outputs to avoid deadlocks.
if input_data:
try:
process.stdin.write(input_data.encode('utf-8'))
process.stdin.close()
except (IOError, BrokenPipeError):
# This can happen if the process exits quickly or doesn't read stdin.
logger.debug(f"Could not write to stdin for command: {' '.join(cmd)}. Process may have already exited.")
# Wait for the process to finish and for all output to be read.
retcode = process.wait()
stdout_thread.join()
stderr_thread.join()
stdout_bytes = b"".join(stdout_capture)
stderr_bytes = b"".join(stderr_capture)
stdout = stdout_bytes if binary_stdout else stdout_bytes.decode('utf-8', errors='replace')
stderr = stderr_bytes.decode('utf-8', errors='replace')
return retcode, stdout, stderr
except FileNotFoundError:
logger.error(f"Command not found: {cmd[0]}. Make sure it's in your PATH.")
return -1, "", f"Command not found: {cmd[0]}"
except Exception as e:
logger.error(f"An error occurred while running command: {' '.join(cmd)}. Error: {e}")
return -1, "", str(e)
finally:
if process:
with process_lock:
running_processes.discard(process)
def run_download_worker(info_json_path, info_json_content, format_to_download, policy, profile_name=None):
"""
Performs a single download attempt. Designed to be run in a worker thread.
"""
download_policy = policy.get('download_policy', {})
settings = policy.get('settings', {})
downloader = download_policy.get('downloader')
# Get script command from settings, with fallback to download_policy for old format.
script_cmd_str = settings.get('download_script')
if not script_cmd_str:
script_cmd_str = download_policy.get('script')
if script_cmd_str:
download_cmd = shlex.split(script_cmd_str)
elif downloader == 'aria2c_rpc':
download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'aria-rpc']
elif downloader == 'native-cli':
download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'cli']
else:
# Default to the new native-py downloader if downloader is 'native-py' or not specified.
download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'py']
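    # Base commands selected above (illustrative; '<python>' stands for sys.executable):
    #   settings.download_script  -> shlex.split() of the configured command
    #   downloader: aria2c_rpc    -> <python> -m ytops_client.cli download aria-rpc
    #   downloader: native-cli    -> <python> -m ytops_client.cli download cli
    #   default (native-py)       -> <python> -m ytops_client.cli download py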
download_cmd.extend(['-f', format_to_download])
if downloader == 'aria2c_rpc':
if download_policy.get('aria_host'):
download_cmd.extend(['--aria-host', str(download_policy['aria_host'])])
if download_policy.get('aria_port'):
download_cmd.extend(['--aria-port', str(download_policy['aria_port'])])
if download_policy.get('aria_secret'):
download_cmd.extend(['--aria-secret', str(download_policy['aria_secret'])])
if download_policy.get('output_dir'):
download_cmd.extend(['--output-dir', str(download_policy['output_dir'])])
if download_policy.get('aria_remote_dir'):
download_cmd.extend(['--remote-dir', str(download_policy['aria_remote_dir'])])
if download_policy.get('aria_fragments_dir'):
download_cmd.extend(['--fragments-dir', str(download_policy['aria_fragments_dir'])])
# For stress testing, waiting is the desired default to get a success/fail result.
# Allow disabling it by explicitly setting aria_wait: false in the policy.
if download_policy.get('aria_wait', True):
download_cmd.append('--wait')
if download_policy.get('auto_merge_fragments'):
download_cmd.append('--auto-merge-fragments')
if download_policy.get('remove_fragments_after_merge'):
download_cmd.append('--remove-fragments-after-merge')
if download_policy.get('cleanup'):
download_cmd.append('--cleanup')
if download_policy.get('purge_on_complete'):
download_cmd.append('--purge-on-complete')
downloader_args = download_policy.get('downloader_args')
proxy = download_policy.get('proxy')
if proxy:
# Note: proxy_rename is not supported for aria2c_rpc mode.
proxy_arg = f"--all-proxy {shlex.quote(str(proxy))}"
if downloader_args:
downloader_args = f"{downloader_args} {proxy_arg}"
else:
downloader_args = proxy_arg
if downloader_args:
# For aria2c_rpc, the downloader_args value is passed directly to the script's --downloader-args option.
download_cmd.extend(['--downloader-args', downloader_args])
elif downloader == 'native-cli':
# This is the logic for the legacy download_tool.py (yt-dlp CLI wrapper).
pause_seconds = download_policy.get('pause_before_download_seconds')
if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0:
download_cmd.extend(['--pause', str(pause_seconds)])
if download_policy.get('continue_downloads'):
download_cmd.append('--download-continue')
# Add proxy if specified directly in the policy
proxy = download_policy.get('proxy')
if proxy:
download_cmd.extend(['--proxy', str(proxy)])
proxy_rename = download_policy.get('proxy_rename')
if proxy_rename:
download_cmd.extend(['--proxy-rename', str(proxy_rename)])
extra_args = download_policy.get('extra_args')
if extra_args:
download_cmd.extend(shlex.split(extra_args))
# Note: 'downloader' here refers to yt-dlp's internal downloader, not our script.
        # The policy key 'external_downloader' is clearer, but we support 'downloader' for backward compatibility.
ext_downloader = download_policy.get('external_downloader') or download_policy.get('downloader')
if ext_downloader and ext_downloader not in ['native-cli', 'native-py', 'aria2c_rpc']:
download_cmd.extend(['--downloader', str(ext_downloader)])
downloader_args = download_policy.get('downloader_args')
if downloader_args:
download_cmd.extend(['--downloader-args', str(downloader_args)])
if download_policy.get('merge_output_format'):
download_cmd.extend(['--merge-output-format', str(download_policy['merge_output_format'])])
if download_policy.get('cleanup'):
download_cmd.append('--cleanup')
else:
# This is the default logic for the new native-py downloader.
if download_policy.get('output_to_buffer'):
download_cmd.append('--output-buffer')
else:
# --output-dir is only relevant if not outputting to buffer.
if download_policy.get('output_dir'):
download_cmd.extend(['--output-dir', str(download_policy['output_dir'])])
if download_policy.get('temp_path'):
download_cmd.extend(['--temp-path', str(download_policy['temp_path'])])
if download_policy.get('continue_downloads'):
download_cmd.append('--download-continue')
pause_seconds = download_policy.get('pause_before_download_seconds')
if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0:
download_cmd.extend(['--pause', str(pause_seconds)])
proxy = download_policy.get('proxy')
if proxy:
download_cmd.extend(['--proxy', str(proxy)])
proxy_rename = download_policy.get('proxy_rename')
if proxy_rename:
download_cmd.extend(['--proxy-rename', str(proxy_rename)])
extra_args = download_policy.get('extra_args')
if extra_args:
download_cmd.extend(['--extra-ytdlp-args', str(extra_args)])
# Pass through downloader settings for yt-dlp to use
# e.g. to tell yt-dlp to use aria2c as its backend
ext_downloader = download_policy.get('external_downloader')
if ext_downloader:
download_cmd.extend(['--downloader', str(ext_downloader)])
downloader_args = download_policy.get('downloader_args')
if downloader_args:
download_cmd.extend(['--downloader-args', str(downloader_args)])
worker_id = get_worker_id()
display_name = get_display_name(info_json_path)
profile_log_part = f" [Profile: {profile_name}]" if profile_name else ""
log_prefix = f"[Worker {worker_id}]{profile_log_part} [{display_name} @ {format_to_download}]"
logger.info(f"{log_prefix} Kicking off download process...")
temp_info_file_path = None
try:
if isinstance(info_json_path, Path) and info_json_path.exists():
# The info.json is already in a file, pass its path directly.
download_cmd.extend(['--load-info-json', str(info_json_path)])
else:
# The info.json content is in memory, so write it to a temporary file.
import tempfile
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', encoding='utf-8') as temp_f:
temp_f.write(info_json_content)
temp_info_file_path = temp_f.name
download_cmd.extend(['--load-info-json', temp_info_file_path])
cmd_str_for_log = ' '.join(shlex.quote(s) for s in download_cmd)
logger.info(f"{log_prefix} Running download command: {cmd_str_for_log}")
output_to_buffer = download_policy.get('output_to_buffer', False)
retcode, stdout, stderr = run_command(download_cmd, binary_stdout=output_to_buffer)
finally:
if temp_info_file_path and os.path.exists(temp_info_file_path):
os.unlink(temp_info_file_path)
is_403_error = "HTTP Error 403" in stderr
is_timeout_error = "Read timed out" in stderr
output_to_buffer = download_policy.get('output_to_buffer', False)
result = {
'type': 'download',
'path': str(info_json_path),
'format': format_to_download,
'success': retcode == 0,
'error_type': None,
'details': '',
'downloaded_bytes': 0,
'profile': profile_name
}
if retcode == 0:
details_str = "OK"
size_in_bytes = 0
if output_to_buffer:
# The most accurate size is the length of the stdout buffer.
size_in_bytes = len(stdout) # stdout is bytes
details_str += f" (Buffered {format_size(size_in_bytes)})"
else:
size_match = re.search(r'\[download\]\s+100%\s+of\s+~?([0-9.]+)(B|KiB|MiB|GiB)', stderr)
if size_match:
value = float(size_match.group(1))
unit = size_match.group(2)
multipliers = {"B": 1, "KiB": 1024, "MiB": 1024**2, "GiB": 1024**3}
size_in_bytes = int(value * multipliers.get(unit, 1))
details_str += f" ({size_match.group(1)}{unit})"
result['downloaded_bytes'] = size_in_bytes
result['details'] = details_str
else:
# Check both stdout and stderr for error messages, as logging might be directed to stdout.
        # When output_to_buffer is enabled, stdout holds binary media data, so only stderr is searched.
        stdout_text = '' if output_to_buffer else stdout
        full_output = f"{stdout_text}\n{stderr}"
error_lines = [line for line in full_output.strip().split('\n') if 'ERROR:' in line]
result['details'] = error_lines[-1].strip() if error_lines else "Unknown error"
if is_403_error:
result['error_type'] = 'HTTP 403'
elif is_timeout_error:
result['error_type'] = 'Timeout'
else:
result['error_type'] = f'Exit Code {retcode}'
return result
def process_info_json_cycle(path, content, policy, state_manager, proxy_url=None, profile_name=None):
"""
Processes one info.json file for one cycle, downloading selected formats.
"""
results = []
display_name = get_display_name(path)
d_policy = policy.get('download_policy', {})
s_conditions = policy.get('stop_conditions', {})
format_selection = d_policy.get('formats', '')
try:
info_data = json.loads(content)
available_formats = [f['format_id'] for f in info_data.get('formats', [])]
if not available_formats:
logger.warning(f"[{display_name}] No formats found in info.json. Skipping.")
return []
formats_to_test = []
if format_selection == 'all':
formats_to_test = available_formats
elif format_selection.startswith('random:'):
percent = float(format_selection.split(':')[1].rstrip('%'))
count = max(1, int(len(available_formats) * (percent / 100.0)))
formats_to_test = random.sample(available_formats, k=count)
elif format_selection.startswith('random_from:'):
choices = [f.strip() for f in format_selection.split(':', 1)[1].split(',')]
valid_choices = [f for f in choices if f in available_formats]
if valid_choices:
formats_to_test = [random.choice(valid_choices)]
else:
requested_formats = [f.strip() for f in format_selection.split(',') if f.strip()]
formats_to_test = []
for req_fmt in requested_formats:
# Check for exact match first
if req_fmt in available_formats:
formats_to_test.append(req_fmt)
continue
# If no exact match, check for formats that start with this ID + '-'
# e.g., req_fmt '140' should match '140-0'
prefix_match = f"{req_fmt}-"
first_match = next((af for af in available_formats if af.startswith(prefix_match)), None)
if first_match:
logger.info(f"[{display_name}] Requested format '{req_fmt}' not found. Using first available match: '{first_match}'.")
formats_to_test.append(first_match)
else:
# This could be a complex selector like 'bestvideo' or '299/298', so keep it.
if req_fmt not in available_formats:
logger.warning(f"[{display_name}] Requested format '{req_fmt}' not found in available formats.")
formats_to_test.append(req_fmt)
except json.JSONDecodeError:
logger.error(f"[{display_name}] Failed to parse info.json. Skipping.")
return []
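    # Illustrative download_policy.formats values handled above (the IDs are examples):
    #   'all'                  -> every format_id in the info.json
    #   'random:25%'           -> a random 25% sample of the available formats
    #   'random_from:140,251'  -> one random pick from the listed IDs that are present
    #   '140,299/298'          -> exact IDs, prefix matches (e.g. '140-0'), or yt-dlp selectors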
for i, format_id in enumerate(formats_to_test):
if shutdown_event.is_set():
logger.info(f"Shutdown requested, stopping further format tests for {display_name}.")
break
# Check if the format URL is expired before attempting to download
format_details = next((f for f in info_data.get('formats', []) if f.get('format_id') == format_id), None)
if format_details and 'url' in format_details:
parsed_url = urlparse(format_details['url'])
query_params = parse_qs(parsed_url.query)
expire_ts_str = query_params.get('expire', [None])[0]
if expire_ts_str and expire_ts_str.isdigit():
expire_ts = int(expire_ts_str)
if expire_ts < time.time():
logger.warning(f"[{display_name}] Skipping format '{format_id}' because its URL is expired.")
result = {
'type': 'download', 'path': str(path), 'format': format_id,
'success': True, 'error_type': 'Skipped',
'details': 'Download URL is expired', 'downloaded_bytes': 0
}
if proxy_url:
result['proxy_url'] = proxy_url
state_manager.log_event(result)
results.append(result)
continue # Move to the next format
result = run_download_worker(path, content, format_id, policy, profile_name=profile_name)
if proxy_url:
result['proxy_url'] = proxy_url
state_manager.log_event(result)
results.append(result)
worker_id = get_worker_id()
status = "SUCCESS" if result['success'] else f"FAILURE ({result['error_type']})"
profile_log_part = f" [Profile: {profile_name}]" if profile_name else ""
logger.info(f"[Worker {worker_id}]{profile_log_part} Result for {display_name} (format {format_id}): {status} - {result.get('details', 'OK')}")
if not result['success']:
if s_conditions.get('on_failure') or \
(s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \
(s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'):
logger.info(f"Stopping further format tests for {display_name} in this cycle due to failure.")
break
sleep_cfg = d_policy.get('sleep_between_formats', {})
sleep_min = sleep_cfg.get('min_seconds', 0)
if sleep_min > 0 and i < len(formats_to_test) - 1:
sleep_max = sleep_cfg.get('max_seconds') or sleep_min
if sleep_max > sleep_min:
sleep_duration = random.uniform(sleep_min, sleep_max)
else:
sleep_duration = sleep_min
logger.debug(f"Sleeping for {sleep_duration:.2f}s between formats for {display_name}.")
# Interruptible sleep
sleep_end_time = time.time() + sleep_duration
while time.time() < sleep_end_time:
if shutdown_event.is_set():
break
time.sleep(0.2)
return results
def update_dict(d, u):
"""Recursively update a dictionary."""
for k, v in u.items():
if isinstance(v, collections.abc.Mapping):
d[k] = update_dict(d.get(k, {}), v)
else:
d[k] = v
return d
def load_policy(policy_file, policy_name=None):
"""Load a policy from a YAML file."""
try:
with open(policy_file, 'r', encoding='utf-8') as f:
# If a policy name is given, look for that specific document
if policy_name:
docs = list(yaml.safe_load_all(f))
for doc in docs:
if isinstance(doc, dict) and doc.get('name') == policy_name:
return doc
raise ValueError(f"Policy '{policy_name}' not found in {policy_file}")
# Otherwise, load the first document
return yaml.safe_load(f)
except (IOError, yaml.YAMLError, ValueError) as e:
logger.error(f"Failed to load policy file {policy_file}: {e}")
sys.exit(1)
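# Illustrative multi-document policy file layout accepted by load_policy()
# (policy names are placeholders; '---' separates YAML documents):
#
#   ---
#   name: fetch_smoke_test
#   settings: {mode: fetch_only}
#   ---
#   name: download_smoke_test
#   settings: {mode: download_only}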
def apply_overrides(policy, overrides):
"""Apply command-line overrides to the policy."""
for override in overrides:
try:
key, value = override.split('=', 1)
keys = key.split('.')
# Try to parse as JSON/YAML if it looks like a list or dict, otherwise treat as scalar
if (value.startswith('[') and value.endswith(']')) or \
(value.startswith('{') and value.endswith('}')):
try:
value = yaml.safe_load(value)
except yaml.YAMLError:
logger.warning(f"Could not parse override value '{value}' as YAML. Treating as a string.")
else:
# Try to auto-convert scalar value type
if value.lower() == 'true':
value = True
elif value.lower() == 'false':
value = False
elif value.lower() == 'null':
value = None
else:
try:
value = int(value)
except ValueError:
try:
value = float(value)
except ValueError:
pass # Keep as string
d = policy
for k in keys[:-1]:
d = d.setdefault(k, {})
d[keys[-1]] = value
except ValueError:
logger.error(f"Invalid override format: '{override}'. Use 'key.subkey=value'.")
sys.exit(1)
return policy
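# Illustrative --set overrides accepted by apply_overrides() (keys and values are examples):
#   execution_control.workers=4                                         -> int
#   download_policy.cleanup=true                                        -> bool
#   settings.urls_file=my_urls.txt                                      -> str
#   settings.profile_management.cookie_files=["/p/c1.txt","/p/c2.txt"]  -> list (parsed as YAML)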
def display_effective_policy(policy, name, sources=None, profile_names=None, original_workers_setting=None):
"""Prints a human-readable summary of the effective policy."""
logger.info(f"--- Effective Policy: {name} ---")
settings = policy.get('settings', {})
exec_control = policy.get('execution_control', {})
logger.info(f"Mode: {settings.get('mode', 'full_stack')}")
if profile_names:
num_profiles = len(profile_names)
logger.info(f"Profiles found: {num_profiles}")
if num_profiles > 0:
# Sort profiles for consistent display, show top 10
sorted_profiles = sorted(profile_names)
profiles_to_show = sorted_profiles[:10]
logger.info(f" (e.g., {', '.join(profiles_to_show)}{'...' if num_profiles > 10 else ''})")
workers_display = str(exec_control.get('workers', 1))
if original_workers_setting == 'auto':
workers_display = f"auto (calculated: {workers_display})"
logger.info(f"Workers: {workers_display}")
sleep_cfg = exec_control.get('sleep_between_tasks', {})
sleep_min = sleep_cfg.get('min_seconds')
if sleep_min is not None:
sleep_max = sleep_cfg.get('max_seconds') or sleep_min
if sleep_max > sleep_min:
logger.info(f"Sleep between tasks (per worker): {sleep_min}-{sleep_max}s (random)")
else:
logger.info(f"Sleep between tasks (per worker): {sleep_min}s")
run_until = exec_control.get('run_until', {})
run_conditions = []
if 'minutes' in run_until:
run_conditions.append(f"for {run_until['minutes']} minutes")
if 'requests' in run_until:
run_conditions.append(f"until {run_until['requests']} total requests")
if 'cycles' in run_until:
run_conditions.append(f"for {run_until['cycles']} cycles")
if run_conditions:
logger.info(f"Run condition: Stop after running {' or '.join(run_conditions)}.")
if 'minutes' in run_until and 'cycles' not in run_until:
logger.info("Will continuously cycle through sources until time limit is reached.")
else:
logger.warning("WARNING: No 'run_until' condition is set. This test will run forever unless stopped manually.")
logger.info("Run condition: No stop condition defined, will run indefinitely (until Ctrl+C).")
# --- Rate Calculation ---
if sources:
workers = exec_control.get('workers', 1)
num_sources = len(profile_names) if profile_names else len(sources)
min_sleep = sleep_cfg.get('min_seconds', 0)
max_sleep = sleep_cfg.get('max_seconds') or min_sleep
avg_sleep_per_task = (min_sleep + max_sleep) / 2
# Assume an average task duration. This is a major assumption.
mode = settings.get('mode', 'full_stack')
assumptions = exec_control.get('assumptions', {})
assumed_fetch_duration = 0
if mode in ['full_stack', 'fetch_only']:
assumed_fetch_duration = assumptions.get('fetch_task_duration', 12 if mode == 'full_stack' else 3)
assumed_download_duration = 0
if mode in ['full_stack', 'download_only']:
# This assumes the total time to download all formats for a single source.
assumed_download_duration = assumptions.get('download_task_duration', 60)
total_assumed_task_duration = assumed_fetch_duration + assumed_download_duration
if workers > 0 and total_assumed_task_duration > 0:
total_time_per_task = total_assumed_task_duration + avg_sleep_per_task
tasks_per_minute_per_worker = 60 / total_time_per_task
total_tasks_per_minute = tasks_per_minute_per_worker * workers
logger.info("--- Rate Estimation ---")
logger.info(f"Source count: {num_sources}")
if mode in ['full_stack', 'fetch_only']:
logger.info(f"Est. fetch time per source: {assumed_fetch_duration}s (override via execution_control.assumptions.fetch_task_duration)")
if mode in ['full_stack', 'download_only']:
logger.info(f"Est. download time per source: {assumed_download_duration}s (override via execution_control.assumptions.download_task_duration)")
logger.info(" (Note: This assumes total time for all formats per source)")
logger.info(f"Est. sleep per task: {avg_sleep_per_task:.1f}s")
logger.info(f"==> Expected task rate: ~{total_tasks_per_minute:.2f} tasks/minute ({workers} workers * {tasks_per_minute_per_worker:.2f} tasks/min/worker)")
target_rate_cfg = exec_control.get('target_rate', {})
target_reqs = target_rate_cfg.get('requests')
target_mins = target_rate_cfg.get('per_minutes')
if target_reqs and target_mins:
target_rpm = target_reqs / target_mins
logger.info(f"Target rate: {target_rpm:.2f} tasks/minute")
if total_tasks_per_minute < target_rpm * 0.8:
logger.warning("Warning: Expected rate is significantly lower than target rate.")
logger.warning("Consider increasing workers, reducing sleep, or checking task performance.")
logger.info("---------------------------------")
time.sleep(2) # Give user time to read
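# Worked example for the rate estimation above (all numbers are illustrative):
#   4 workers, 12s assumed fetch + 60s assumed download, 8s average sleep per task
#   -> 60 / (72 + 8) = 0.75 tasks/min per worker -> ~3.0 tasks/minute in total.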
def add_stress_policy_parser(subparsers):
"""Add the parser for the 'stress-policy' command."""
parser = subparsers.add_parser(
'stress-policy',
description="The primary, policy-driven stress-testing orchestrator.\nIt runs complex, multi-stage stress tests based on a YAML policy file.\nUse '--list-policies' to see available pre-configured scenarios.\n\nModes supported:\n- full_stack: Generate info.json and then download from it.\n- fetch_only: Only generate info.json files.\n- download_only: Only download from existing info.json files.",
formatter_class=argparse.RawTextHelpFormatter,
help='Run advanced, policy-driven stress tests (recommended).',
epilog="""
Examples:
1. Fetch info.jsons for a TV client with a single profile and a rate limit:
ytops-client stress-policy --policy policies/1_fetch_only_policies.yaml \\
--policy-name tv_downgraded_single_profile \\
--set settings.urls_file=my_urls.txt \\
--set execution_control.run_until.minutes=30
# This runs a 'fetch_only' test using the 'tv_downgraded' client. It uses a single,
# static profile for all requests and enforces a safety limit of 450 requests per hour.
2. Fetch info.jsons for an Android client using cookies for authentication:
ytops-client stress-policy --policy policies/1_fetch_only_policies.yaml \\
--policy-name android_sdkless_with_cookies \\
--set settings.urls_file=my_urls.txt \\
--set info_json_generation_policy.request_params.cookies_file_path=/path/to/my_cookies.txt
# This demonstrates an authenticated 'fetch_only' test. It passes the path to a
# Netscape cookie file, which the server will use for the requests.
3. Download from a folder of info.jsons, grouped by profile, with auto-workers:
ytops-client stress-policy --policy policies/2_download_only_policies.yaml \\
--policy-name basic_profile_aware_download \\
--set settings.info_json_dir=/path/to/my/infojsons
# This runs a 'download_only' test. It scans a directory, extracts profile names from
# the filenames (e.g., 'tv_user_1' from '...-VIDEOID-tv_user_1.json'), and groups
# them. 'workers=auto' sets the number of workers to the number of unique profiles found.
4. Full-stack test with multiple workers and profile rotation:
ytops-client stress-policy --policy policies/3_full_stack_policies.yaml \\
--policy-name tv_simply_profile_rotation \\
--set settings.urls_file=my_urls.txt \\
--set execution_control.workers=4 \\
--set settings.profile_management.max_requests_per_profile=500
# This runs a 'full_stack' test with 4 parallel workers. Each worker gets a unique
# profile (e.g., tv_simply_user_0_0, tv_simply_user_1_0, etc.). After a profile is
# used 500 times, it is retired, and a new "generation" is created (e.g., tv_simply_user_0_1).
5. Full-stack authenticated test with a pool of profiles and corresponding cookie files:
ytops-client stress-policy --policy policies/3_full_stack_policies.yaml \\
--policy-name mweb_multi_profile_with_cookies \\
--set settings.urls_file=my_urls.txt \\
--set settings.profile_management.cookie_files='["/path/c1.txt","/path/c2.txt"]'
# This runs a 'full_stack' test using a pool of profiles (e.g., mweb_user_0, mweb_user_1).
# It uses the 'cookie_files' list to assign a specific cookie file to each profile in the
# pool, enabling multi-account authenticated testing. Note the JSON/YAML list format for the override.
6. Full-stack test submitting downloads to an aria2c RPC server:
ytops-client stress-policy --policy policies/3_full_stack_policies.yaml \\
--policy-name tv_simply_profile_rotation_aria2c_rpc \\
--set settings.urls_file=my_urls.txt \\
--set download_policy.aria_host=192.168.1.100 \\
--set download_policy.aria_port=6801
# This runs a test where downloads are not performed by the worker itself, but are
# sent to a remote aria2c daemon. The policy specifies 'downloader: aria2c_rpc'
# and provides connection details. This is useful for offloading download traffic.
--------------------------------------------------------------------------------
Overridable Policy Parameters via --set:
Key Description
-------------------------------------- ------------------------------------------------
[settings]
settings.mode Test mode: 'full_stack', 'fetch_only', or 'download_only'.
settings.urls_file Path to file with URLs/video IDs.
settings.info_json_dir Path to directory with existing info.json files.
settings.profile_extraction_regex For 'download_only' mode, a regex to extract profile names from info.json filenames. The first capture group is used as the profile name. E.g., '.*-(.*?).json'. This enables profile-aware sequential downloading.
settings.info_json_dir_sample_percent Randomly sample this % of files from the directory (for 'once' scan mode).
settings.directory_scan_mode For 'download_only': 'once' (default) or 'continuous' to watch for new files.
settings.mark_processed_files For 'continuous' scan mode: if true, rename processed files to '*.<timestamp>.processed' to avoid reprocessing.
settings.max_files_per_cycle For 'continuous' scan mode: max new files to process per cycle.
settings.sleep_if_no_new_files_seconds For 'continuous' scan mode: seconds to sleep if no new files are found (default: 10).
settings.profile_prefix (Legacy) Prefix for profile names (e.g., 'test_user').
settings.profile_pool (Legacy) Size of the profile pool.
settings.profile_mode Profile strategy. 'per_request' (legacy), 'per_worker' (legacy), or 'per_worker_with_rotation' (requires profile_management).
settings.info_json_script Command to run the info.json generation script (e.g., 'bin/ytops-client get-info').
settings.save_info_json_dir If set, save all successfully generated info.json files to this directory.
[settings.profile_management] (New, preferred method for profile control)
profile_management.prefix Prefix for profile names (e.g., 'dyn_user').
profile_management.suffix Suffix for profile names. Set to 'auto' for a timestamp, or provide a string.
profile_management.initial_pool_size The number of profiles to start with.
profile_management.auto_expand_pool If true, create new profiles when the initial pool is exhausted (all sleeping).
profile_management.max_requests_per_profile Max requests a profile can make before it must 'sleep'.
profile_management.sleep_minutes_on_exhaustion How many minutes a profile 'sleeps' after hitting its request limit.
profile_management.cookie_files A list of paths to cookie files. Used to assign a unique cookie file to each profile in a pool.
[execution_control]
execution_control.workers Number of parallel worker threads. Set to "auto" to calculate from target_rate or number of profiles.
execution_control.auto_workers_max The maximum number of workers to use when 'workers' is 'auto' in profile-aware download mode (default: 8).
execution_control.target_rate.requests Target requests for 'auto' workers calculation.
execution_control.target_rate.per_minutes Period in minutes for target_rate.
execution_control.run_until.minutes Stop test after N minutes. Will continuously cycle through sources.
execution_control.run_until.cycles Stop test after N cycles. A cycle is one full pass through all sources.
execution_control.run_until.requests Stop test after N total info.json requests (cumulative across runs).
execution_control.sleep_between_tasks.min_seconds Min sleep time between tasks, per worker.
[info_json_generation_policy]
info_json_generation_policy.client Client to use (e.g., 'mweb', 'tv_camoufox').
info_json_generation_policy.auth_host Host for the auth/Thrift service.
info_json_generation_policy.auth_port Port for the auth/Thrift service.
info_json_generation_policy.assigned_proxy_url A specific proxy to use for a request, overriding the server's proxy pool.
info_json_generation_policy.proxy_rename Regex substitution for the assigned proxy URL (e.g., 's/old/new/').
info_json_generation_policy.command_template A full command template for the info.json script. Overrides other keys.
info_json_generation_policy.rate_limits.per_ip.max_requests Max requests for the given time period from one IP.
info_json_generation_policy.rate_limits.per_ip.per_minutes Time period in minutes for the per_ip rate limit.
info_json_generation_policy.rate_limits.per_profile.max_requests Max requests for a single profile in a time period.
info_json_generation_policy.rate_limits.per_profile.per_minutes Time period in minutes for the per_profile rate limit.
info_json_generation_policy.client_rotation_policy.major_client The primary client to use for most requests.
info_json_generation_policy.client_rotation_policy.refresh_client The client to use periodically to refresh context.
info_json_generation_policy.client_rotation_policy.refresh_every.requests Trigger refresh client after N requests for a profile.
[download_policy]
download_policy.formats Formats to download (e.g., '18,140', 'random:50%').
download_policy.downloader Orchestrator script to use: 'native-py' (default, Python lib), 'native-cli' (legacy CLI wrapper), or 'aria2c_rpc'.
download_policy.external_downloader For 'native-py' or default, the backend yt-dlp should use (e.g., 'aria2c', 'native').
download_policy.downloader_args Arguments for the external_downloader. For yt-dlp, e.g., 'aria2c:-x 8'.
download_policy.merge_output_format Container to merge to (e.g., 'mkv'). Defaults to 'mp4' via cli.config.
download_policy.temp_path For 'native-py', path to a directory for temporary files (e.g., a RAM disk like /dev/shm).
download_policy.output_to_buffer For 'native-py', download to an in-memory buffer and pipe to stdout instead of saving to a file (true/false). Best for single-file formats.
download_policy.proxy Proxy for direct downloads (e.g., "socks5://127.0.0.1:1080").
download_policy.proxy_rename Regex substitution for the proxy URL (e.g., 's/old/new/').
download_policy.pause_before_download_seconds Pause for N seconds before starting each download attempt.
download_policy.continue_downloads Enable download continuation (true/false).
download_policy.cleanup After success: for native downloaders, rename and truncate file to 0 bytes; for 'aria2c_rpc', remove file(s) from filesystem.
download_policy.extra_args A string of extra arguments for the download script (e.g., "--limit-rate 5M").
download_policy.sleep_per_proxy_seconds Cooldown in seconds between downloads on the same proxy.
download_policy.rate_limits.per_proxy.max_requests Max downloads for a single proxy in a time period.
download_policy.rate_limits.per_proxy.per_minutes Time period in minutes for the per_proxy download rate limit.
# For downloader: 'aria2c_rpc'
download_policy.aria_host Hostname of the aria2c RPC server.
download_policy.aria_port Port of the aria2c RPC server.
download_policy.aria_secret Secret token for the aria2c RPC server.
download_policy.aria_wait Wait for aria2c downloads to complete (true/false).
download_policy.cleanup Remove downloaded file(s) from the filesystem on success. Requires script access to the download directory.
download_policy.purge_on_complete On success, purge ALL completed/failed downloads from aria2c history. Use as a workaround for older aria2c versions where targeted removal fails.
download_policy.output_dir Output directory for downloads.
download_policy.aria_remote_dir The absolute download path on the remote aria2c host.
download_policy.aria_fragments_dir The local path to find fragments for merging (if different from output_dir).
download_policy.auto_merge_fragments For fragmented downloads, automatically merge parts after download (true/false). Requires aria_wait=true.
download_policy.remove_fragments_after_merge For fragmented downloads, delete fragment files after a successful merge (true/false). Requires auto_merge_fragments=true.
[stop_conditions]
stop_conditions.on_failure Stop on any download failure (true/false).
stop_conditions.on_http_403 Stop on any HTTP 403 error (true/false).
stop_conditions.on_error_rate.max_errors Stop test if more than N errors (of any type) occur within the time period.
stop_conditions.on_error_rate.per_minutes Time period in minutes for the error rate calculation.
stop_conditions.on_cumulative_403.max_errors Stop test if more than N HTTP 403 errors occur within the time period.
stop_conditions.on_cumulative_403.per_minutes Time period in minutes for the cumulative 403 calculation.
stop_conditions.on_quality_degradation.trigger_if_missing_formats A format ID or comma-separated list of IDs. Triggers if any are missing.
stop_conditions.on_quality_degradation.max_triggers Stop test if quality degradation is detected N times.
stop_conditions.on_quality_degradation.per_minutes Time period in minutes for the quality degradation calculation.
--------------------------------------------------------------------------------
"""
)
parser.add_argument('--policy', help='Path to the YAML policy file. Required unless --list-policies is used.')
parser.add_argument('--policy-name', help='Name of the policy to run from a multi-policy file (if it contains "---" separators).')
parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.')
parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single line of --set arguments, then exit.')
parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)")
parser.add_argument('--verbose', action='store_true', help='Enable verbose output for the orchestrator and underlying scripts.')
parser.add_argument('--dry-run', action='store_true', help='Print the effective policy and exit without running the test.')
return parser
def list_policies():
"""Scans the policies directory and prints a list of available policies."""
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(script_dir, '..'))
policies_dir = os.path.join(project_root, 'policies')
if not os.path.isdir(policies_dir):
print(f"Error: Policies directory not found at '{policies_dir}'", file=sys.stderr)
return 1
print("Available Policies:")
print("=" * 20)
policy_files = sorted(Path(policies_dir).glob('*.yaml'))
if not policy_files:
print("No policy files (.yaml) found.")
return 0
for policy_file in policy_files:
print(f"\n--- File: {policy_file.relative_to(project_root)} ---")
try:
with open(policy_file, 'r', encoding='utf-8') as f:
content = f.read()
# Split into documents. The separator is a line that is exactly '---'.
documents = re.split(r'^---$', content, flags=re.MULTILINE)
found_any_in_file = False
for doc in documents:
doc = doc.strip()
if not doc:
continue
lines = doc.split('\n')
policy_name = None
description_lines = []
# Find name and description
for i, line in enumerate(lines):
if line.strip().startswith('name:'):
policy_name = line.split(':', 1)[1].strip()
# Look backwards for comments
j = i - 1
current_desc_block = []
while j >= 0 and lines[j].strip().startswith('#'):
comment = lines[j].strip().lstrip('#').strip()
current_desc_block.insert(0, comment)
j -= 1
if current_desc_block:
description_lines = current_desc_block
break
if policy_name:
found_any_in_file = True
print(f" - Name: {policy_name}")
if description_lines:
# Heuristic to clean up "Policy: " prefix
if description_lines[0].lower().startswith('policy:'):
description_lines[0] = description_lines[0][len('policy:'):].strip()
print(f" Description: {description_lines[0]}")
for desc_line in description_lines[1:]:
print(f" {desc_line}")
else:
print(" Description: (No description found)")
relative_path = policy_file.relative_to(project_root)
print(f" Usage: --policy {relative_path} --policy-name {policy_name}")
if not found_any_in_file:
print(" (No named policies found in this file)")
except Exception as e:
print(f" Error parsing {policy_file.name}: {e}")
return 0
def main_stress_policy(args):
"""Main logic for the 'stress-policy' command."""
if args.list_policies:
return list_policies()
if not args.policy:
print("Error: --policy is required unless using --list-policies.", file=sys.stderr)
return 1
# Handle --show-overrides early, as it doesn't run the test.
if args.show_overrides:
policy = load_policy(args.policy, args.policy_name)
if not policy:
return 1 # load_policy prints its own error
print_policy_overrides(policy)
return 0
log_level = logging.DEBUG if args.verbose else logging.INFO
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' if args.verbose else '%(asctime)s - %(message)s'
date_format = None if args.verbose else '%H:%M:%S'
logging.basicConfig(level=log_level, format=log_format, datefmt=date_format, stream=sys.stdout)
policy = load_policy(args.policy, args.policy_name)
if not policy:
    return 1  # load_policy prints its own error
policy = apply_overrides(policy, args.set)
policy_name = policy.get('name', args.policy_name or Path(args.policy).stem)
state_manager = StateManager(policy_name)
# --- Graceful shutdown handler ---
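# First signal: set the shutdown event, persist state, and SIGKILL the process groups of any
# tracked subprocesses so blocked workers can return. A second signal hard-exits via os._exit(1).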
def shutdown_handler(signum, frame):
if not shutdown_event.is_set():
logger.info(f"\nSignal {signum} received, shutting down gracefully...")
shutdown_event.set()
# Save state immediately to prevent loss on interrupt.
logger.info("Attempting to save state before shutdown...")
state_manager.close()
# Kill running subprocesses to unblock workers
with process_lock:
if running_processes:
logger.info(f"Terminating {len(running_processes)} running subprocess(es)...")
for p in running_processes:
try:
# Kill the entire process group to ensure child processes (like yt-dlp) are terminated.
os.killpg(os.getpgid(p.pid), signal.SIGKILL)
except (ProcessLookupError, PermissionError):
pass # Process already finished or we lack permissions
logger.info("Subprocesses terminated. Waiting for workers to finish. Press Ctrl+C again to force exit.")
else:
logger.info("Second signal received, forcing exit.")
# Use os._exit for a hard exit that doesn't run cleanup handlers,
# which can deadlock if locks are held.
os._exit(1)
signal.signal(signal.SIGINT, shutdown_handler)
signal.signal(signal.SIGTERM, shutdown_handler)
settings = policy.get('settings', {})
# --- Load sources based on mode ---
mode = settings.get('mode', 'full_stack')
sources = [] # This will be a list of URLs or Path objects
if mode in ['full_stack', 'fetch_only']:
urls_file = settings.get('urls_file')
if not urls_file:
logger.error("Policy mode requires 'settings.urls_file'.")
return 1
try:
with open(urls_file, 'r', encoding='utf-8') as f:
content = f.read()
try:
data = json.loads(content)
if isinstance(data, list) and all(isinstance(item, str) for item in data):
sources = data
logger.info(f"Loaded {len(sources)} URLs/IDs from JSON array in {urls_file}.")
else:
logger.error(f"URL file '{urls_file}' is valid JSON but not an array of strings.")
return 1
except json.JSONDecodeError:
sources = [line.strip() for line in content.splitlines() if line.strip()]
logger.info(f"Loaded {len(sources)} URLs/IDs from text file {urls_file}.")
except IOError as e:
logger.error(f"Failed to read urls_file {urls_file}: {e}")
return 1
# Clean up URLs/IDs which might have extra quotes, commas, or brackets from copy-pasting
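# e.g. '  "https://youtu.be/dQw4w9WgXcQ", ' -> 'https://youtu.be/dQw4w9WgXcQ'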
cleaned_sources = []
for source in sources:
cleaned_source = source.strip().rstrip(',').strip().strip('\'"[]').strip()
if cleaned_source:
cleaned_sources.append(cleaned_source)
if len(cleaned_sources) != len(sources):
logger.info(f"Cleaned URL list, removed {len(sources) - len(cleaned_sources)} empty or invalid entries.")
sources = cleaned_sources
elif mode == 'download_only':
# If not in continuous mode, load sources once at the start.
# In continuous mode, `sources` is populated at the start of each cycle.
if settings.get('directory_scan_mode') != 'continuous':
info_json_dir = settings.get('info_json_dir')
if not info_json_dir:
logger.error("Policy mode 'download_only' requires 'settings.info_json_dir'.")
return 1
try:
all_files = sorted(Path(info_json_dir).glob('*.json'))
sample_percent = settings.get('info_json_dir_sample_percent')
if sample_percent and 0 < sample_percent <= 100:
sample_count = int(len(all_files) * (sample_percent / 100.0))
num_to_sample = min(len(all_files), max(1, sample_count))
sources = random.sample(all_files, k=num_to_sample)
logger.info(f"Randomly sampled {len(sources)} files ({sample_percent}%) from {info_json_dir}")
else:
sources = all_files
except (IOError, FileNotFoundError) as e:
logger.error(f"Failed to read info_json_dir {info_json_dir}: {e}")
return 1
# In continuous download mode, sources are loaded inside the loop, so we skip this check.
if settings.get('directory_scan_mode') != 'continuous' and not sources:
logger.error("No sources (URLs or info.json files) to process. Exiting.")
return 1
# --- Group sources by profile if in download_only mode with regex ---
profile_tasks = None
task_items = sources # Default to list of sources
profile_extraction_regex = settings.get('profile_extraction_regex')
if mode == 'download_only' and profile_extraction_regex:
logger.info(f"Grouping info.json files by profile using regex: {profile_extraction_regex}")
profile_tasks = collections.defaultdict(list)
for source_path in sources:
profile_name = get_profile_from_filename(source_path, profile_extraction_regex)
if profile_name:
profile_tasks[profile_name].append(source_path)
else:
# Assign to a default profile if no match
profile_tasks['unmatched_profile'].append(source_path)
num_profiles = len(profile_tasks)
logger.info(f"Found {num_profiles} unique profiles. Tasks will be processed sequentially per profile.")
# The new "sources" for the purpose of task distribution are the profiles.
task_items = list(profile_tasks.items())
# --- Auto-calculate workers if needed ---
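# 'auto' is resolved to a concrete worker count here, before the effective policy is displayed:
# either from the number of profile groups (profile-aware download_only) or from the
# target_rate formula below.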
exec_control = policy.get('execution_control', {})
original_workers_setting = exec_control.get('workers')
if original_workers_setting == 'auto':
if mode == 'download_only' and profile_tasks is not None:
num_profiles = len(profile_tasks)
# Use auto_workers_max from policy, with a default of 8.
max_workers = exec_control.get('auto_workers_max', 8)
num_workers = min(num_profiles, max_workers)
exec_control['workers'] = max(1, num_workers)
logger.info(f"Calculated 'auto' workers based on {num_profiles} profiles (max: {max_workers}): {exec_control['workers']}")
else:
target_rate_cfg = exec_control.get('target_rate', {})
target_reqs = target_rate_cfg.get('requests')
target_mins = target_rate_cfg.get('per_minutes')
if target_reqs and target_mins and sources:
target_rpm = target_reqs / target_mins
num_sources = len(sources)
sleep_cfg = exec_control.get('sleep_between_tasks', {})
avg_sleep = (sleep_cfg.get('min_seconds', 0) + sleep_cfg.get('max_seconds', 0)) / 2
assumed_task_duration = 12 # Must match assumption in display_effective_policy
# Formula: workers = (total_work_seconds) / (total_time_for_work)
# total_time_for_work is derived from the target rate:
# (total_cycle_time) = (60 * num_sources) / target_rpm
# total_time_for_work = total_cycle_time - avg_sleep
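# Worked example (hypothetical numbers): target_rate = 60 requests / 10 minutes (6 rpm),
# 100 sources, avg_sleep = 5s -> work_time_available = 60*100/6 - 5 = 995s;
# total_work_seconds = 100 * 12 = 1200 -> workers = 1200 / 995 ~= 1.21 -> 2 after ceiling.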
work_time_available = (60 * num_sources / target_rpm) - avg_sleep
if work_time_available <= 0:
# The sleep time alone makes the target rate impossible.
# Set workers to max parallelism as a best-effort.
num_workers = num_sources
logger.warning(f"Target rate of {target_rpm} req/min is likely unachievable due to sleep time of {avg_sleep}s.")
logger.warning(f"Setting workers to max parallelism ({num_workers}) as a best effort.")
else:
total_work_seconds = num_sources * assumed_task_duration
num_workers = total_work_seconds / work_time_available
calculated_workers = max(1, int(num_workers + 0.99)) # Ceiling
exec_control['workers'] = calculated_workers
logger.info(f"Calculated 'auto' workers based on target rate: {calculated_workers}")
else:
logger.warning("Cannot calculate 'auto' workers: 'target_rate' or sources are not defined. Defaulting to 1 worker.")
exec_control['workers'] = 1
display_effective_policy(
policy,
policy_name,
sources=sources,
profile_names=list(profile_tasks.keys()) if profile_tasks is not None else None,
original_workers_setting=original_workers_setting
)
if args.dry_run:
logger.info("Dry run complete. Exiting.")
return 0
start_time = time.time()
run_until_cfg = exec_control.get('run_until', {})
duration_seconds = (run_until_cfg.get('minutes') or 0) * 60
max_cycles = run_until_cfg.get('cycles') or 0
max_requests = run_until_cfg.get('requests') or 0
# --- Main test loop ---
cycles = 0
try:
def process_task(source, source_index, cycle_num):
"""Worker task for one source (URL or file path)."""
try:
if shutdown_event.is_set():
return [] # Shutdown initiated, do not start new work
# --- Step 1: Get info.json content ---
info_json_content = None
profile_name = None  # Also referenced by the download step; stays None in plain 'download_only' mode.
if mode in ['full_stack', 'fetch_only']:
gen_policy = policy.get('info_json_generation_policy', {})
cmd_template = gen_policy.get('command_template')
# --- Profile Generation ---
profile_name = None
profile_mode = settings.get('profile_mode')
pm_policy = settings.get('profile_management')
if profile_mode == 'per_worker_with_rotation':
if not pm_policy:
logger.error("Profile mode 'per_worker_with_rotation' requires 'settings.profile_management' configuration.")
# Log a failure event and skip
event = {'type': 'fetch', 'path': str(source), 'success': False, 'error_type': 'ConfigError', 'details': 'Missing profile_management section'}
state_manager.log_event(event)
return []
worker_id = get_worker_id()
profile_name = state_manager.get_or_rotate_worker_profile(worker_id, policy)
elif pm_policy:
# This is the existing dynamic cooldown logic
profile_name = state_manager.get_next_available_profile(policy)
if not profile_name:
logger.warning("No available profiles to run task. Skipping.")
return []
else:
# This is the legacy logic
profile_prefix = settings.get('profile_prefix')
if profile_prefix:
if profile_mode == 'per_request':
timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')
profile_name = f"{profile_prefix}_{timestamp}_{source_index}"
elif profile_mode == 'per_worker':
worker_index = get_worker_id()
profile_name = f"{profile_prefix}_{worker_index}"
else: # Default to pool logic
profile_pool = settings.get('profile_pool')
if profile_pool:
profile_name = f"{profile_prefix}_{source_index % profile_pool}"
else:
profile_name = "default" # A final fallback
# --- Rate Limit Check ---
if not state_manager.check_and_update_rate_limit(profile_name, policy):
return [] # Rate limited, skip this task
# --- Command Generation ---
gen_cmd = []
save_dir = settings.get('save_info_json_dir')
save_path = None
if cmd_template:
# Low-level template mode. The user is responsible for output.
video_id = get_video_id(source)
# A heuristic to add '--' if the video ID looks like an option.
# We split the template, find the standalone '{url}' placeholder,
# and insert '--' before it. This assumes it's a positional argument.
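# e.g. a hypothetical template 'bin/ytops-client get-info --profile {profile} {url}' with video
# id '-Abc123xyz9' becomes '... --profile <name> -- -Abc123xyz9'.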
template_parts = shlex.split(cmd_template)
try:
# Find from the end, in case it's used in an option value earlier.
url_index = len(template_parts) - 1 - template_parts[::-1].index('{url}')
if video_id.startswith('-'):
template_parts.insert(url_index, '--')
except ValueError:
# '{url}' not found as a standalone token, do nothing special.
pass
# Rejoin and then format the whole string.
gen_cmd_str = ' '.join(template_parts)
gen_cmd_str = gen_cmd_str.format(url=video_id, profile=profile_name)
gen_cmd = shlex.split(gen_cmd_str)
if args.verbose and '--verbose' not in gen_cmd:
gen_cmd.append('--verbose')
else:
# High-level policy mode. Orchestrator builds the command.
script_cmd_str = settings.get('info_json_script')
if not script_cmd_str:
logger.error("High-level policy requires 'settings.info_json_script'.")
return []
gen_cmd = shlex.split(script_cmd_str)
video_id = get_video_id(source)
client_to_use, request_params = state_manager.get_client_for_request(profile_name, gen_policy)
# --- Multi-Cookie File Logic ---
if pm_policy:
cookie_files = pm_policy.get('cookie_files')
if cookie_files and isinstance(cookie_files, list) and len(cookie_files) > 0:
profile_index = -1
# Extract index from profile name. Matches _<index> or _<worker_id>_<gen>
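# e.g. 'mweb_user_2' -> index 2 (pool mode); 'tv_simply_user_1_0' -> index 1 (worker id, rotation mode).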
match = re.search(r'_(\d+)(?:_(\d+))?$', profile_name)
if match:
# For rotation mode, the first group is worker_id. For pool mode, it's the profile index.
profile_index = int(match.group(1))
if profile_index != -1:
cookie_file_path = cookie_files[profile_index % len(cookie_files)]
if not request_params:
request_params = {}
request_params['cookies_file_path'] = cookie_file_path
logger.info(f"[{source}] Assigned cookie file '{os.path.basename(cookie_file_path)}' to profile '{profile_name}'")
else:
logger.warning(f"[{source}] Could not determine index for profile '{profile_name}' to assign cookie file.")
if client_to_use:
gen_cmd.extend(['--client', str(client_to_use)])
if gen_policy.get('auth_host'):
gen_cmd.extend(['--auth-host', str(gen_policy.get('auth_host'))])
if gen_policy.get('auth_port'):
gen_cmd.extend(['--auth-port', str(gen_policy.get('auth_port'))])
if profile_name != "default":
gen_cmd.extend(['--profile', profile_name])
# Add --print-proxy so we can track it for stats
if '--print-proxy' not in gen_cmd:
gen_cmd.append('--print-proxy')
if request_params:
gen_cmd.extend(['--request-params-json', json.dumps(request_params)])
if gen_policy.get('assigned_proxy_url'):
gen_cmd.extend(['--assigned-proxy-url', str(gen_policy.get('assigned_proxy_url'))])
if gen_policy.get('proxy_rename'):
gen_cmd.extend(['--proxy-rename', str(gen_policy.get('proxy_rename'))])
if args.verbose:
gen_cmd.append('--verbose')
# If saving is enabled, delegate saving to the client script.
if save_dir:
try:
os.makedirs(save_dir, exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# Note: Using a timestamped filename to avoid race conditions.
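# e.g. '20240101_120000-dQw4w9WgXcQ-tv_user_0.json' (illustrative values)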
filename = f"{timestamp}-{video_id}-{profile_name}.json"
save_path = Path(save_dir) / filename
gen_cmd.extend(['--output', str(save_path)])
# Stdout printing is suppressed by default when saving, so no extra flag is needed here.
except IOError as e:
logger.error(f"[{source}] Could not prepare save path in '{save_dir}': {e}")
# Continue without saving
save_path = None
# If not saving to a file, we need the output on stdout for the download step.
if not save_dir:
gen_cmd.append('--print-info-out')
# The positional video_id argument must come after all options.
# Use '--' to ensure it's not parsed as an option if it starts with a dash.
if video_id.startswith('-'):
gen_cmd.append('--')
gen_cmd.append(video_id)
worker_id = get_worker_id()
profile_log_part = f" [Profile: {profile_name}]" if profile_name else ""
logger.info(f"[Worker {worker_id}]{profile_log_part} [{source}] Running info.json command: {' '.join(shlex.quote(s) for s in gen_cmd)}")
retcode, stdout, stderr = run_command(gen_cmd)
info_json_content = stdout
# --- Extract proxy from stderr and record it for stats ---
proxy_url = None
proxy_match = re.search(r"Proxy used: (.*)", stderr)
if proxy_match:
proxy_url = proxy_match.group(1).strip()
state_manager.record_proxy_usage(proxy_url)
if retcode == 0:
# If the client script saved the file, stdout will be empty.
# If we need the content for a download step, we must read it back.
if not info_json_content.strip():
# Check stderr for the success message to confirm save.
saved_path_match = re.search(r"Successfully saved info.json to (.*)", stderr)
if saved_path_match:
output_file_str = saved_path_match.group(1).strip().strip("'\"")
logger.info(f"[{source}] -> {saved_path_match.group(0).strip()}")
# If this is a full_stack test, we need the content for the download worker.
if mode == 'full_stack':
try:
with open(output_file_str, 'r', encoding='utf-8') as f:
info_json_content = f.read()
except IOError as e:
logger.error(f"Could not read back info.json from '{output_file_str}': {e}")
retcode = -1 # Treat as failure
elif save_path:
# Command was told to save, but didn't confirm. Assume it worked if exit code is 0.
logger.info(f"[{source}] -> Client script exited 0, assuming info.json was saved to '{save_path}'")
if mode == 'full_stack':
try:
with open(save_path, 'r', encoding='utf-8') as f:
info_json_content = f.read()
except IOError as e:
logger.error(f"Could not read back info.json from '{save_path}': {e}")
retcode = -1
# If stdout is empty and we weren't saving, it's an issue.
elif not save_path and not cmd_template:
logger.error(f"[{source}] info.json generation gave no stdout and was not asked to save to a file.")
retcode = -1
else:
logger.info(f"[{source}] -> Successfully fetched info.json to memory/stdout.")
event = {'type': 'fetch', 'path': str(source), 'profile': profile_name}
if proxy_url:
event['proxy_url'] = proxy_url
if retcode != 0:
error_lines = [line for line in stderr.strip().split('\n') if 'error' in line.lower()]
error_msg = error_lines[-1] if error_lines else stderr.strip().split('\n')[-1]
logger.error(f"[{source}] Failed to generate info.json: {error_msg}")
event.update({'success': False, 'error_type': 'GetInfoJsonFail', 'details': error_msg})
state_manager.log_event(event)
return []
# Check for quality degradation before logging success
s_conditions = policy.get('stop_conditions', {})
quality_policy = s_conditions.get('on_quality_degradation')
if quality_policy and info_json_content:
try:
info_data = json.loads(info_json_content)
available_formats = {f.get('format_id') for f in info_data.get('formats', [])}
required_formats = quality_policy.get('trigger_if_missing_formats')
if required_formats:
# Can be a single string, a comma-separated string, or a list of strings.
if isinstance(required_formats, str):
required_formats = [f.strip() for f in required_formats.split(',')]
missing_formats = [f for f in required_formats if f not in available_formats]
if missing_formats:
logger.warning(f"[{source}] Quality degradation detected. Missing required formats: {', '.join(missing_formats)}.")
event['quality_degradation_trigger'] = True
event['missing_formats'] = missing_formats
except (json.JSONDecodeError, TypeError):
logger.warning(f"[{source}] Could not parse info.json or find formats to check for quality degradation.")
# Record request for profile cooldown policy if active
if pm_policy:
state_manager.record_profile_request(profile_name)
state_manager.increment_request_count()
event.update({'success': True, 'details': 'OK'})
state_manager.log_event(event)
# Saving is now delegated to the client script when a save_dir is provided.
# The orchestrator no longer saves the file itself.
elif mode == 'download_only':
# This path is for non-profile-grouped download_only mode.
try:
with open(source, 'r', encoding='utf-8') as f:
info_json_content = f.read()
except (IOError, FileNotFoundError) as e:
logger.error(f"[{get_display_name(source)}] Could not read info.json file: {e}")
return []
if mode != 'fetch_only':
return _run_download_logic(source, info_json_content, policy, state_manager, profile_name=profile_name)
return []
finally:
# Sleep after the task is completed to space out requests from this worker.
exec_control = policy.get('execution_control', {})
sleep_cfg = exec_control.get('sleep_between_tasks', {})
sleep_min = sleep_cfg.get('min_seconds', 0)
if sleep_min > 0:
sleep_max = sleep_cfg.get('max_seconds') or sleep_min
if sleep_max > sleep_min:
sleep_duration = random.uniform(sleep_min, sleep_max)
else:
sleep_duration = sleep_min
logger.debug(f"Worker sleeping for {sleep_duration:.2f}s after task for {get_display_name(source)}.")
# Interruptible sleep
sleep_end_time = time.time() + sleep_duration
while time.time() < sleep_end_time:
if shutdown_event.is_set():
break
time.sleep(0.2)
while not shutdown_event.is_set():
if duration_seconds and (time.time() - start_time) > duration_seconds:
logger.info("Reached duration limit. Stopping.")
break
if max_requests > 0 and state_manager.get_request_count() >= max_requests:
logger.info(f"Reached max requests ({max_requests}). Stopping.")
break
# --- Rescan for sources if in continuous download mode ---
if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous':
info_json_dir = settings.get('info_json_dir')
try:
all_files_in_dir = Path(info_json_dir).glob('*.json')
processed_files = state_manager.get_processed_files()
new_files = [f for f in all_files_in_dir if str(f) not in processed_files]
# Sort by modification time, oldest first, to process in order of creation
new_files.sort(key=os.path.getmtime)
max_files_per_cycle = settings.get('max_files_per_cycle')
if max_files_per_cycle and len(new_files) > max_files_per_cycle:
    new_files = new_files[:max_files_per_cycle]
sources = new_files
task_items = sources  # Keep the executor's task list in sync; it was bound to the initial 'sources' list.
if not sources:
sleep_duration = settings.get('sleep_if_no_new_files_seconds', 10)
logger.info(f"No new info.json files found in '{info_json_dir}'. Sleeping for {sleep_duration}s...")
# Interruptible sleep
sleep_end_time = time.time() + sleep_duration
while time.time() < sleep_end_time:
if shutdown_event.is_set():
break
time.sleep(0.5)
if shutdown_event.is_set():
break
continue # Skip to next iteration of the while loop
except (IOError, FileNotFoundError) as e:
logger.error(f"Failed to read info_json_dir {info_json_dir}: {e}. Retrying in 10s.")
time.sleep(10)
continue
cycles += 1
if max_cycles > 0 and cycles > max_cycles:
logger.info(f"Reached max cycles ({max_cycles}). Stopping.")
break
logger.info(f"--- Cycle #{cycles} (Total Requests: {state_manager.get_request_count()}) ---")
with concurrent.futures.ThreadPoolExecutor(max_workers=exec_control.get('workers', 1)) as executor:
if mode == 'download_only' and profile_tasks is not None:
# New: submit profile tasks
future_to_source = {
executor.submit(process_profile_task, profile_name, file_list, policy, state_manager, cycles): profile_name
for profile_name, file_list in task_items
}
else:
# Old: submit individual file/url tasks
future_to_source = {
executor.submit(process_task, source, i, cycles): source
for i, source in enumerate(task_items)
}
should_stop = False
pending_futures = set(future_to_source.keys())
while pending_futures and not should_stop:
done, pending_futures = concurrent.futures.wait(
pending_futures, return_when=concurrent.futures.FIRST_COMPLETED
)
for future in done:
if shutdown_event.is_set():
should_stop = True
break
source = future_to_source[future]
try:
results = future.result()
# Mark file as processed in continuous download mode
if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous':
state_manager.mark_file_as_processed(source)
if settings.get('mark_processed_files'):
try:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
new_path = source.parent / f"{source.name}.{timestamp}.processed"
source.rename(new_path)
logger.info(f"Marked '{source.name}' as processed by renaming to '{new_path.name}'")
except (IOError, OSError) as e:
logger.error(f"Failed to rename processed file '{source.name}': {e}")
for result in results:
if not result['success']:
s_conditions = policy.get('stop_conditions', {})
is_cumulative_403_active = s_conditions.get('on_cumulative_403', {}).get('max_errors')
if s_conditions.get('on_failure') or \
(s_conditions.get('on_http_403') and not is_cumulative_403_active and result['error_type'] == 'HTTP 403') or \
(s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'):
logger.info(f"!!! STOP CONDITION MET: Immediate stop on failure '{result['error_type']}' for {get_display_name(source)}. Shutting down all workers. !!!")
should_stop = True
break
except concurrent.futures.CancelledError:
logger.info(f"Task for {get_display_name(source)} was cancelled during shutdown.")
event = {
'type': 'fetch' if mode != 'download_only' else 'download',
'path': str(source),
'success': False,
'error_type': 'Cancelled',
'details': 'Task cancelled during shutdown.'
}
state_manager.log_event(event)
except Exception as exc:
logger.error(f'{get_display_name(source)} generated an exception: {exc}')
if should_stop:
break
# Check for cumulative error rate stop conditions
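# Immediate per-result conditions (on_failure, on_http_403, on_timeout) were handled above;
# the checks below aggregate logged events over the configured per_minutes window via the StateManager.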
s_conditions = policy.get('stop_conditions', {})
error_rate_policy = s_conditions.get('on_error_rate')
if error_rate_policy and not should_stop:
max_errors = error_rate_policy.get('max_errors')
per_minutes = error_rate_policy.get('per_minutes')
if max_errors and per_minutes:
error_count = state_manager.check_cumulative_error_rate(max_errors, per_minutes)
if error_count > 0:
logger.info(f"!!! STOP CONDITION MET: Error rate exceeded: {error_count} errors in the last {per_minutes} minute(s). Shutting down. !!!")
should_stop = True
cumulative_403_policy = s_conditions.get('on_cumulative_403')
if cumulative_403_policy and not should_stop:
max_errors = cumulative_403_policy.get('max_errors')
per_minutes = cumulative_403_policy.get('per_minutes')
if max_errors and per_minutes:
error_count = state_manager.check_cumulative_error_rate(max_errors, per_minutes, error_type='HTTP 403')
if error_count > 0:
logger.info(f"!!! STOP CONDITION MET: Cumulative 403 error rate exceeded: {error_count} errors in the last {per_minutes} minute(s). Shutting down. !!!")
should_stop = True
quality_degradation_policy = s_conditions.get('on_quality_degradation')
if quality_degradation_policy and not should_stop:
max_triggers = quality_degradation_policy.get('max_triggers')
per_minutes = quality_degradation_policy.get('per_minutes')
if max_triggers and per_minutes:
trigger_count = state_manager.check_quality_degradation_rate(max_triggers, per_minutes)
if trigger_count > 0:
logger.info(f"!!! STOP CONDITION MET: Quality degradation triggered {trigger_count} times in the last {per_minutes} minute(s). Shutting down. !!!")
should_stop = True
if should_stop:
break
# Check for duration limit after each task completes
if duration_seconds and (time.time() - start_time) > duration_seconds:
logger.info("Reached duration limit. Cancelling remaining tasks.")
should_stop = True
if should_stop and pending_futures:
logger.info(f"Cancelling {len(pending_futures)} outstanding task(s).")
for future in pending_futures:
future.cancel()
if should_stop: break
if max_cycles > 0 and cycles >= max_cycles:
break
logger.info("Cycle complete.")
except KeyboardInterrupt:
logger.info("\nForceful shutdown requested...")
finally:
state_manager.print_summary(policy)
state_manager.close()
return 0