#!/usr/bin/env python3
"""
Policy-driven stress-testing orchestrator for video format downloads.
"""

import argparse
import collections
import collections.abc
import concurrent.futures
import json
import logging
import os
import random
import re
import shlex
import signal
import subprocess
import sys
import threading
import time
from copy import deepcopy
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse, parse_qs

try:
    import yaml
except ImportError:
    print("PyYAML is not installed. Please install it with: pip install PyYAML", file=sys.stderr)
    sys.exit(1)

# Add a global event for graceful shutdown
shutdown_event = threading.Event()

# Globals for tracking and terminating subprocesses on shutdown
running_processes = set()
process_lock = threading.Lock()

# Globals for assigning a stable ID to each worker thread
worker_id_map = {}
worker_id_counter = 0
worker_id_lock = threading.Lock()

# Configure logging
logger = logging.getLogger('stress_policy_tool')


def get_worker_id():
    """Assigns a stable, sequential ID to each worker thread."""
    global worker_id_counter
    thread_id = threading.get_ident()
    with worker_id_lock:
        if thread_id not in worker_id_map:
            worker_id_map[thread_id] = worker_id_counter
            worker_id_counter += 1
        return worker_id_map[thread_id]


def get_video_id(url: str) -> str:
    """Extracts a YouTube video ID from a URL."""
    match = re.search(r"v=([0-9A-Za-z_-]{11})", url)
    if match:
        return match.group(1)
    match = re.search(r"youtu\.be\/([0-9A-Za-z_-]{11})", url)
    if match:
        return match.group(1)
    if re.fullmatch(r'[0-9A-Za-z_-]{11}', url):
        return url
    return "unknown_video_id"


def get_display_name(path_or_url):
    """Returns a clean name for logging, either a filename or a video ID."""
    if isinstance(path_or_url, Path):
        return path_or_url.name
    path_str = str(path_or_url)
    video_id = get_video_id(path_str)
    if video_id != "unknown_video_id":
        return video_id
    return Path(path_str).name


def format_size(b):
    """Format size in bytes to a human-readable string."""
    if b is None:
        return 'N/A'
    if b < 1024:
        return f"{b}B"
    elif b < 1024**2:
        return f"{b/1024:.2f}KiB"
    elif b < 1024**3:
        return f"{b/1024**2:.2f}MiB"
    else:
        return f"{b/1024**3:.2f}GiB"


def flatten_dict(d, parent_key='', sep='.'):
    """Flattens a nested dictionary."""
    items = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.abc.MutableMapping):
            items.update(flatten_dict(v, new_key, sep=sep))
        else:
            items[new_key] = v
    return items


def print_policy_overrides(policy):
    """Prints all policy values as a single line of --set arguments."""
    # We don't want to include the 'name' key in the overrides.
policy_copy = deepcopy(policy) policy_copy.pop('name', None) flat_policy = flatten_dict(policy_copy) set_args = [] for key, value in sorted(flat_policy.items()): if value is None: value_str = 'null' elif isinstance(value, bool): value_str = str(value).lower() elif isinstance(value, (list, dict)): # Use compact JSON for lists/dicts value_str = json.dumps(value, separators=(',', ':')) else: value_str = str(value) # Use shlex.quote to handle spaces and special characters safely set_args.append(f"--set {shlex.quote(f'{key}={value_str}')}") print(' '.join(set_args)) def get_profile_from_filename(path, regex_pattern): """Extracts a profile name from a filename using a regex.""" if not regex_pattern: return None match = re.search(regex_pattern, path.name) if match: # Assume the first capturing group is the profile name if match.groups(): return match.group(1) return None class StateManager: """Tracks statistics, manages rate limits, and persists state across runs.""" def __init__(self, policy_name): self.state_file_path = Path(f"{policy_name}_state.json") self.stats_file_path = Path(f"{policy_name}_stats.jsonl") self.lock = threading.RLock() self.start_time = time.time() self.events = [] self.state = { 'global_request_count': 0, 'rate_limit_trackers': {}, # e.g., {'per_ip': [ts1, ts2], 'profile_foo': [ts3, ts4]} 'profile_request_counts': {}, # for client rotation 'profile_last_refresh_time': {}, # for client rotation 'proxy_last_finish_time': {}, # for per-proxy sleep 'processed_files': [], # For continuous download_only mode # For dynamic profile cooldown strategy 'profile_cooldown_counts': {}, 'profile_cooldown_sleep_until': {}, 'profile_pool_size': 0, 'profile_run_suffix': None, 'worker_profile_generations': {} } self.stats_file_handle = None self._load_state() self.print_historical_summary() self._open_stats_log() def _load_state(self): if not self.state_file_path.exists(): logger.info(f"State file not found at '{self.state_file_path}', starting fresh.") return try: with open(self.state_file_path, 'r', encoding='utf-8') as f: self.state = json.load(f) # Ensure keys exist self.state.setdefault('global_request_count', 0) self.state.setdefault('rate_limit_trackers', {}) self.state.setdefault('profile_request_counts', {}) self.state.setdefault('profile_last_refresh_time', {}) self.state.setdefault('proxy_last_finish_time', {}) self.state.setdefault('processed_files', []) # For dynamic profile cooldown strategy self.state.setdefault('profile_cooldown_counts', {}) self.state.setdefault('profile_cooldown_sleep_until', {}) self.state.setdefault('profile_pool_size', 0) self.state.setdefault('profile_run_suffix', None) self.state.setdefault('worker_profile_generations', {}) logger.info(f"Loaded state from {self.state_file_path}") except (IOError, json.JSONDecodeError) as e: logger.error(f"Could not load or parse state file {self.state_file_path}: {e}. 
Starting fresh.") def _save_state(self): with self.lock: try: with open(self.state_file_path, 'w', encoding='utf-8') as f: json.dump(self.state, f, indent=2) logger.info(f"Saved state to {self.state_file_path}") except IOError as e: logger.error(f"Could not save state to {self.state_file_path}: {e}") def _open_stats_log(self): try: self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8') except IOError as e: logger.error(f"Could not open stats file {self.stats_file_path}: {e}") def close(self): """Saves state and closes file handles.""" self._save_state() if self.stats_file_handle: self.stats_file_handle.close() self.stats_file_handle = None def mark_file_as_processed(self, file_path): """Adds a file path to the list of processed files in the state.""" with self.lock: # Using a list and checking for existence is fine for moderate numbers of files. # A set isn't JSON serializable. processed = self.state.setdefault('processed_files', []) file_str = str(file_path) if file_str not in processed: processed.append(file_str) def get_processed_files(self): """Returns a set of file paths that have been processed.""" with self.lock: return set(self.state.get('processed_files', [])) def print_historical_summary(self): """Prints a summary based on the state loaded from disk, before new events.""" with self.lock: now = time.time() rate_trackers = self.state.get('rate_limit_trackers', {}) total_requests = self.state.get('global_request_count', 0) if not rate_trackers and not total_requests: logger.info("No historical data found in state file.") return logger.info("\n--- Summary From Previous Runs ---") logger.info(f"Total info.json requests (all previous runs): {total_requests}") if rate_trackers: for key, timestamps in sorted(rate_trackers.items()): # Time windows in seconds windows = { 'last 10 min': 600, 'last 60 min': 3600, 'last 6 hours': 21600, 'last 24 hours': 86400 } rates_str_parts = [] for name, seconds in windows.items(): count = sum(1 for ts in timestamps if now - ts <= seconds) # Calculate rate in requests per minute rate_rpm = (count / seconds) * 60 if seconds > 0 else 0 rates_str_parts.append(f"{count} req in {name} ({rate_rpm:.2f} rpm)") logger.info(f"Tracker '{key}': " + ", ".join(rates_str_parts)) logger.info("------------------------------------") def log_event(self, event_data): with self.lock: event_data['timestamp'] = datetime.now().isoformat() self.events.append(event_data) if self.stats_file_handle: self.stats_file_handle.write(json.dumps(event_data) + '\n') self.stats_file_handle.flush() def get_request_count(self): with self.lock: return self.state.get('global_request_count', 0) def increment_request_count(self): with self.lock: self.state['global_request_count'] = self.state.get('global_request_count', 0) + 1 def check_cumulative_error_rate(self, max_errors, per_minutes, error_type=None): """ Checks if a cumulative error rate has been exceeded. If error_type is None, checks for any failure. Returns the number of errors found if the threshold is met, otherwise 0. 
""" with self.lock: now = time.time() window_seconds = per_minutes * 60 if error_type: recent_errors = [ e for e in self.events if e.get('error_type') == error_type and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds ] else: # Generic failure check recent_errors = [ e for e in self.events if not e.get('success') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds ] if len(recent_errors) >= max_errors: return len(recent_errors) return 0 def check_quality_degradation_rate(self, max_triggers, per_minutes): """ Checks if the quality degradation trigger rate has been exceeded. Returns the number of triggers found if the threshold is met, otherwise 0. """ with self.lock: now = time.time() window_seconds = per_minutes * 60 recent_triggers = [ e for e in self.events if e.get('quality_degradation_trigger') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds ] if len(recent_triggers) >= max_triggers: return len(recent_triggers) return 0 def check_and_update_rate_limit(self, profile_name, policy): """ Checks if a request is allowed based on policy rate limits. If allowed, updates the internal state. Returns True if allowed, False otherwise. """ with self.lock: now = time.time() gen_policy = policy.get('info_json_generation_policy', {}) rate_limits = gen_policy.get('rate_limits', {}) # Check per-IP limit ip_limit = rate_limits.get('per_ip') if ip_limit: tracker_key = 'per_ip' max_req = ip_limit.get('max_requests') period_min = ip_limit.get('per_minutes') if max_req and period_min: timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) # Filter out old timestamps timestamps = [ts for ts in timestamps if now - ts < period_min * 60] if len(timestamps) >= max_req: logger.warning("Per-IP rate limit reached. Skipping task.") return False self.state['rate_limit_trackers'][tracker_key] = timestamps # Check per-profile limit profile_limit = rate_limits.get('per_profile') if profile_limit and profile_name: tracker_key = f"profile_{profile_name}" max_req = profile_limit.get('max_requests') period_min = profile_limit.get('per_minutes') if max_req and period_min: timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) timestamps = [ts for ts in timestamps if now - ts < period_min * 60] if len(timestamps) >= max_req: logger.warning(f"Per-profile rate limit for '{profile_name}' reached. Skipping task.") return False self.state['rate_limit_trackers'][tracker_key] = timestamps # If all checks pass, record the new request timestamp for all relevant trackers if ip_limit and ip_limit.get('max_requests'): self.state['rate_limit_trackers'].setdefault('per_ip', []).append(now) if profile_limit and profile_limit.get('max_requests') and profile_name: self.state['rate_limit_trackers'].setdefault(f"profile_{profile_name}", []).append(now) return True def get_client_for_request(self, profile_name, gen_policy): """ Determines which client to use based on the client_rotation_policy. Returns a tuple: (client_name, request_params_dict). """ with self.lock: rotation_policy = gen_policy.get('client_rotation_policy') # If no rotation policy, use the simple 'client' key. 
if not rotation_policy: client = gen_policy.get('client') logger.info(f"Using client '{client}' for profile '{profile_name}'.") req_params = gen_policy.get('request_params') return client, req_params # --- Rotation logic --- now = time.time() major_client = rotation_policy.get('major_client') refresh_client = rotation_policy.get('refresh_client') refresh_every = rotation_policy.get('refresh_every', {}) if not refresh_client or not refresh_every: return major_client, rotation_policy.get('major_client_params') should_refresh = False # Check time-based refresh refresh_minutes = refresh_every.get('minutes') last_refresh_time = self.state['profile_last_refresh_time'].get(profile_name, 0) if refresh_minutes and (now - last_refresh_time) > (refresh_minutes * 60): should_refresh = True # Check request-count-based refresh refresh_requests = refresh_every.get('requests') request_count = self.state['profile_request_counts'].get(profile_name, 0) if refresh_requests and request_count >= refresh_requests: should_refresh = True if should_refresh: logger.info(f"Profile '{profile_name}' is due for a refresh. Using refresh client '{refresh_client}'.") self.state['profile_last_refresh_time'][profile_name] = now self.state['profile_request_counts'][profile_name] = 0 # Reset counter return refresh_client, rotation_policy.get('refresh_client_params') else: # Not refreshing, so increment request count for this profile self.state['profile_request_counts'][profile_name] = request_count + 1 return major_client, rotation_policy.get('major_client_params') def get_next_available_profile(self, policy): """ Finds or creates an available profile based on the dynamic cooldown policy. Returns a profile name, or None if no profile is available. """ with self.lock: now = time.time() settings = policy.get('settings', {}) pm_policy = settings.get('profile_management') if not pm_policy: return None prefix = pm_policy.get('prefix') if not prefix: logger.error("Profile management policy requires 'prefix'.") return None # Determine and persist the suffix for this run to ensure profile names are stable run_suffix = self.state.get('profile_run_suffix') if not run_suffix: suffix_config = pm_policy.get('suffix') if suffix_config == 'auto': run_suffix = datetime.now().strftime('%Y%m%d%H%M') else: run_suffix = suffix_config or '' self.state['profile_run_suffix'] = run_suffix # Initialize pool size from policy if not already in state if self.state.get('profile_pool_size', 0) == 0: self.state['profile_pool_size'] = pm_policy.get('initial_pool_size', 1) max_reqs = pm_policy.get('max_requests_per_profile') sleep_mins = pm_policy.get('sleep_minutes_on_exhaustion') # Loop until a profile is found or we decide we can't find one while True: # Try to find an existing, available profile for i in range(self.state['profile_pool_size']): profile_name = f"{prefix}_{run_suffix}_{i}" if run_suffix else f"{prefix}_{i}" # Check if sleeping sleep_until = self.state['profile_cooldown_sleep_until'].get(profile_name, 0) if now < sleep_until: continue # Still sleeping # Check if it needs to be put to sleep req_count = self.state['profile_cooldown_counts'].get(profile_name, 0) if max_reqs and req_count >= max_reqs: sleep_duration_seconds = (sleep_mins or 0) * 60 self.state['profile_cooldown_sleep_until'][profile_name] = now + sleep_duration_seconds self.state['profile_cooldown_counts'][profile_name] = 0 # Reset count for next time logger.info(f"Profile '{profile_name}' reached request limit ({req_count}/{max_reqs}). 
Putting to sleep for {sleep_mins} minutes.") continue # Now sleeping, try next profile # This profile is available logger.info(f"Selected available profile '{profile_name}' (request count: {req_count}/{max_reqs if max_reqs else 'unlimited'}).") return profile_name # If we get here, no existing profile was available if pm_policy.get('auto_expand_pool'): new_profile_index = self.state['profile_pool_size'] self.state['profile_pool_size'] += 1 profile_name = f"{prefix}_{run_suffix}_{new_profile_index}" if run_suffix else f"{prefix}_{new_profile_index}" logger.info(f"Profile pool exhausted. Expanding pool to size {self.state['profile_pool_size']}. New profile: '{profile_name}'") return profile_name else: # No available profiles and pool expansion is disabled return None def get_or_rotate_worker_profile(self, worker_id, policy): """ Gets the current profile for a worker, rotating to a new generation if the lifetime limit is met. This is used by the 'per_worker_with_rotation' profile mode. """ with self.lock: pm_policy = policy.get('settings', {}).get('profile_management', {}) if not pm_policy: logger.error("Profile mode 'per_worker_with_rotation' requires 'settings.profile_management' configuration in the policy.") return f"error_profile_{worker_id}" prefix = pm_policy.get('prefix') if not prefix: logger.error("Profile management for 'per_worker_with_rotation' requires a 'prefix'.") return f"error_profile_{worker_id}" max_reqs = pm_policy.get('max_requests_per_profile') generations = self.state.setdefault('worker_profile_generations', {}) # worker_id is an int, but JSON keys must be strings worker_id_str = str(worker_id) current_gen = generations.get(worker_id_str, 0) profile_name = f"{prefix}_{worker_id}_{current_gen}" if not max_reqs: # No lifetime limit defined, so never rotate. return profile_name req_count = self.state.get('profile_cooldown_counts', {}).get(profile_name, 0) if req_count >= max_reqs: logger.info(f"Profile '{profile_name}' reached lifetime request limit ({req_count}/{max_reqs}). Rotating to new generation for worker {worker_id}.") new_gen = current_gen + 1 generations[worker_id_str] = new_gen # The request counts for the old profile are implicitly left behind. # The new profile will start with a count of 0. profile_name = f"{prefix}_{worker_id}_{new_gen}" return profile_name def record_profile_request(self, profile_name): """Increments the request counter for a profile for the cooldown policy.""" with self.lock: if not profile_name: return counts = self.state.setdefault('profile_cooldown_counts', {}) counts[profile_name] = counts.get(profile_name, 0) + 1 def record_proxy_usage(self, proxy_url): """Records a request timestamp for a given proxy URL for statistical purposes.""" if not proxy_url: return with self.lock: now = time.time() # Use a prefix to avoid collisions with profile names or other keys tracker_key = f"proxy_{proxy_url}" self.state['rate_limit_trackers'].setdefault(tracker_key, []).append(now) def check_and_update_download_rate_limit(self, proxy_url, policy): """Checks download rate limits. 
Returns True if allowed, False otherwise.""" with self.lock: now = time.time() d_policy = policy.get('download_policy', {}) rate_limits = d_policy.get('rate_limits', {}) # Check per-IP limit ip_limit = rate_limits.get('per_ip') if ip_limit: tracker_key = 'download_per_ip' # Use a distinct key max_req = ip_limit.get('max_requests') period_min = ip_limit.get('per_minutes') if max_req and period_min: timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) timestamps = [ts for ts in timestamps if now - ts < period_min * 60] if len(timestamps) >= max_req: logger.warning("Per-IP download rate limit reached. Skipping task.") return False self.state['rate_limit_trackers'][tracker_key] = timestamps # Check per-proxy limit proxy_limit = rate_limits.get('per_proxy') if proxy_limit and proxy_url: tracker_key = f"download_proxy_{proxy_url}" max_req = proxy_limit.get('max_requests') period_min = proxy_limit.get('per_minutes') if max_req and period_min: timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) timestamps = [ts for ts in timestamps if now - ts < period_min * 60] if len(timestamps) >= max_req: logger.warning(f"Per-proxy download rate limit for '{proxy_url}' reached. Skipping task.") return False self.state['rate_limit_trackers'][tracker_key] = timestamps # If all checks pass, record the new request timestamp for all relevant trackers if ip_limit and ip_limit.get('max_requests'): self.state['rate_limit_trackers'].setdefault('download_per_ip', []).append(now) if proxy_limit and proxy_limit.get('max_requests') and proxy_url: self.state['rate_limit_trackers'].setdefault(f"download_proxy_{proxy_url}", []).append(now) return True def wait_for_proxy_cooldown(self, proxy_url, policy): """If a per-proxy sleep is defined, wait until the cooldown period has passed.""" with self.lock: d_policy = policy.get('download_policy', {}) sleep_duration = d_policy.get('sleep_per_proxy_seconds', 0) if not proxy_url or not sleep_duration > 0: return last_finish = self.state.setdefault('proxy_last_finish_time', {}).get(proxy_url, 0) elapsed = time.time() - last_finish if elapsed < sleep_duration: time_to_sleep = sleep_duration - elapsed logger.info(f"Proxy '{proxy_url}' was used recently. 
Sleeping for {time_to_sleep:.2f}s.") # Interruptible sleep sleep_end_time = time.time() + time_to_sleep while time.time() < sleep_end_time: if shutdown_event.is_set(): logger.info("Shutdown requested during proxy cooldown sleep.") break time.sleep(0.2) def update_proxy_finish_time(self, proxy_url): """Updates the last finish time for a proxy.""" with self.lock: if not proxy_url: return self.state.setdefault('proxy_last_finish_time', {})[proxy_url] = time.time() def print_summary(self, policy=None): """Print a summary of the test run.""" with self.lock: # --- Cumulative Stats from State --- now = time.time() rate_trackers = self.state.get('rate_limit_trackers', {}) if rate_trackers: logger.info("\n--- Cumulative Rate Summary (All Runs, updated at end of run) ---") logger.info("This shows the total number of requests/downloads over various time windows, including previous runs.") fetch_trackers = {k: v for k, v in rate_trackers.items() if not k.startswith('download_')} download_trackers = {k: v for k, v in rate_trackers.items() if k.startswith('download_')} def print_tracker_stats(trackers, tracker_type): if not trackers: logger.info(f"No historical {tracker_type} trackers found.") return logger.info(f"Historical {tracker_type} Trackers:") for key, timestamps in sorted(trackers.items()): windows = { 'last 10 min': 600, 'last 60 min': 3600, 'last 6 hours': 21600, 'last 24 hours': 86400 } rates_str_parts = [] for name, seconds in windows.items(): count = sum(1 for ts in timestamps if now - ts <= seconds) rate_rpm = (count / seconds) * 60 if seconds > 0 else 0 rates_str_parts.append(f"{count} in {name} ({rate_rpm:.2f}/min)") # Clean up key for display display_key = key.replace('download_', '').replace('per_ip', 'all_proxies/ips') logger.info(f" - Tracker '{display_key}': " + ", ".join(rates_str_parts)) print_tracker_stats(fetch_trackers, "Fetch Request") print_tracker_stats(download_trackers, "Download Attempt") if not self.events: logger.info("\nNo new events were recorded in this session.") return duration = time.time() - self.start_time fetch_events = [e for e in self.events if e.get('type') == 'fetch'] download_events = [e for e in self.events if e.get('type') != 'fetch'] logger.info("\n--- Test Summary (This Run) ---") logger.info(f"Total duration: {duration:.2f} seconds") logger.info(f"Total info.json requests (cumulative): {self.get_request_count()}") if policy: logger.info("\n--- Test Configuration ---") settings = policy.get('settings', {}) d_policy = policy.get('download_policy', {}) if settings.get('urls_file'): logger.info(f"URL source file: {settings['urls_file']}") if settings.get('info_json_dir'): logger.info(f"Info.json source dir: {settings['info_json_dir']}") if d_policy: logger.info(f"Download formats: {d_policy.get('formats', 'N/A')}") if d_policy.get('downloader'): logger.info(f"Downloader: {d_policy.get('downloader')}") if d_policy.get('downloader_args'): logger.info(f"Downloader args: {d_policy.get('downloader_args')}") if d_policy.get('pause_before_download_seconds'): logger.info(f"Pause before download: {d_policy.get('pause_before_download_seconds')}s") if d_policy.get('sleep_between_formats'): sleep_cfg = d_policy.get('sleep_between_formats') logger.info(f"Sleep between formats: {sleep_cfg.get('min_seconds', 0)}-{sleep_cfg.get('max_seconds', 0)}s") if fetch_events: total_fetches = len(fetch_events) successful_fetches = sum(1 for e in fetch_events if e['success']) cancelled_fetches = sum(1 for e in fetch_events if e.get('error_type') == 'Cancelled') failed_fetches = 
total_fetches - successful_fetches - cancelled_fetches logger.info("\n--- Fetch Summary (This Run) ---") logger.info(f"Total info.json fetch attempts: {total_fetches}") logger.info(f" - Successful: {successful_fetches}") logger.info(f" - Failed: {failed_fetches}") if cancelled_fetches > 0: logger.info(f" - Cancelled: {cancelled_fetches}") completed_fetches = successful_fetches + failed_fetches if completed_fetches > 0: success_rate = (successful_fetches / completed_fetches) * 100 logger.info(f"Success rate (of completed): {success_rate:.2f}%") elif total_fetches > 0: logger.info("Success rate: N/A (no tasks completed)") if duration > 1 and total_fetches > 0: rpm = (total_fetches / duration) * 60 logger.info(f"Actual fetch rate: {rpm:.2f} requests/minute") if failed_fetches > 0: error_counts = collections.Counter( e.get('error_type', 'Unknown') for e in fetch_events if not e['success'] and e.get('error_type') != 'Cancelled' ) logger.info("Failure breakdown:") for error_type, count in sorted(error_counts.items()): logger.info(f" - {error_type}: {count}") profile_counts = collections.Counter(e.get('profile') for e in fetch_events if e.get('profile')) if profile_counts: logger.info("Requests per profile:") for profile, count in sorted(profile_counts.items()): logger.info(f" - {profile}: {count}") proxy_counts = collections.Counter(e.get('proxy_url') for e in fetch_events if e.get('proxy_url')) if proxy_counts: logger.info("Requests per proxy:") for proxy, count in sorted(proxy_counts.items()): logger.info(f" - {proxy}: {count}") if download_events: total_attempts = len(download_events) successes = sum(1 for e in download_events if e['success']) cancelled = sum(1 for e in download_events if e.get('error_type') == 'Cancelled') failures = total_attempts - successes - cancelled # --- Profile Association for Download Events --- download_profiles = [e.get('profile') for e in download_events] # For download_only mode, we might need to fall back to regex extraction # if the profile wasn't passed down (e.g., no profile grouping). 
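# Illustrative example (filename and regex are assumptions modelled on the
# 'profile_extraction_regex' setting documented in the epilog): the fallback
# below uses the regex's first capture group as the profile name, e.g.
# pulling 'tv_user_1' out of a '<video id>-<profile>.json' filename.
def _profile_regex_example():
    sample = Path('dQw4w9WgXcQ-tv_user_1.json')
    match = re.search(r'-([^-]+)\.json$', sample.name)
    return match.group(1) if match else None  # -> 'tv_user_1'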
profile_regex = None if policy: settings = policy.get('settings', {}) if settings.get('mode') == 'download_only': profile_regex = settings.get('profile_extraction_regex') if profile_regex: for i, e in enumerate(download_events): if not download_profiles[i]: # If profile wasn't set in the event path = Path(e.get('path', '')) match = re.search(profile_regex, path.name) if match and match.groups(): download_profiles[i] = match.group(1) # Replace any remaining Nones with 'unknown_profile' download_profiles = [p or 'unknown_profile' for p in download_profiles] num_profiles_used = len(set(p for p in download_profiles if p != 'unknown_profile')) logger.info("\n--- Download Summary (This Run) ---") if policy: workers = policy.get('execution_control', {}).get('workers', 'N/A') logger.info(f"Workers configured: {workers}") logger.info(f"Profiles utilized for downloads: {num_profiles_used}") logger.info(f"Total download attempts: {total_attempts}") logger.info(f" - Successful: {successes}") logger.info(f" - Failed: {failures}") if cancelled > 0: logger.info(f" - Cancelled: {cancelled}") completed_downloads = successes + failures if completed_downloads > 0: success_rate = (successes / completed_downloads) * 100 logger.info(f"Success rate (of completed): {success_rate:.2f}%") elif total_attempts > 0: logger.info("Success rate: N/A (no tasks completed)") duration_hours = duration / 3600.0 if duration > 1 and total_attempts > 0: dpm = (total_attempts / duration) * 60 logger.info(f"Actual overall download rate: {dpm:.2f} attempts/minute") total_bytes = sum(e.get('downloaded_bytes', 0) for e in download_events if e['success']) if total_bytes > 0: logger.info(f"Total data downloaded: {format_size(total_bytes)}") if failures > 0: error_counts = collections.Counter( e.get('error_type', 'Unknown') for e in download_events if not e['success'] and e.get('error_type') != 'Cancelled' ) logger.info("Failure breakdown:") for error_type, count in sorted(error_counts.items()): logger.info(f" - {error_type}: {count}") # Add profile to each download event for easier counting for i, e in enumerate(download_events): e['profile'] = download_profiles[i] profile_counts = collections.Counter(e.get('profile') for e in download_events if e.get('profile')) if profile_counts: logger.info("Downloads per profile:") for profile, count in sorted(profile_counts.items()): rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0 logger.info(f" - {profile}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)") proxy_counts = collections.Counter(e.get('proxy_url') for e in download_events if e.get('proxy_url')) if proxy_counts: logger.info("Downloads per proxy:") for proxy, count in sorted(proxy_counts.items()): rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0 logger.info(f" - {proxy}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)") logger.info("--------------------") def _run_download_logic(source, info_json_content, policy, state_manager, profile_name=None): """Shared download logic for a single info.json.""" proxy_url = None if info_json_content: try: info_data = json.loads(info_json_content) proxy_url = info_data.get('_proxy_url') except (json.JSONDecodeError, AttributeError): logger.warning(f"[{get_display_name(source)}] Could not parse info.json to get proxy for download controls.") if not state_manager.check_and_update_download_rate_limit(proxy_url, policy): return [] state_manager.wait_for_proxy_cooldown(proxy_url, policy) results = process_info_json_cycle(source, 
info_json_content, policy, state_manager, proxy_url=proxy_url, profile_name=profile_name) state_manager.update_proxy_finish_time(proxy_url) return results def process_profile_task(profile_name, file_list, policy, state_manager, cycle_num): """Worker task for a profile, processing its files sequentially.""" logger.info(f"Worker {get_worker_id()} starting task for profile '{profile_name}' with {len(file_list)} files.") all_results = [] for i, file_path in enumerate(file_list): if shutdown_event.is_set(): logger.info(f"Shutdown requested, stopping task for profile '{profile_name}'.") break try: with open(file_path, 'r', encoding='utf-8') as f: info_json_content = f.read() except (IOError, FileNotFoundError) as e: logger.error(f"[{get_display_name(file_path)}] Could not read info.json file: {e}") continue # Skip this file results_for_file = _run_download_logic(file_path, info_json_content, policy, state_manager, profile_name=profile_name) all_results.extend(results_for_file) # Check for stop conditions after processing each file should_stop_profile = False for result in results_for_file: if not result['success']: s_conditions = policy.get('stop_conditions', {}) if s_conditions.get('on_failure') or \ (s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \ (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): logger.info(f"Stopping further processing for profile '{profile_name}' due to failure.") should_stop_profile = True break if should_stop_profile: break # Apply sleep between tasks for this profile if i < len(file_list) - 1: exec_control = policy.get('execution_control', {}) sleep_cfg = exec_control.get('sleep_between_tasks', {}) sleep_min = sleep_cfg.get('min_seconds', 0) if sleep_min > 0: sleep_max = sleep_cfg.get('max_seconds') or sleep_min sleep_duration = random.uniform(sleep_min, sleep_max) if sleep_max > sleep_min else sleep_min logger.debug(f"Profile '{profile_name}' sleeping for {sleep_duration:.2f}s before next file.") # Interruptible sleep sleep_end_time = time.time() + sleep_duration while time.time() < sleep_end_time: if shutdown_event.is_set(): break time.sleep(0.2) return all_results def run_command(cmd, input_data=None, binary_stdout=False): """ Runs a command, captures its output, and returns status. If binary_stdout is True, stdout is returned as bytes. Otherwise, both are decoded strings. """ logger.debug(f"Running command: {' '.join(cmd)}") process = None try: # Always open in binary mode to handle both cases. We will decode later. process = subprocess.Popen( cmd, stdin=subprocess.PIPE if input_data else None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=os.setsid # Start in a new process group to isolate from terminal signals ) with process_lock: running_processes.add(process) stdout_capture = [] stderr_capture = [] def read_pipe(pipe, capture_list, display_pipe=None): """Reads a pipe line by line (as bytes), appending to a list and optionally displaying.""" for line in iter(pipe.readline, b''): capture_list.append(line) if display_pipe: # Decode for display display_line = line.decode('utf-8', errors='replace') display_pipe.write(display_line) display_pipe.flush() # We must read stdout and stderr in parallel to prevent deadlocks. stdout_thread = threading.Thread(target=read_pipe, args=(process.stdout, stdout_capture)) # Display stderr in real-time as it often contains progress info. 
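# Illustrative sketch (helper name is an assumption, not part of the original
# flow): because each child process is started in its own process group via
# preexec_fn=os.setsid above, a shutdown path walking running_processes can
# signal the whole group instead of only the direct child.
def _terminate_process_group(process, sig=signal.SIGTERM):
    try:
        os.killpg(os.getpgid(process.pid), sig)
    except (ProcessLookupError, PermissionError):
        pass  # process already exited or its group is gone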
stderr_thread = threading.Thread(target=read_pipe, args=(process.stderr, stderr_capture, sys.stderr)) stdout_thread.start() stderr_thread.start() # Handle stdin after starting to read outputs to avoid deadlocks. if input_data: try: process.stdin.write(input_data.encode('utf-8')) process.stdin.close() except (IOError, BrokenPipeError): # This can happen if the process exits quickly or doesn't read stdin. logger.debug(f"Could not write to stdin for command: {' '.join(cmd)}. Process may have already exited.") # Wait for the process to finish and for all output to be read. retcode = process.wait() stdout_thread.join() stderr_thread.join() stdout_bytes = b"".join(stdout_capture) stderr_bytes = b"".join(stderr_capture) stdout = stdout_bytes if binary_stdout else stdout_bytes.decode('utf-8', errors='replace') stderr = stderr_bytes.decode('utf-8', errors='replace') return retcode, stdout, stderr except FileNotFoundError: logger.error(f"Command not found: {cmd[0]}. Make sure it's in your PATH.") return -1, "", f"Command not found: {cmd[0]}" except Exception as e: logger.error(f"An error occurred while running command: {' '.join(cmd)}. Error: {e}") return -1, "", str(e) finally: if process: with process_lock: running_processes.discard(process) def run_download_worker(info_json_path, info_json_content, format_to_download, policy, profile_name=None): """ Performs a single download attempt. Designed to be run in a worker thread. """ download_policy = policy.get('download_policy', {}) settings = policy.get('settings', {}) downloader = download_policy.get('downloader') # Get script command from settings, with fallback to download_policy for old format. script_cmd_str = settings.get('download_script') if not script_cmd_str: script_cmd_str = download_policy.get('script') if script_cmd_str: download_cmd = shlex.split(script_cmd_str) elif downloader == 'aria2c_rpc': download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'aria-rpc'] elif downloader == 'native-cli': download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'cli'] else: # Default to the new native-py downloader if downloader is 'native-py' or not specified. download_cmd = [sys.executable, '-m', 'ytops_client.cli', 'download', 'py'] download_cmd.extend(['-f', format_to_download]) if downloader == 'aria2c_rpc': if download_policy.get('aria_host'): download_cmd.extend(['--aria-host', str(download_policy['aria_host'])]) if download_policy.get('aria_port'): download_cmd.extend(['--aria-port', str(download_policy['aria_port'])]) if download_policy.get('aria_secret'): download_cmd.extend(['--aria-secret', str(download_policy['aria_secret'])]) if download_policy.get('output_dir'): download_cmd.extend(['--output-dir', str(download_policy['output_dir'])]) if download_policy.get('aria_remote_dir'): download_cmd.extend(['--remote-dir', str(download_policy['aria_remote_dir'])]) if download_policy.get('aria_fragments_dir'): download_cmd.extend(['--fragments-dir', str(download_policy['aria_fragments_dir'])]) # For stress testing, waiting is the desired default to get a success/fail result. # Allow disabling it by explicitly setting aria_wait: false in the policy. 
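# Illustrative policy fragment (host, port and secret are placeholders taken
# from the CLI examples in the epilog): the aria2c_rpc branch above and below
# reads keys of this shape from download_policy when building the command.
_EXAMPLE_ARIA_RPC_DOWNLOAD_POLICY = {
    'downloader': 'aria2c_rpc',
    'aria_host': '192.168.1.100',
    'aria_port': 6801,
    'aria_secret': 'change-me',   # placeholder
    'aria_wait': True,            # stress-testing default: wait for a pass/fail result
    'output_dir': 'downloads',
}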
if download_policy.get('aria_wait', True): download_cmd.append('--wait') if download_policy.get('auto_merge_fragments'): download_cmd.append('--auto-merge-fragments') if download_policy.get('remove_fragments_after_merge'): download_cmd.append('--remove-fragments-after-merge') if download_policy.get('cleanup'): download_cmd.append('--cleanup') if download_policy.get('purge_on_complete'): download_cmd.append('--purge-on-complete') downloader_args = download_policy.get('downloader_args') proxy = download_policy.get('proxy') if proxy: # Note: proxy_rename is not supported for aria2c_rpc mode. proxy_arg = f"--all-proxy {shlex.quote(str(proxy))}" if downloader_args: downloader_args = f"{downloader_args} {proxy_arg}" else: downloader_args = proxy_arg if downloader_args: # For aria2c_rpc, the downloader_args value is passed directly to the script's --downloader-args option. download_cmd.extend(['--downloader-args', downloader_args]) elif downloader == 'native-cli': # This is the logic for the legacy download_tool.py (yt-dlp CLI wrapper). pause_seconds = download_policy.get('pause_before_download_seconds') if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0: download_cmd.extend(['--pause', str(pause_seconds)]) if download_policy.get('continue_downloads'): download_cmd.append('--download-continue') # Add proxy if specified directly in the policy proxy = download_policy.get('proxy') if proxy: download_cmd.extend(['--proxy', str(proxy)]) proxy_rename = download_policy.get('proxy_rename') if proxy_rename: download_cmd.extend(['--proxy-rename', str(proxy_rename)]) extra_args = download_policy.get('extra_args') if extra_args: download_cmd.extend(shlex.split(extra_args)) # Note: 'downloader' here refers to yt-dlp's internal downloader, not our script. # The policy key 'external_downloader' is more clear, but we support 'downloader' for backward compatibility. ext_downloader = download_policy.get('external_downloader') or download_policy.get('downloader') if ext_downloader and ext_downloader not in ['native-cli', 'native-py', 'aria2c_rpc']: download_cmd.extend(['--downloader', str(ext_downloader)]) downloader_args = download_policy.get('downloader_args') if downloader_args: download_cmd.extend(['--downloader-args', str(downloader_args)]) if download_policy.get('merge_output_format'): download_cmd.extend(['--merge-output-format', str(download_policy['merge_output_format'])]) if download_policy.get('merge_output_format'): download_cmd.extend(['--merge-output-format', str(download_policy['merge_output_format'])]) if download_policy.get('cleanup'): download_cmd.append('--cleanup') else: # This is the default logic for the new native-py downloader. if download_policy.get('output_to_buffer'): download_cmd.append('--output-buffer') else: # --output-dir is only relevant if not outputting to buffer. 
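# For orientation (assembled from the branches above and below; the flags
# belong to this project's own 'ytops_client.cli download py' tool and the
# concrete values are illustrative), a buffered native-py download ends up
# running roughly:
#   python -m ytops_client.cli download py -f 140 --output-buffer \
#       --proxy socks5://127.0.0.1:1080 --load-info-json /tmp/tmpXXXX.json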
if download_policy.get('output_dir'): download_cmd.extend(['--output-dir', str(download_policy['output_dir'])]) if download_policy.get('temp_path'): download_cmd.extend(['--temp-path', str(download_policy['temp_path'])]) if download_policy.get('continue_downloads'): download_cmd.append('--download-continue') pause_seconds = download_policy.get('pause_before_download_seconds') if pause_seconds and isinstance(pause_seconds, (int, float)) and pause_seconds > 0: download_cmd.extend(['--pause', str(pause_seconds)]) proxy = download_policy.get('proxy') if proxy: download_cmd.extend(['--proxy', str(proxy)]) proxy_rename = download_policy.get('proxy_rename') if proxy_rename: download_cmd.extend(['--proxy-rename', str(proxy_rename)]) extra_args = download_policy.get('extra_args') if extra_args: download_cmd.extend(['--extra-ytdlp-args', str(extra_args)]) # Pass through downloader settings for yt-dlp to use # e.g. to tell yt-dlp to use aria2c as its backend ext_downloader = download_policy.get('external_downloader') if ext_downloader: download_cmd.extend(['--downloader', str(ext_downloader)]) downloader_args = download_policy.get('downloader_args') if downloader_args: download_cmd.extend(['--downloader-args', str(downloader_args)]) worker_id = get_worker_id() display_name = get_display_name(info_json_path) profile_log_part = f" [Profile: {profile_name}]" if profile_name else "" log_prefix = f"[Worker {worker_id}]{profile_log_part} [{display_name} @ {format_to_download}]" logger.info(f"{log_prefix} Kicking off download process...") temp_info_file_path = None try: if isinstance(info_json_path, Path) and info_json_path.exists(): # The info.json is already in a file, pass its path directly. download_cmd.extend(['--load-info-json', str(info_json_path)]) else: # The info.json content is in memory, so write it to a temporary file. import tempfile with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', encoding='utf-8') as temp_f: temp_f.write(info_json_content) temp_info_file_path = temp_f.name download_cmd.extend(['--load-info-json', temp_info_file_path]) cmd_str_for_log = ' '.join(shlex.quote(s) for s in download_cmd) logger.info(f"{log_prefix} Running download command: {cmd_str_for_log}") output_to_buffer = download_policy.get('output_to_buffer', False) retcode, stdout, stderr = run_command(download_cmd, binary_stdout=output_to_buffer) finally: if temp_info_file_path and os.path.exists(temp_info_file_path): os.unlink(temp_info_file_path) is_403_error = "HTTP Error 403" in stderr is_timeout_error = "Read timed out" in stderr output_to_buffer = download_policy.get('output_to_buffer', False) result = { 'type': 'download', 'path': str(info_json_path), 'format': format_to_download, 'success': retcode == 0, 'error_type': None, 'details': '', 'downloaded_bytes': 0, 'profile': profile_name } if retcode == 0: details_str = "OK" size_in_bytes = 0 if output_to_buffer: # The most accurate size is the length of the stdout buffer. 
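# Illustrative check (the sample line is an assumption modelled on yt-dlp's
# final progress output): when output goes to disk rather than the buffer, the
# branch below recovers the downloaded size by parsing stderr with this regex.
def _parse_reported_size_example():
    sample = '[download] 100% of ~12.34MiB in 00:05'
    m = re.search(r'\[download\]\s+100%\s+of\s+~?([0-9.]+)(B|KiB|MiB|GiB)', sample)
    multipliers = {'B': 1, 'KiB': 1024, 'MiB': 1024**2, 'GiB': 1024**3}
    return int(float(m.group(1)) * multipliers[m.group(2)]) if m else 0  # -> 12939427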
size_in_bytes = len(stdout) # stdout is bytes details_str += f" (Buffered {format_size(size_in_bytes)})" else: size_match = re.search(r'\[download\]\s+100%\s+of\s+~?([0-9.]+)(B|KiB|MiB|GiB)', stderr) if size_match: value = float(size_match.group(1)) unit = size_match.group(2) multipliers = {"B": 1, "KiB": 1024, "MiB": 1024**2, "GiB": 1024**3} size_in_bytes = int(value * multipliers.get(unit, 1)) details_str += f" ({size_match.group(1)}{unit})" result['downloaded_bytes'] = size_in_bytes result['details'] = details_str else: # Check both stdout and stderr for error messages, as logging might be directed to stdout. full_output = f"{stdout}\n{stderr}" error_lines = [line for line in full_output.strip().split('\n') if 'ERROR:' in line] result['details'] = error_lines[-1].strip() if error_lines else "Unknown error" if is_403_error: result['error_type'] = 'HTTP 403' elif is_timeout_error: result['error_type'] = 'Timeout' else: result['error_type'] = f'Exit Code {retcode}' return result def process_info_json_cycle(path, content, policy, state_manager, proxy_url=None, profile_name=None): """ Processes one info.json file for one cycle, downloading selected formats. """ results = [] display_name = get_display_name(path) d_policy = policy.get('download_policy', {}) s_conditions = policy.get('stop_conditions', {}) format_selection = d_policy.get('formats', '') try: info_data = json.loads(content) available_formats = [f['format_id'] for f in info_data.get('formats', [])] if not available_formats: logger.warning(f"[{display_name}] No formats found in info.json. Skipping.") return [] formats_to_test = [] if format_selection == 'all': formats_to_test = available_formats elif format_selection.startswith('random:'): percent = float(format_selection.split(':')[1].rstrip('%')) count = max(1, int(len(available_formats) * (percent / 100.0))) formats_to_test = random.sample(available_formats, k=count) elif format_selection.startswith('random_from:'): choices = [f.strip() for f in format_selection.split(':', 1)[1].split(',')] valid_choices = [f for f in choices if f in available_formats] if valid_choices: formats_to_test = [random.choice(valid_choices)] else: requested_formats = [f.strip() for f in format_selection.split(',') if f.strip()] formats_to_test = [] for req_fmt in requested_formats: # Check for exact match first if req_fmt in available_formats: formats_to_test.append(req_fmt) continue # If no exact match, check for formats that start with this ID + '-' # e.g., req_fmt '140' should match '140-0' prefix_match = f"{req_fmt}-" first_match = next((af for af in available_formats if af.startswith(prefix_match)), None) if first_match: logger.info(f"[{display_name}] Requested format '{req_fmt}' not found. Using first available match: '{first_match}'.") formats_to_test.append(first_match) else: # This could be a complex selector like 'bestvideo' or '299/298', so keep it. if req_fmt not in available_formats: logger.warning(f"[{display_name}] Requested format '{req_fmt}' not found in available formats.") formats_to_test.append(req_fmt) except json.JSONDecodeError: logger.error(f"[{display_name}] Failed to parse info.json. 
Skipping.") return [] for i, format_id in enumerate(formats_to_test): if shutdown_event.is_set(): logger.info(f"Shutdown requested, stopping further format tests for {display_name}.") break # Check if the format URL is expired before attempting to download format_details = next((f for f in info_data.get('formats', []) if f.get('format_id') == format_id), None) if format_details and 'url' in format_details: parsed_url = urlparse(format_details['url']) query_params = parse_qs(parsed_url.query) expire_ts_str = query_params.get('expire', [None])[0] if expire_ts_str and expire_ts_str.isdigit(): expire_ts = int(expire_ts_str) if expire_ts < time.time(): logger.warning(f"[{display_name}] Skipping format '{format_id}' because its URL is expired.") result = { 'type': 'download', 'path': str(path), 'format': format_id, 'success': True, 'error_type': 'Skipped', 'details': 'Download URL is expired', 'downloaded_bytes': 0 } if proxy_url: result['proxy_url'] = proxy_url state_manager.log_event(result) results.append(result) continue # Move to the next format result = run_download_worker(path, content, format_id, policy, profile_name=profile_name) if proxy_url: result['proxy_url'] = proxy_url state_manager.log_event(result) results.append(result) worker_id = get_worker_id() status = "SUCCESS" if result['success'] else f"FAILURE ({result['error_type']})" profile_log_part = f" [Profile: {profile_name}]" if profile_name else "" logger.info(f"[Worker {worker_id}]{profile_log_part} Result for {display_name} (format {format_id}): {status} - {result.get('details', 'OK')}") if not result['success']: if s_conditions.get('on_failure') or \ (s_conditions.get('on_http_403') and result['error_type'] == 'HTTP 403') or \ (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): logger.info(f"Stopping further format tests for {display_name} in this cycle due to failure.") break sleep_cfg = d_policy.get('sleep_between_formats', {}) sleep_min = sleep_cfg.get('min_seconds', 0) if sleep_min > 0 and i < len(formats_to_test) - 1: sleep_max = sleep_cfg.get('max_seconds') or sleep_min if sleep_max > sleep_min: sleep_duration = random.uniform(sleep_min, sleep_max) else: sleep_duration = sleep_min logger.debug(f"Sleeping for {sleep_duration:.2f}s between formats for {display_name}.") # Interruptible sleep sleep_end_time = time.time() + sleep_duration while time.time() < sleep_end_time: if shutdown_event.is_set(): break time.sleep(0.2) return results def update_dict(d, u): """Recursively update a dictionary.""" for k, v in u.items(): if isinstance(v, collections.abc.Mapping): d[k] = update_dict(d.get(k, {}), v) else: d[k] = v return d def load_policy(policy_file, policy_name=None): """Load a policy from a YAML file.""" try: with open(policy_file, 'r', encoding='utf-8') as f: # If a policy name is given, look for that specific document if policy_name: docs = list(yaml.safe_load_all(f)) for doc in docs: if isinstance(doc, dict) and doc.get('name') == policy_name: return doc raise ValueError(f"Policy '{policy_name}' not found in {policy_file}") # Otherwise, load the first document return yaml.safe_load(f) except (IOError, yaml.YAMLError, ValueError) as e: logger.error(f"Failed to load policy file {policy_file}: {e}") sys.exit(1) def apply_overrides(policy, overrides): """Apply command-line overrides to the policy.""" for override in overrides: try: key, value = override.split('=', 1) keys = key.split('.') # Try to parse as JSON/YAML if it looks like a list or dict, otherwise treat as scalar if (value.startswith('[') and 
value.endswith(']')) or \ (value.startswith('{') and value.endswith('}')): try: value = yaml.safe_load(value) except yaml.YAMLError: logger.warning(f"Could not parse override value '{value}' as YAML. Treating as a string.") else: # Try to auto-convert scalar value type if value.lower() == 'true': value = True elif value.lower() == 'false': value = False elif value.lower() == 'null': value = None else: try: value = int(value) except ValueError: try: value = float(value) except ValueError: pass # Keep as string d = policy for k in keys[:-1]: d = d.setdefault(k, {}) d[keys[-1]] = value except ValueError: logger.error(f"Invalid override format: '{override}'. Use 'key.subkey=value'.") sys.exit(1) return policy def display_effective_policy(policy, name, sources=None, profile_names=None, original_workers_setting=None): """Prints a human-readable summary of the effective policy.""" logger.info(f"--- Effective Policy: {name} ---") settings = policy.get('settings', {}) exec_control = policy.get('execution_control', {}) logger.info(f"Mode: {settings.get('mode', 'full_stack')}") if profile_names: num_profiles = len(profile_names) logger.info(f"Profiles found: {num_profiles}") if num_profiles > 0: # Sort profiles for consistent display, show top 10 sorted_profiles = sorted(profile_names) profiles_to_show = sorted_profiles[:10] logger.info(f" (e.g., {', '.join(profiles_to_show)}{'...' if num_profiles > 10 else ''})") workers_display = str(exec_control.get('workers', 1)) if original_workers_setting == 'auto': workers_display = f"auto (calculated: {workers_display})" logger.info(f"Workers: {workers_display}") sleep_cfg = exec_control.get('sleep_between_tasks', {}) sleep_min = sleep_cfg.get('min_seconds') if sleep_min is not None: sleep_max = sleep_cfg.get('max_seconds') or sleep_min if sleep_max > sleep_min: logger.info(f"Sleep between tasks (per worker): {sleep_min}-{sleep_max}s (random)") else: logger.info(f"Sleep between tasks (per worker): {sleep_min}s") run_until = exec_control.get('run_until', {}) run_conditions = [] if 'minutes' in run_until: run_conditions.append(f"for {run_until['minutes']} minutes") if 'requests' in run_until: run_conditions.append(f"until {run_until['requests']} total requests") if 'cycles' in run_until: run_conditions.append(f"for {run_until['cycles']} cycles") if run_conditions: logger.info(f"Run condition: Stop after running {' or '.join(run_conditions)}.") if 'minutes' in run_until and 'cycles' not in run_until: logger.info("Will continuously cycle through sources until time limit is reached.") else: logger.warning("WARNING: No 'run_until' condition is set. This test will run forever unless stopped manually.") logger.info("Run condition: No stop condition defined, will run indefinitely (until Ctrl+C).") # --- Rate Calculation --- if sources: workers = exec_control.get('workers', 1) num_sources = len(profile_names) if profile_names else len(sources) min_sleep = sleep_cfg.get('min_seconds', 0) max_sleep = sleep_cfg.get('max_seconds') or min_sleep avg_sleep_per_task = (min_sleep + max_sleep) / 2 # Assume an average task duration. This is a major assumption. mode = settings.get('mode', 'full_stack') assumptions = exec_control.get('assumptions', {}) assumed_fetch_duration = 0 if mode in ['full_stack', 'fetch_only']: assumed_fetch_duration = assumptions.get('fetch_task_duration', 12 if mode == 'full_stack' else 3) assumed_download_duration = 0 if mode in ['full_stack', 'download_only']: # This assumes the total time to download all formats for a single source. 
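# Worked example of the estimate computed below (12s fetch and 60s download are
# the defaults visible here; the 10s average sleep is an assumed value): each
# worker completes 60 / (12 + 60 + 10), about 0.73 tasks per minute, so 4
# workers give roughly 2.9 tasks per minute overall.
def _rate_estimate_example(workers=4, fetch_s=12.0, download_s=60.0, avg_sleep_s=10.0):
    per_worker_per_minute = 60.0 / (fetch_s + download_s + avg_sleep_s)
    return per_worker_per_minute * workers  # ~2.93 with the defaults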
assumed_download_duration = assumptions.get('download_task_duration', 60) total_assumed_task_duration = assumed_fetch_duration + assumed_download_duration if workers > 0 and total_assumed_task_duration > 0: total_time_per_task = total_assumed_task_duration + avg_sleep_per_task tasks_per_minute_per_worker = 60 / total_time_per_task total_tasks_per_minute = tasks_per_minute_per_worker * workers logger.info("--- Rate Estimation ---") logger.info(f"Source count: {num_sources}") if mode in ['full_stack', 'fetch_only']: logger.info(f"Est. fetch time per source: {assumed_fetch_duration}s (override via execution_control.assumptions.fetch_task_duration)") if mode in ['full_stack', 'download_only']: logger.info(f"Est. download time per source: {assumed_download_duration}s (override via execution_control.assumptions.download_task_duration)") logger.info(" (Note: This assumes total time for all formats per source)") logger.info(f"Est. sleep per task: {avg_sleep_per_task:.1f}s") logger.info(f"==> Expected task rate: ~{total_tasks_per_minute:.2f} tasks/minute ({workers} workers * {tasks_per_minute_per_worker:.2f} tasks/min/worker)") target_rate_cfg = exec_control.get('target_rate', {}) target_reqs = target_rate_cfg.get('requests') target_mins = target_rate_cfg.get('per_minutes') if target_reqs and target_mins: target_rpm = target_reqs / target_mins logger.info(f"Target rate: {target_rpm:.2f} tasks/minute") if total_tasks_per_minute < target_rpm * 0.8: logger.warning("Warning: Expected rate is significantly lower than target rate.") logger.warning("Consider increasing workers, reducing sleep, or checking task performance.") logger.info("---------------------------------") time.sleep(2) # Give user time to read def add_stress_policy_parser(subparsers): """Add the parser for the 'stress-policy' command.""" parser = subparsers.add_parser( 'stress-policy', description="The primary, policy-driven stress-testing orchestrator.\nIt runs complex, multi-stage stress tests based on a YAML policy file.\nUse '--list-policies' to see available pre-configured scenarios.\n\nModes supported:\n- full_stack: Generate info.json and then download from it.\n- fetch_only: Only generate info.json files.\n- download_only: Only download from existing info.json files.", formatter_class=argparse.RawTextHelpFormatter, help='Run advanced, policy-driven stress tests (recommended).', epilog=""" Examples: 1. Fetch info.jsons for a TV client with a single profile and a rate limit: ytops-client stress-policy --policy policies/1_fetch_only_policies.yaml \\ --policy-name tv_downgraded_single_profile \\ --set settings.urls_file=my_urls.txt \\ --set execution_control.run_until.minutes=30 # This runs a 'fetch_only' test using the 'tv_downgraded' client. It uses a single, # static profile for all requests and enforces a safety limit of 450 requests per hour. 2. Fetch info.jsons for an Android client using cookies for authentication: ytops-client stress-policy --policy policies/1_fetch_only_policies.yaml \\ --policy-name android_sdkless_with_cookies \\ --set settings.urls_file=my_urls.txt \\ --set info_json_generation_policy.request_params.cookies_file_path=/path/to/my_cookies.txt # This demonstrates an authenticated 'fetch_only' test. It passes the path to a # Netscape cookie file, which the server will use for the requests. 3. 
Download from a folder of info.jsons, grouped by profile, with auto-workers: ytops-client stress-policy --policy policies/2_download_only_policies.yaml \\ --policy-name basic_profile_aware_download \\ --set settings.info_json_dir=/path/to/my/infojsons # This runs a 'download_only' test. It scans a directory, extracts profile names from # the filenames (e.g., 'tv_user_1' from '...-VIDEOID-tv_user_1.json'), and groups # them. 'workers=auto' sets the number of workers to the number of unique profiles found. 4. Full-stack test with multiple workers and profile rotation: ytops-client stress-policy --policy policies/3_full_stack_policies.yaml \\ --policy-name tv_simply_profile_rotation \\ --set settings.urls_file=my_urls.txt \\ --set execution_control.workers=4 \\ --set settings.profile_management.max_requests_per_profile=500 # This runs a 'full_stack' test with 4 parallel workers. Each worker gets a unique # profile (e.g., tv_simply_user_0_0, tv_simply_user_1_0, etc.). After a profile is # used 500 times, it is retired, and a new "generation" is created (e.g., tv_simply_user_0_1). 5. Full-stack authenticated test with a pool of profiles and corresponding cookie files: ytops-client stress-policy --policy policies/3_full_stack_policies.yaml \\ --policy-name mweb_multi_profile_with_cookies \\ --set settings.urls_file=my_urls.txt \\ --set settings.profile_management.cookie_files='["/path/c1.txt","/path/c2.txt"]' # This runs a 'full_stack' test using a pool of profiles (e.g., mweb_user_0, mweb_user_1). # It uses the 'cookie_files' list to assign a specific cookie file to each profile in the # pool, enabling multi-account authenticated testing. Note the JSON/YAML list format for the override. 6. Full-stack test submitting downloads to an aria2c RPC server: ytops-client stress-policy --policy policies/3_full_stack_policies.yaml \\ --policy-name tv_simply_profile_rotation_aria2c_rpc \\ --set settings.urls_file=my_urls.txt \\ --set download_policy.aria_host=192.168.1.100 \\ --set download_policy.aria_port=6801 # This runs a test where downloads are not performed by the worker itself, but are # sent to a remote aria2c daemon. The policy specifies 'downloader: aria2c_rpc' # and provides connection details. This is useful for offloading download traffic. -------------------------------------------------------------------------------- Overridable Policy Parameters via --set: Key Description -------------------------------------- ------------------------------------------------ [settings] settings.mode Test mode: 'full_stack', 'fetch_only', or 'download_only'. settings.urls_file Path to file with URLs/video IDs. settings.info_json_dir Path to directory with existing info.json files. settings.profile_extraction_regex For 'download_only' mode, a regex to extract profile names from info.json filenames. The first capture group is used as the profile name. E.g., '.*-(.*?).json'. This enables profile-aware sequential downloading. settings.info_json_dir_sample_percent Randomly sample this %% of files from the directory (for 'once' scan mode). settings.directory_scan_mode For 'download_only': 'once' (default) or 'continuous' to watch for new files. settings.mark_processed_files For 'continuous' scan mode: if true, rename processed files to '*..processed' to avoid reprocessing. settings.max_files_per_cycle For 'continuous' scan mode: max new files to process per cycle. settings.sleep_if_no_new_files_seconds For 'continuous' scan mode: seconds to sleep if no new files are found (default: 10). 
settings.profile_prefix (Legacy) Prefix for profile names (e.g., 'test_user'). settings.profile_pool (Legacy) Size of the profile pool. settings.profile_mode Profile strategy. 'per_request' (legacy), 'per_worker' (legacy), or 'per_worker_with_rotation' (requires profile_management). settings.info_json_script Command to run the info.json generation script (e.g., 'bin/ytops-client get-info'). settings.save_info_json_dir If set, save all successfully generated info.json files to this directory. [settings.profile_management] (New, preferred method for profile control) profile_management.prefix Prefix for profile names (e.g., 'dyn_user'). profile_management.suffix Suffix for profile names. Set to 'auto' for a timestamp, or provide a string. profile_management.initial_pool_size The number of profiles to start with. profile_management.auto_expand_pool If true, create new profiles when the initial pool is exhausted (all sleeping). profile_management.max_requests_per_profile Max requests a profile can make before it must 'sleep'. profile_management.sleep_minutes_on_exhaustion How many minutes a profile 'sleeps' after hitting its request limit. profile_management.cookie_files A list of paths to cookie files. Used to assign a unique cookie file to each profile in a pool. [execution_control] execution_control.workers Number of parallel worker threads. Set to "auto" to calculate from target_rate or number of profiles. execution_control.auto_workers_max The maximum number of workers to use when 'workers' is 'auto' in profile-aware download mode (default: 8). execution_control.target_rate.requests Target requests for 'auto' workers calculation. execution_control.target_rate.per_minutes Period in minutes for target_rate. execution_control.run_until.minutes Stop test after N minutes. Will continuously cycle through sources. execution_control.run_until.cycles Stop test after N cycles. A cycle is one full pass through all sources. execution_control.run_until.requests Stop test after N total info.json requests (cumulative across runs). execution_control.sleep_between_tasks.min_seconds Min sleep time between tasks, per worker. [info_json_generation_policy] info_json_generation_policy.client Client to use (e.g., 'mweb', 'tv_camoufox'). info_json_generation_policy.auth_host Host for the auth/Thrift service. info_json_generation_policy.auth_port Port for the auth/Thrift service. info_json_generation_policy.assigned_proxy_url A specific proxy to use for a request, overriding the server's proxy pool. info_json_generation_policy.proxy_rename Regex substitution for the assigned proxy URL (e.g., 's/old/new/'). info_json_generation_policy.command_template A full command template for the info.json script. Overrides other keys. info_json_generation_policy.rate_limits.per_ip.max_requests Max requests for the given time period from one IP. info_json_generation_policy.rate_limits.per_ip.per_minutes Time period in minutes for the per_ip rate limit. info_json_generation_policy.rate_limits.per_profile.max_requests Max requests for a single profile in a time period. info_json_generation_policy.rate_limits.per_profile.per_minutes Time period in minutes for the per_profile rate limit. info_json_generation_policy.client_rotation_policy.major_client The primary client to use for most requests. info_json_generation_policy.client_rotation_policy.refresh_client The client to use periodically to refresh context. 
info_json_generation_policy.client_rotation_policy.refresh_every.requests Trigger refresh client after N requests for a profile. [download_policy] download_policy.formats Formats to download (e.g., '18,140', 'random:50%%'). download_policy.downloader Orchestrator script to use: 'native-py' (default, Python lib), 'native-cli' (legacy CLI wrapper), or 'aria2c_rpc'. download_policy.external_downloader For 'native-py' or default, the backend yt-dlp should use (e.g., 'aria2c', 'native'). download_policy.downloader_args Arguments for the external_downloader. For yt-dlp, e.g., 'aria2c:-x 8'. download_policy.merge_output_format Container to merge to (e.g., 'mkv'). Defaults to 'mp4' via cli.config. download_policy.temp_path For 'native-py', path to a directory for temporary files (e.g., a RAM disk like /dev/shm). download_policy.output_to_buffer For 'native-py', download to an in-memory buffer and pipe to stdout instead of saving to a file (true/false). Best for single-file formats. download_policy.proxy Proxy for direct downloads (e.g., "socks5://127.0.0.1:1080"). download_policy.proxy_rename Regex substitution for the proxy URL (e.g., 's/old/new/'). download_policy.pause_before_download_seconds Pause for N seconds before starting each download attempt. download_policy.continue_downloads Enable download continuation (true/false). download_policy.cleanup After success: for native downloaders, rename and truncate file to 0 bytes; for 'aria2c_rpc', remove file(s) from filesystem. download_policy.extra_args A string of extra arguments for the download script (e.g., "--limit-rate 5M"). download_policy.sleep_per_proxy_seconds Cooldown in seconds between downloads on the same proxy. download_policy.rate_limits.per_proxy.max_requests Max downloads for a single proxy in a time period. download_policy.rate_limits.per_proxy.per_minutes Time period in minutes for the per_proxy download rate limit. # For downloader: 'aria2c_rpc' download_policy.aria_host Hostname of the aria2c RPC server. download_policy.aria_port Port of the aria2c RPC server. download_policy.aria_secret Secret token for the aria2c RPC server. download_policy.aria_wait Wait for aria2c downloads to complete (true/false). download_policy.cleanup Remove downloaded file(s) from the filesystem on success. Requires script access to the download directory. download_policy.purge_on_complete On success, purge ALL completed/failed downloads from aria2c history. Use as a workaround for older aria2c versions where targeted removal fails. download_policy.output_dir Output directory for downloads. download_policy.aria_remote_dir The absolute download path on the remote aria2c host. download_policy.aria_fragments_dir The local path to find fragments for merging (if different from output_dir). download_policy.auto_merge_fragments For fragmented downloads, automatically merge parts after download (true/false). Requires aria_wait=true. download_policy.remove_fragments_after_merge For fragmented downloads, delete fragment files after a successful merge (true/false). Requires auto_merge_fragments=true. [stop_conditions] stop_conditions.on_failure Stop on any download failure (true/false). stop_conditions.on_http_403 Stop on any HTTP 403 error (true/false). stop_conditions.on_error_rate.max_errors Stop test if more than N errors (of any type) occur within the time period. stop_conditions.on_error_rate.per_minutes Time period in minutes for the error rate calculation. 
stop_conditions.on_cumulative_403.max_errors Stop test if more than N HTTP 403 errors occur within the time period. stop_conditions.on_cumulative_403.per_minutes Time period in minutes for the cumulative 403 calculation. stop_conditions.on_quality_degradation.trigger_if_missing_formats A format ID or comma-separated list of IDs. Triggers if any are missing. stop_conditions.on_quality_degradation.max_triggers Stop test if quality degradation is detected N times. stop_conditions.on_quality_degradation.per_minutes Time period in minutes for the quality degradation calculation. -------------------------------------------------------------------------------- """ ) parser.add_argument('--policy', help='Path to the YAML policy file. Required unless --list-policies is used.') parser.add_argument('--policy-name', help='Name of the policy to run from a multi-policy file (if it contains "---" separators).') parser.add_argument('--list-policies', action='store_true', help='List all available policies from the default policies directory and exit.') parser.add_argument('--show-overrides', action='store_true', help='Load the specified policy and print all its defined values as a single-line of --set arguments, then exit.') parser.add_argument('--set', action='append', default=[], help="Override a policy setting using 'key.subkey=value' format.\n(e.g., --set execution_control.workers=5)") # Add a group for aria2c-specific overrides for clarity in --help aria_group = parser.add_argument_group('Aria2c RPC Downloader Overrides', 'Shortcuts for common --set options for the aria2c_rpc downloader.') aria_group.add_argument('--auto-merge-fragments', action=argparse.BooleanOptionalAction, default=None, help='Shortcut to enable/disable download_policy.auto_merge_fragments.') aria_group.add_argument('--remove-fragments-after-merge', action=argparse.BooleanOptionalAction, default=None, help='Shortcut to enable/disable download_policy.remove_fragments_after_merge.') aria_group.add_argument('--fragments-dir', help='Shortcut for --set download_policy.aria_fragments_dir=PATH.') aria_group.add_argument('--remote-dir', help='Shortcut for --set download_policy.aria_remote_dir=PATH.') aria_group.add_argument('--cleanup', action=argparse.BooleanOptionalAction, default=None, help='Shortcut to enable/disable download_policy.cleanup.') parser.add_argument('--verbose', action='store_true', help='Enable verbose output for the orchestrator and underlying scripts.') parser.add_argument('--dry-run', action='store_true', help='Print the effective policy and exit without running the test.') return parser def list_policies(): """Scans the policies directory and prints a list of available policies.""" script_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.abspath(os.path.join(script_dir, '..')) policies_dir = os.path.join(project_root, 'policies') if not os.path.isdir(policies_dir): print(f"Error: Policies directory not found at '{policies_dir}'", file=sys.stderr) return 1 print("Available Policies:") print("=" * 20) policy_files = sorted(Path(policies_dir).glob('*.yaml')) if not policy_files: print("No policy files (.yaml) found.") return 0 for policy_file in policy_files: print(f"\n--- File: {policy_file.relative_to(project_root)} ---") try: with open(policy_file, 'r', encoding='utf-8') as f: content = f.read() # Split into documents. The separator is a line that is exactly '---'. 
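# A multi-policy file is expected to look roughly like this (hypothetical
# layout, shown only to illustrate what the splitting and comment-scanning
# below are parsing; the policy names are the ones used in the --help examples):
#
#     # Policy: Fetch-only test using a single TV profile
#     name: tv_downgraded_single_profile
#     settings:
#       mode: fetch_only
#     ---
#     # Policy: Download-only test grouped by profile
#     name: basic_profile_aware_download
#     ...
#
# The file is split with a plain-text regex rather than yaml.safe_load_all so
# that the '# ...' comment lines directly above each 'name:' key survive; they
# are reused below as the human-readable policy description.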
documents = re.split(r'^\-\-\-$', content, flags=re.MULTILINE) found_any_in_file = False for doc in documents: doc = doc.strip() if not doc: continue lines = doc.split('\n') policy_name = None description_lines = [] # Find name and description for i, line in enumerate(lines): if line.strip().startswith('name:'): policy_name = line.split(':', 1)[1].strip() # Look backwards for comments j = i - 1 current_desc_block = [] while j >= 0 and lines[j].strip().startswith('#'): comment = lines[j].strip().lstrip('#').strip() current_desc_block.insert(0, comment) j -= 1 if current_desc_block: description_lines = current_desc_block break if policy_name: found_any_in_file = True print(f" - Name: {policy_name}") if description_lines: # Heuristic to clean up "Policy: " prefix if description_lines[0].lower().startswith('policy:'): description_lines[0] = description_lines[0][len('policy:'):].strip() print(f" Description: {description_lines[0]}") for desc_line in description_lines[1:]: print(f" {desc_line}") else: print(" Description: (No description found)") relative_path = policy_file.relative_to(project_root) print(f" Usage: --policy {relative_path} --policy-name {policy_name}") if not found_any_in_file: print(" (No named policies found in this file)") except Exception as e: print(f" Error parsing {policy_file.name}: {e}") return 0 def main_stress_policy(args): """Main logic for the 'stress-policy' command.""" if args.list_policies: return list_policies() if not args.policy: print("Error: --policy is required unless using --list-policies.", file=sys.stderr) return 1 # Handle --show-overrides early, as it doesn't run the test. if args.show_overrides: policy = load_policy(args.policy, args.policy_name) if not policy: return 1 # load_policy prints its own error print_policy_overrides(policy) return 0 log_level = logging.DEBUG if args.verbose else logging.INFO log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' if args.verbose else '%(asctime)s - %(message)s' date_format = None if args.verbose else '%H:%M:%S' logging.basicConfig(level=log_level, format=log_format, datefmt=date_format, stream=sys.stdout) policy = load_policy(args.policy, args.policy_name) policy = apply_overrides(policy, args.set) # Apply direct CLI overrides after --set, so they have final precedence. if args.auto_merge_fragments is not None: policy.setdefault('download_policy', {})['auto_merge_fragments'] = args.auto_merge_fragments if args.remove_fragments_after_merge is not None: policy.setdefault('download_policy', {})['remove_fragments_after_merge'] = args.remove_fragments_after_merge if args.fragments_dir is not None: policy.setdefault('download_policy', {})['aria_fragments_dir'] = args.fragments_dir if args.remote_dir is not None: policy.setdefault('download_policy', {})['aria_remote_dir'] = args.remote_dir if args.cleanup is not None: policy.setdefault('download_policy', {})['cleanup'] = args.cleanup policy_name = policy.get('name', args.policy_name or Path(args.policy).stem) state_manager = StateManager(policy_name) # --- Graceful shutdown handler --- def shutdown_handler(signum, frame): if not shutdown_event.is_set(): logger.info(f"\nSignal {signum} received, shutting down gracefully...") shutdown_event.set() # Save state immediately to prevent loss on interrupt. 
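# First signal: persist state, terminate any tracked subprocesses, and let the
# worker threads wind down on their own; a second signal falls through to the
# os._exit(1) branch below for a hard stop. Note that killing the process
# group (os.killpg further down) assumes the subprocesses were started in
# their own session/process group (e.g. with start_new_session=True when they
# were spawned); if they shared this process's group, the signal would also
# hit the orchestrator itself.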
logger.info("Attempting to save state before shutdown...") state_manager.close() # Kill running subprocesses to unblock workers with process_lock: if running_processes: logger.info(f"Terminating {len(running_processes)} running subprocess(es)...") for p in running_processes: try: # Kill the entire process group to ensure child processes (like yt-dlp) are terminated. os.killpg(os.getpgid(p.pid), signal.SIGKILL) except (ProcessLookupError, PermissionError): pass # Process already finished or we lack permissions logger.info("Subprocesses terminated. Waiting for workers to finish. Press Ctrl+C again to force exit.") else: logger.info("Second signal received, forcing exit.") # Use os._exit for a hard exit that doesn't run cleanup handlers, # which can deadlock if locks are held. os._exit(1) signal.signal(signal.SIGINT, shutdown_handler) signal.signal(signal.SIGTERM, shutdown_handler) settings = policy.get('settings', {}) # --- Load sources based on mode --- mode = settings.get('mode', 'full_stack') sources = [] # This will be a list of URLs or Path objects if mode in ['full_stack', 'fetch_only']: urls_file = settings.get('urls_file') if not urls_file: logger.error("Policy mode requires 'settings.urls_file'.") return 1 try: with open(urls_file, 'r', encoding='utf-8') as f: content = f.read() try: data = json.loads(content) if isinstance(data, list) and all(isinstance(item, str) for item in data): sources = data logger.info(f"Loaded {len(sources)} URLs/IDs from JSON array in {urls_file}.") else: logger.error(f"URL file '{urls_file}' is valid JSON but not an array of strings.") return 1 except json.JSONDecodeError: sources = [line.strip() for line in content.splitlines() if line.strip()] logger.info(f"Loaded {len(sources)} URLs/IDs from text file {urls_file}.") except IOError as e: logger.error(f"Failed to read urls_file {urls_file}: {e}") return 1 # Clean up URLs/IDs which might have extra quotes, commas, or brackets from copy-pasting cleaned_sources = [] for source in sources: cleaned_source = source.strip().rstrip(',').strip().strip('\'"[]').strip() if cleaned_source: cleaned_sources.append(cleaned_source) if len(cleaned_sources) != len(sources): logger.info(f"Cleaned URL list, removed {len(sources) - len(cleaned_sources)} empty or invalid entries.") sources = cleaned_sources elif mode == 'download_only': # If not in continuous mode, load sources once at the start. # In continuous mode, `sources` is populated at the start of each cycle. if settings.get('directory_scan_mode') != 'continuous': info_json_dir = settings.get('info_json_dir') if not info_json_dir: logger.error("Policy mode 'download_only' requires 'settings.info_json_dir'.") return 1 try: all_files = sorted(Path(info_json_dir).glob('*.json')) sample_percent = settings.get('info_json_dir_sample_percent') if sample_percent and 0 < sample_percent <= 100: sample_count = int(len(all_files) * (sample_percent / 100.0)) num_to_sample = min(len(all_files), max(1, sample_count)) sources = random.sample(all_files, k=num_to_sample) logger.info(f"Randomly sampled {len(sources)} files ({sample_percent}%) from {info_json_dir}") else: sources = all_files except (IOError, FileNotFoundError) as e: logger.error(f"Failed to read info_json_dir {info_json_dir}: {e}") return 1 # In continuous download mode, sources are loaded inside the loop, so we skip this check. if settings.get('directory_scan_mode') != 'continuous' and not sources: logger.error("No sources (URLs or info.json files) to process. 
Exiting.") return 1 # --- Group sources by profile if in download_only mode with regex --- profile_tasks = None task_items = sources # Default to list of sources profile_extraction_regex = settings.get('profile_extraction_regex') if mode == 'download_only' and profile_extraction_regex: logger.info(f"Grouping info.json files by profile using regex: {profile_extraction_regex}") profile_tasks = collections.defaultdict(list) for source_path in sources: profile_name = get_profile_from_filename(source_path, profile_extraction_regex) if profile_name: profile_tasks[profile_name].append(source_path) else: # Assign to a default profile if no match profile_tasks['unmatched_profile'].append(source_path) num_profiles = len(profile_tasks) logger.info(f"Found {num_profiles} unique profiles. Tasks will be processed sequentially per profile.") # The new "sources" for the purpose of task distribution are the profiles. task_items = list(profile_tasks.items()) # --- Auto-calculate workers if needed --- exec_control = policy.get('execution_control', {}) original_workers_setting = exec_control.get('workers') if original_workers_setting == 'auto': if mode == 'download_only' and profile_tasks is not None: num_profiles = len(profile_tasks) # Use auto_workers_max from policy, with a default of 8. max_workers = exec_control.get('auto_workers_max', 8) num_workers = min(num_profiles, max_workers) exec_control['workers'] = max(1, num_workers) logger.info(f"Calculated 'auto' workers based on {num_profiles} profiles (max: {max_workers}): {exec_control['workers']}") else: target_rate_cfg = exec_control.get('target_rate', {}) target_reqs = target_rate_cfg.get('requests') target_mins = target_rate_cfg.get('per_minutes') if target_reqs and target_mins and sources: target_rpm = target_reqs / target_mins num_sources = len(sources) sleep_cfg = exec_control.get('sleep_between_tasks', {}) avg_sleep = (sleep_cfg.get('min_seconds', 0) + sleep_cfg.get('max_seconds', 0)) / 2 assumed_task_duration = 12 # Must match assumption in display_effective_policy # Formula: workers = (total_work_seconds) / (total_time_for_work) # total_time_for_work is derived from the target rate: # (total_cycle_time) = (60 * num_sources) / target_rpm # total_time_for_work = total_cycle_time - avg_sleep work_time_available = (60 * num_sources / target_rpm) - avg_sleep if work_time_available <= 0: # The sleep time alone makes the target rate impossible. # Set workers to max parallelism as a best-effort. num_workers = num_sources logger.warning(f"Target rate of {target_rpm} req/min is likely unachievable due to sleep time of {avg_sleep}s.") logger.warning(f"Setting workers to max parallelism ({num_workers}) as a best effort.") else: total_work_seconds = num_sources * assumed_task_duration num_workers = total_work_seconds / work_time_available calculated_workers = max(1, int(num_workers + 0.99)) # Ceiling exec_control['workers'] = calculated_workers logger.info(f"Calculated 'auto' workers based on target rate: {calculated_workers}") else: logger.warning("Cannot calculate 'auto' workers: 'target_rate' or sources are not defined. Defaulting to 1 worker.") exec_control['workers'] = 1 display_effective_policy( policy, policy_name, sources=sources, profile_names=list(profile_tasks.keys()) if profile_tasks is not None else None, original_workers_setting=original_workers_setting ) if args.dry_run: logger.info("Dry run complete. 
Exiting.") return 0 start_time = time.time() run_until_cfg = exec_control.get('run_until', {}) duration_seconds = (run_until_cfg.get('minutes') or 0) * 60 max_cycles = run_until_cfg.get('cycles') or 0 max_requests = run_until_cfg.get('requests') or 0 # --- Main test loop --- cycles = 0 try: def process_task(source, source_index, cycle_num): """Worker task for one source (URL or file path).""" try: if shutdown_event.is_set(): return [] # Shutdown initiated, do not start new work # --- Step 1: Get info.json content --- info_json_content = None if mode in ['full_stack', 'fetch_only']: gen_policy = policy.get('info_json_generation_policy', {}) cmd_template = gen_policy.get('command_template') # --- Profile Generation --- profile_name = None profile_mode = settings.get('profile_mode') pm_policy = settings.get('profile_management') if profile_mode == 'per_worker_with_rotation': if not pm_policy: logger.error("Profile mode 'per_worker_with_rotation' requires 'settings.profile_management' configuration.") # Log a failure event and skip event = {'type': 'fetch', 'path': str(source), 'success': False, 'error_type': 'ConfigError', 'details': 'Missing profile_management section'} state_manager.log_event(event) return [] worker_id = get_worker_id() profile_name = state_manager.get_or_rotate_worker_profile(worker_id, policy) elif pm_policy: # This is the existing dynamic cooldown logic profile_name = state_manager.get_next_available_profile(policy) if not profile_name: logger.warning("No available profiles to run task. Skipping.") return [] else: # This is the legacy logic profile_prefix = settings.get('profile_prefix') if profile_prefix: if profile_mode == 'per_request': timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f') profile_name = f"{profile_prefix}_{timestamp}_{source_index}" elif profile_mode == 'per_worker': worker_index = get_worker_id() profile_name = f"{profile_prefix}_{worker_index}" else: # Default to pool logic profile_pool = settings.get('profile_pool') if profile_pool: profile_name = f"{profile_prefix}_{source_index % profile_pool}" else: profile_name = "default" # A final fallback # --- Rate Limit Check --- if not state_manager.check_and_update_rate_limit(profile_name, policy): return [] # Rate limited, skip this task # --- Command Generation --- gen_cmd = [] save_dir = settings.get('save_info_json_dir') save_path = None if cmd_template: # Low-level template mode. The user is responsible for output. video_id = get_video_id(source) # A heuristic to add '--' if the video ID looks like an option. # We split the template, find the standalone '{url}' placeholder, # and insert '--' before it. This assumes it's a positional argument. template_parts = shlex.split(cmd_template) try: # Find from the end, in case it's used in an option value earlier. url_index = len(template_parts) - 1 - template_parts[::-1].index('{url}') if video_id.startswith('-'): template_parts.insert(url_index, '--') except ValueError: # '{url}' not found as a standalone token, do nothing special. pass # Rejoin and then format the whole string. gen_cmd_str = ' '.join(template_parts) gen_cmd_str = gen_cmd_str.format(url=video_id, profile=profile_name) gen_cmd = shlex.split(gen_cmd_str) if args.verbose and '--verbose' not in gen_cmd: gen_cmd.append('--verbose') else: # High-level policy mode. Orchestrator builds the command. 
script_cmd_str = settings.get('info_json_script') if not script_cmd_str: logger.error("High-level policy requires 'settings.info_json_script'.") return [] gen_cmd = shlex.split(script_cmd_str) video_id = get_video_id(source) client_to_use, request_params = state_manager.get_client_for_request(profile_name, gen_policy) # --- Multi-Cookie File Logic --- if pm_policy: cookie_files = pm_policy.get('cookie_files') if cookie_files and isinstance(cookie_files, list) and len(cookie_files) > 0: profile_index = -1 # Extract index from profile name. Matches _ or __ match = re.search(r'_(\d+)(?:_(\d+))?$', profile_name) if match: # For rotation mode, the first group is worker_id. For pool mode, it's the profile index. profile_index = int(match.group(1)) if profile_index != -1: cookie_file_path = cookie_files[profile_index % len(cookie_files)] if not request_params: request_params = {} request_params['cookies_file_path'] = cookie_file_path logger.info(f"[{source}] Assigned cookie file '{os.path.basename(cookie_file_path)}' to profile '{profile_name}'") else: logger.warning(f"[{source}] Could not determine index for profile '{profile_name}' to assign cookie file.") if client_to_use: gen_cmd.extend(['--client', str(client_to_use)]) if gen_policy.get('auth_host'): gen_cmd.extend(['--auth-host', str(gen_policy.get('auth_host'))]) if gen_policy.get('auth_port'): gen_cmd.extend(['--auth-port', str(gen_policy.get('auth_port'))]) if profile_name != "default": gen_cmd.extend(['--profile', profile_name]) # Add --print-proxy so we can track it for stats if '--print-proxy' not in gen_cmd: gen_cmd.append('--print-proxy') if request_params: gen_cmd.extend(['--request-params-json', json.dumps(request_params)]) if gen_policy.get('assigned_proxy_url'): gen_cmd.extend(['--assigned-proxy-url', str(gen_policy.get('assigned_proxy_url'))]) if gen_policy.get('proxy_rename'): gen_cmd.extend(['--proxy-rename', str(gen_policy.get('proxy_rename'))]) if args.verbose: gen_cmd.append('--verbose') # If saving is enabled, delegate saving to the client script. if save_dir: try: os.makedirs(save_dir, exist_ok=True) timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') # Note: Using a timestamped filename to avoid race conditions. filename = f"{timestamp}-{video_id}-{profile_name}.json" save_path = Path(save_dir) / filename gen_cmd.extend(['--output', str(save_path)]) # No longer need to suppress, it's the default. except IOError as e: logger.error(f"[{source}] Could not prepare save path in '{save_dir}': {e}") # Continue without saving save_path = None # If not saving to a file, we need the output on stdout for the download step. if not save_dir: gen_cmd.append('--print-info-out') # The positional video_id argument must come after all options. # Use '--' to ensure it's not parsed as an option if it starts with a dash. if video_id.startswith('-'): gen_cmd.append('--') gen_cmd.append(video_id) worker_id = get_worker_id() profile_log_part = f" [Profile: {profile_name}]" if profile_name else "" logger.info(f"[Worker {worker_id}]{profile_log_part} [{source}] Running info.json command: {' '.join(shlex.quote(s) for s in gen_cmd)}") retcode, stdout, stderr = run_command(gen_cmd) info_json_content = stdout # --- Extract proxy from stderr and record it for stats --- proxy_url = None proxy_match = re.search(r"Proxy used: (.*)", stderr) if proxy_match: proxy_url = proxy_match.group(1).strip() state_manager.record_proxy_usage(proxy_url) if retcode == 0: # If the client script saved the file, stdout will be empty. 
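# (When a save_dir is configured, the --output flag added above makes the
# client script write the file itself and report the saved path on stderr, so
# an empty stdout here is the expected success case rather than an error.)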
# If we need the content for a download step, we must read it back. if not info_json_content.strip(): # Check stderr for the success message to confirm save. saved_path_match = re.search(r"Successfully saved info.json to (.*)", stderr) if saved_path_match: output_file_str = saved_path_match.group(1).strip().strip("'\"") logger.info(f"[{source}] -> {saved_path_match.group(0).strip()}") # If this is a full_stack test, we need the content for the download worker. if mode == 'full_stack': try: with open(output_file_str, 'r', encoding='utf-8') as f: info_json_content = f.read() except IOError as e: logger.error(f"Could not read back info.json from '{output_file_str}': {e}") retcode = -1 # Treat as failure elif save_path: # Command was told to save, but didn't confirm. Assume it worked if exit code is 0. logger.info(f"[{source}] -> Client script exited 0, assuming info.json was saved to '{save_path}'") if mode == 'full_stack': try: with open(save_path, 'r', encoding='utf-8') as f: info_json_content = f.read() except IOError as e: logger.error(f"Could not read back info.json from '{save_path}': {e}") retcode = -1 # If stdout is empty and we weren't saving, it's an issue. elif not save_path and not cmd_template: logger.error(f"[{source}] info.json generation gave no stdout and was not asked to save to a file.") retcode = -1 else: logger.info(f"[{source}] -> Successfully fetched info.json to memory/stdout.") event = {'type': 'fetch', 'path': str(source), 'profile': profile_name} if proxy_url: event['proxy_url'] = proxy_url if retcode != 0: error_lines = [line for line in stderr.strip().split('\n') if 'error' in line.lower()] error_msg = error_lines[-1] if error_lines else stderr.strip().split('\n')[-1] logger.error(f"[{source}] Failed to generate info.json: {error_msg}") event.update({'success': False, 'error_type': 'GetInfoJsonFail', 'details': error_msg}) state_manager.log_event(event) return [] # Check for quality degradation before logging success s_conditions = policy.get('stop_conditions', {}) quality_policy = s_conditions.get('on_quality_degradation') if quality_policy and info_json_content: try: info_data = json.loads(info_json_content) available_formats = {f.get('format_id') for f in info_data.get('formats', [])} required_formats = quality_policy.get('trigger_if_missing_formats') if required_formats: # Can be a single string, a comma-separated string, or a list of strings. if isinstance(required_formats, str): required_formats = [f.strip() for f in required_formats.split(',')] missing_formats = [f for f in required_formats if f not in available_formats] if missing_formats: logger.warning(f"[{source}] Quality degradation detected. Missing required formats: {', '.join(missing_formats)}.") event['quality_degradation_trigger'] = True event['missing_formats'] = missing_formats except (json.JSONDecodeError, TypeError): logger.warning(f"[{source}] Could not parse info.json or find formats to check for quality degradation.") # Record request for profile cooldown policy if active if pm_policy: state_manager.record_profile_request(profile_name) state_manager.increment_request_count() event.update({'success': True, 'details': 'OK'}) state_manager.log_event(event) # Saving is now delegated to the client script when a save_dir is provided. # The orchestrator no longer saves the file itself. elif mode == 'download_only': # This path is for non-profile-grouped download_only mode. 
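# profile_name is only assigned in the fetch branch above, so give this
# download-only path an explicit default; the _run_download_logic call below
# passes it through, and plain (non-grouped) download_only runs are not
# profile-aware.
profile_name = None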
try: with open(source, 'r', encoding='utf-8') as f: info_json_content = f.read() except (IOError, FileNotFoundError) as e: logger.error(f"[{get_display_name(source)}] Could not read info.json file: {e}") return [] if mode != 'fetch_only': return _run_download_logic(source, info_json_content, policy, state_manager, profile_name=profile_name) return [] finally: # Sleep after the task is completed to space out requests from this worker. exec_control = policy.get('execution_control', {}) sleep_cfg = exec_control.get('sleep_between_tasks', {}) sleep_min = sleep_cfg.get('min_seconds', 0) if sleep_min > 0: sleep_max = sleep_cfg.get('max_seconds') or sleep_min if sleep_max > sleep_min: sleep_duration = random.uniform(sleep_min, sleep_max) else: sleep_duration = sleep_min logger.debug(f"Worker sleeping for {sleep_duration:.2f}s after task for {get_display_name(source)}.") # Interruptible sleep sleep_end_time = time.time() + sleep_duration while time.time() < sleep_end_time: if shutdown_event.is_set(): break time.sleep(0.2) while not shutdown_event.is_set(): if duration_seconds and (time.time() - start_time) > duration_seconds: logger.info("Reached duration limit. Stopping.") break if max_requests > 0 and state_manager.get_request_count() >= max_requests: logger.info(f"Reached max requests ({max_requests}). Stopping.") break # --- Rescan for sources if in continuous download mode --- if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous': info_json_dir = settings.get('info_json_dir') try: all_files_in_dir = Path(info_json_dir).glob('*.json') processed_files = state_manager.get_processed_files() new_files = [f for f in all_files_in_dir if str(f) not in processed_files] # Sort by modification time, oldest first, to process in order of creation new_files.sort(key=os.path.getmtime) max_files_per_cycle = settings.get('max_files_per_cycle') if max_files_per_cycle and len(new_files) > max_files_per_cycle: sources = new_files[:max_files_per_cycle] else: sources = new_files if not sources: sleep_duration = settings.get('sleep_if_no_new_files_seconds', 10) logger.info(f"No new info.json files found in '{info_json_dir}'. Sleeping for {sleep_duration}s...") # Interruptible sleep sleep_end_time = time.time() + sleep_duration while time.time() < sleep_end_time: if shutdown_event.is_set(): break time.sleep(0.5) if shutdown_event.is_set(): break continue # Skip to next iteration of the while loop except (IOError, FileNotFoundError) as e: logger.error(f"Failed to read info_json_dir {info_json_dir}: {e}. Retrying in 10s.") time.sleep(10) continue cycles += 1 if max_cycles > 0 and cycles > max_cycles: logger.info(f"Reached max cycles ({max_cycles}). 
Stopping.") break logger.info(f"--- Cycle #{cycles} (Total Requests: {state_manager.get_request_count()}) ---") with concurrent.futures.ThreadPoolExecutor(max_workers=exec_control.get('workers', 1)) as executor: if mode == 'download_only' and profile_tasks is not None: # New: submit profile tasks future_to_source = { executor.submit(process_profile_task, profile_name, file_list, policy, state_manager, cycles): profile_name for profile_name, file_list in task_items } else: # Old: submit individual file/url tasks future_to_source = { executor.submit(process_task, source, i, cycles): source for i, source in enumerate(task_items) } should_stop = False pending_futures = set(future_to_source.keys()) while pending_futures and not should_stop: done, pending_futures = concurrent.futures.wait( pending_futures, return_when=concurrent.futures.FIRST_COMPLETED ) for future in done: if shutdown_event.is_set(): should_stop = True break source = future_to_source[future] try: results = future.result() # Mark file as processed in continuous download mode if mode == 'download_only' and settings.get('directory_scan_mode') == 'continuous': state_manager.mark_file_as_processed(source) if settings.get('mark_processed_files'): try: timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') new_path = source.parent / f"{source.name}.{timestamp}.processed" source.rename(new_path) logger.info(f"Marked '{source.name}' as processed by renaming to '{new_path.name}'") except (IOError, OSError) as e: logger.error(f"Failed to rename processed file '{source.name}': {e}") for result in results: if not result['success']: s_conditions = policy.get('stop_conditions', {}) is_cumulative_403_active = s_conditions.get('on_cumulative_403', {}).get('max_errors') if s_conditions.get('on_failure') or \ (s_conditions.get('on_http_403') and not is_cumulative_403_active and result['error_type'] == 'HTTP 403') or \ (s_conditions.get('on_timeout') and result['error_type'] == 'Timeout'): logger.info(f"!!! STOP CONDITION MET: Immediate stop on failure '{result['error_type']}' for {get_display_name(source)}. Shutting down all workers. !!!") should_stop = True break except concurrent.futures.CancelledError: logger.info(f"Task for {get_display_name(source)} was cancelled during shutdown.") event = { 'type': 'fetch' if mode != 'download_only' else 'download', 'path': str(source), 'success': False, 'error_type': 'Cancelled', 'details': 'Task cancelled during shutdown.' } state_manager.log_event(event) except Exception as exc: logger.error(f'{get_display_name(source)} generated an exception: {exc}') if should_stop: break # Check for cumulative error rate stop conditions s_conditions = policy.get('stop_conditions', {}) error_rate_policy = s_conditions.get('on_error_rate') if error_rate_policy and not should_stop: max_errors = error_rate_policy.get('max_errors') per_minutes = error_rate_policy.get('per_minutes') if max_errors and per_minutes: error_count = state_manager.check_cumulative_error_rate(max_errors, per_minutes) if error_count > 0: logger.info(f"!!! STOP CONDITION MET: Error rate exceeded: {error_count} errors in the last {per_minutes} minute(s). Shutting down. 
!!!") should_stop = True cumulative_403_policy = s_conditions.get('on_cumulative_403') if cumulative_403_policy and not should_stop: max_errors = cumulative_403_policy.get('max_errors') per_minutes = cumulative_403_policy.get('per_minutes') if max_errors and per_minutes: error_count = state_manager.check_cumulative_error_rate(max_errors, per_minutes, error_type='HTTP 403') if error_count > 0: logger.info(f"!!! STOP CONDITION MET: Cumulative 403 error rate exceeded: {error_count} errors in the last {per_minutes} minute(s). Shutting down. !!!") should_stop = True quality_degradation_policy = s_conditions.get('on_quality_degradation') if quality_degradation_policy and not should_stop: max_triggers = quality_degradation_policy.get('max_triggers') per_minutes = quality_degradation_policy.get('per_minutes') if max_triggers and per_minutes: trigger_count = state_manager.check_quality_degradation_rate(max_triggers, per_minutes) if trigger_count > 0: logger.info(f"!!! STOP CONDITION MET: Quality degradation triggered {trigger_count} times in the last {per_minutes} minute(s). Shutting down. !!!") should_stop = True if should_stop: break # Check for duration limit after each task completes if duration_seconds and (time.time() - start_time) > duration_seconds: logger.info("Reached duration limit. Cancelling remaining tasks.") should_stop = True if should_stop and pending_futures: logger.info(f"Cancelling {len(pending_futures)} outstanding task(s).") for future in pending_futures: future.cancel() if should_stop: break if max_cycles > 0 and cycles >= max_cycles: break logger.info("Cycle complete.") except KeyboardInterrupt: logger.info("\nForceful shutdown requested...") finally: state_manager.print_summary(policy) state_manager.close() return 0