import collections import collections.abc import json import logging import re import threading import time from datetime import datetime from pathlib import Path from . import utils as sp_utils logger = logging.getLogger(__name__) class StateManager: """Tracks statistics, manages rate limits, and persists state across runs.""" def __init__(self, policy_name, disable_log_writing=False, shutdown_event=None): self.disable_log_writing = disable_log_writing self.state_file_path = Path(f"{policy_name}_state.json") self.stats_file_path = Path(f"{policy_name}_stats.jsonl") self.lock = threading.RLock() self.start_time = time.time() self.shutdown_event = shutdown_event or threading.Event() self.events = [] self.state = { 'global_request_count': 0, 'rate_limit_trackers': {}, # e.g., {'per_ip': [ts1, ts2], 'profile_foo': [ts3, ts4]} 'profile_request_counts': {}, # for client rotation 'profile_last_refresh_time': {}, # for client rotation 'proxy_last_finish_time': {}, # for per-proxy sleep 'processed_files': [], # For continuous download_only mode # For dynamic profile cooldown strategy 'profile_cooldown_counts': {}, 'profile_cooldown_sleep_until': {}, 'profile_pool_size': 0, 'profile_run_suffix': None, 'worker_profile_generations': {}, 'last_url_index': 0, # For batch modes 'total_batches_processed': 0, 'successful_batches': 0, 'failed_batches': 0, 'total_videos_processed': 0, } self.stats_file_handle = None self._load_state() self.print_historical_summary() self._open_stats_log() def _load_state(self): if self.disable_log_writing: logger.info("Log writing is disabled. State will not be loaded from disk.") return if not self.state_file_path.exists(): logger.info(f"State file not found at '{self.state_file_path}', starting fresh.") return try: with open(self.state_file_path, 'r', encoding='utf-8') as f: self.state = json.load(f) # Ensure keys exist self.state.setdefault('global_request_count', 0) self.state.setdefault('rate_limit_trackers', {}) self.state.setdefault('profile_request_counts', {}) self.state.setdefault('profile_last_refresh_time', {}) self.state.setdefault('proxy_last_finish_time', {}) self.state.setdefault('processed_files', []) # For dynamic profile cooldown strategy self.state.setdefault('profile_cooldown_counts', {}) self.state.setdefault('profile_cooldown_sleep_until', {}) self.state.setdefault('profile_pool_size', 0) self.state.setdefault('profile_run_suffix', None) self.state.setdefault('worker_profile_generations', {}) self.state.setdefault('last_url_index', 0) # For batch modes self.state.setdefault('total_batches_processed', 0) self.state.setdefault('successful_batches', 0) self.state.setdefault('failed_batches', 0) self.state.setdefault('total_videos_processed', 0) logger.info(f"Loaded state from {self.state_file_path}") except (IOError, json.JSONDecodeError) as e: logger.error(f"Could not load or parse state file {self.state_file_path}: {e}. Starting fresh.") def _save_state(self): if self.disable_log_writing: return with self.lock: try: with open(self.state_file_path, 'w', encoding='utf-8') as f: json.dump(self.state, f, indent=2) logger.info(f"Saved state to {self.state_file_path}") except IOError as e: logger.error(f"Could not save state to {self.state_file_path}: {e}") def _open_stats_log(self): if self.disable_log_writing: return try: self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8') except IOError as e: logger.error(f"Could not open stats file {self.stats_file_path}: {e}") def close(self): """Saves state and closes file handles.""" self._save_state() if self.stats_file_handle: self.stats_file_handle.close() self.stats_file_handle = None def mark_file_as_processed(self, file_path): """Adds a file path to the list of processed files in the state.""" with self.lock: # Using a list and checking for existence is fine for moderate numbers of files. # A set isn't JSON serializable. processed = self.state.setdefault('processed_files', []) file_str = str(file_path) if file_str not in processed: processed.append(file_str) def get_last_url_index(self): """Gets the last URL index to start from.""" with self.lock: return self.state.get('last_url_index', 0) def get_next_url_batch(self, count, urls_list): """Gets the next batch of URLs to process, updating the state.""" with self.lock: start_index = self.state.get('last_url_index', 0) if start_index >= len(urls_list): return [], start_index # No more URLs end_index = start_index + count batch = urls_list[start_index:end_index] # Update state with the index of the *next* URL to be processed. self.state['last_url_index'] = end_index return batch, start_index def update_last_url_index(self, index, force=False): """Updates the last processed URL index in the state. Args: index: The index of the *next* URL to process. force: If True, sets the index regardless of the current value. """ with self.lock: if force or index > self.state.get('last_url_index', 0): self.state['last_url_index'] = index def get_processed_files(self): """Returns a set of file paths that have been processed.""" with self.lock: return set(self.state.get('processed_files', [])) def record_batch_result(self, success, video_count, profile_name=None): with self.lock: self.state['total_batches_processed'] = self.state.get('total_batches_processed', 0) + 1 self.state['total_videos_processed'] = self.state.get('total_videos_processed', 0) + video_count if success: self.state['successful_batches'] = self.state.get('successful_batches', 0) + 1 else: self.state['failed_batches'] = self.state.get('failed_batches', 0) + 1 # Print live counter total = self.state['total_batches_processed'] ok = self.state['successful_batches'] fail = self.state['failed_batches'] profile_log = f" [{profile_name}]" if profile_name else "" logger.info(f"Batch #{total} complete.{profile_log} (Total OK: {ok}, Total Fail: {fail})") def print_historical_summary(self): """Prints a summary based on the state loaded from disk, before new events.""" with self.lock: now = time.time() rate_trackers = self.state.get('rate_limit_trackers', {}) total_requests = self.state.get('global_request_count', 0) if not rate_trackers and not total_requests: logger.info("No historical data found in state file.") return logger.info("\n--- Summary From Previous Runs ---") logger.info(f"Total info.json requests (all previous runs): {total_requests}") if rate_trackers: for key, timestamps in sorted(rate_trackers.items()): # Time windows in seconds windows = { 'last 10 min': 600, 'last 60 min': 3600, 'last 6 hours': 21600, 'last 24 hours': 86400 } rates_str_parts = [] for name, seconds in windows.items(): count = sum(1 for ts in timestamps if now - ts <= seconds) # Calculate rate in requests per minute rate_rpm = (count / seconds) * 60 if seconds > 0 else 0 rates_str_parts.append(f"{count} req in {name} ({rate_rpm:.2f} rpm)") logger.info(f"Tracker '{key}': " + ", ".join(rates_str_parts)) logger.info("------------------------------------") def log_event(self, event_data): with self.lock: event_data['timestamp'] = datetime.now().isoformat() self.events.append(event_data) if self.stats_file_handle: self.stats_file_handle.write(json.dumps(event_data) + '\n') self.stats_file_handle.flush() def get_request_count(self): with self.lock: return self.state.get('global_request_count', 0) def increment_request_count(self): with self.lock: self.state['global_request_count'] = self.state.get('global_request_count', 0) + 1 def check_cumulative_error_rate(self, max_errors, per_minutes, error_type=None): """ Checks if a cumulative error rate has been exceeded. If error_type is None, checks for any failure. Returns the number of errors found if the threshold is met, otherwise 0. """ with self.lock: now = time.time() window_seconds = per_minutes * 60 if error_type: recent_errors = [ e for e in self.events if e.get('error_type') == error_type and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds ] else: # Generic failure check recent_errors = [ e for e in self.events # Only count failures that are not explicitly tolerated if not e.get('success') and not e.get('is_tolerated_error') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds ] if len(recent_errors) >= max_errors: return len(recent_errors) return 0 def check_quality_degradation_rate(self, max_triggers, per_minutes): """ Checks if the quality degradation trigger rate has been exceeded. Returns the number of triggers found if the threshold is met, otherwise 0. """ with self.lock: now = time.time() window_seconds = per_minutes * 60 recent_triggers = [ e for e in self.events if e.get('quality_degradation_trigger') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds ] if len(recent_triggers) >= max_triggers: return len(recent_triggers) return 0 def check_and_update_rate_limit(self, profile_name, policy): """ Checks if a request is allowed based on policy rate limits. If allowed, updates the internal state. Returns True if allowed, False otherwise. """ with self.lock: now = time.time() gen_policy = policy.get('info_json_generation_policy', {}) rate_limits = gen_policy.get('rate_limits', {}) # Check per-IP limit ip_limit = rate_limits.get('per_ip') if ip_limit: tracker_key = 'per_ip' max_req = ip_limit.get('max_requests') period_min = ip_limit.get('per_minutes') if max_req and period_min: timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) # Filter out old timestamps timestamps = [ts for ts in timestamps if now - ts < period_min * 60] if len(timestamps) >= max_req: logger.warning("Per-IP rate limit reached. Skipping task.") return False self.state['rate_limit_trackers'][tracker_key] = timestamps # Check per-profile limit profile_limit = rate_limits.get('per_profile') if profile_limit and profile_name: tracker_key = f"profile_{profile_name}" max_req = profile_limit.get('max_requests') period_min = profile_limit.get('per_minutes') if max_req and period_min: timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) timestamps = [ts for ts in timestamps if now - ts < period_min * 60] if len(timestamps) >= max_req: logger.warning(f"Per-profile rate limit for '{profile_name}' reached. Skipping task.") return False self.state['rate_limit_trackers'][tracker_key] = timestamps # If all checks pass, record the new request timestamp for all relevant trackers if ip_limit and ip_limit.get('max_requests'): self.state['rate_limit_trackers'].setdefault('per_ip', []).append(now) if profile_limit and profile_limit.get('max_requests') and profile_name: self.state['rate_limit_trackers'].setdefault(f"profile_{profile_name}", []).append(now) return True def get_client_for_request(self, profile_name, gen_policy): """ Determines which client to use based on the client_rotation_policy. Returns a tuple: (client_name, request_params_dict). """ with self.lock: rotation_policy = gen_policy.get('client_rotation_policy') # If no rotation policy, use the simple 'client' key. if not rotation_policy: client = gen_policy.get('client') logger.info(f"Using client '{client}' for profile '{profile_name}'.") req_params = gen_policy.get('request_params') return client, req_params # --- Rotation logic --- now = time.time() major_client = rotation_policy.get('major_client') refresh_client = rotation_policy.get('refresh_client') refresh_every = rotation_policy.get('refresh_every', {}) if not refresh_client or not refresh_every: return major_client, rotation_policy.get('major_client_params') should_refresh = False # Check time-based refresh refresh_minutes = refresh_every.get('minutes') last_refresh_time = self.state['profile_last_refresh_time'].get(profile_name, 0) if refresh_minutes and (now - last_refresh_time) > (refresh_minutes * 60): should_refresh = True # Check request-count-based refresh refresh_requests = refresh_every.get('requests') request_count = self.state['profile_request_counts'].get(profile_name, 0) if refresh_requests and request_count >= refresh_requests: should_refresh = True if should_refresh: logger.info(f"Profile '{profile_name}' is due for a refresh. Using refresh client '{refresh_client}'.") self.state['profile_last_refresh_time'][profile_name] = now self.state['profile_request_counts'][profile_name] = 0 # Reset counter return refresh_client, rotation_policy.get('refresh_client_params') else: # Not refreshing, so increment request count for this profile self.state['profile_request_counts'][profile_name] = request_count + 1 return major_client, rotation_policy.get('major_client_params') def get_next_available_profile(self, policy): """ Finds or creates an available profile based on the dynamic cooldown policy. Returns a profile name, or None if no profile is available. """ with self.lock: now = time.time() settings = policy.get('settings', {}) pm_policy = settings.get('profile_management') if not pm_policy: return None prefix = pm_policy.get('prefix') if not prefix: logger.error("Profile management policy requires 'prefix'.") return None # Determine and persist the suffix for this run to ensure profile names are stable run_suffix = self.state.get('profile_run_suffix') if not run_suffix: suffix_config = pm_policy.get('suffix') if suffix_config == 'auto': run_suffix = datetime.now().strftime('%Y%m%d%H%M') else: run_suffix = suffix_config or '' self.state['profile_run_suffix'] = run_suffix # Initialize pool size from policy if not already in state if self.state.get('profile_pool_size', 0) == 0: self.state['profile_pool_size'] = pm_policy.get('initial_pool_size', 1) max_reqs = pm_policy.get('max_requests_per_profile') sleep_mins = pm_policy.get('sleep_minutes_on_exhaustion') # Loop until a profile is found or we decide we can't find one while True: # Try to find an existing, available profile for i in range(self.state['profile_pool_size']): profile_name = f"{prefix}_{run_suffix}_{i}" if run_suffix else f"{prefix}_{i}" # Check if sleeping sleep_until = self.state['profile_cooldown_sleep_until'].get(profile_name, 0) if now < sleep_until: continue # Still sleeping # Check if it needs to be put to sleep req_count = self.state['profile_cooldown_counts'].get(profile_name, 0) if max_reqs and req_count >= max_reqs: sleep_duration_seconds = (sleep_mins or 0) * 60 self.state['profile_cooldown_sleep_until'][profile_name] = now + sleep_duration_seconds self.state['profile_cooldown_counts'][profile_name] = 0 # Reset count for next time logger.info(f"Profile '{profile_name}' reached request limit ({req_count}/{max_reqs}). Putting to sleep for {sleep_mins} minutes.") continue # Now sleeping, try next profile # This profile is available logger.info(f"Selected available profile '{profile_name}' (request count: {req_count}/{max_reqs if max_reqs else 'unlimited'}).") return profile_name # If we get here, no existing profile was available if pm_policy.get('auto_expand_pool'): new_profile_index = self.state['profile_pool_size'] self.state['profile_pool_size'] += 1 profile_name = f"{prefix}_{run_suffix}_{new_profile_index}" if run_suffix else f"{prefix}_{new_profile_index}" logger.info(f"Profile pool exhausted. Expanding pool to size {self.state['profile_pool_size']}. New profile: '{profile_name}'") return profile_name else: # No available profiles and pool expansion is disabled return None def get_or_rotate_worker_profile(self, worker_id, policy): """ Gets the current profile for a worker, rotating to a new generation if the lifetime limit is met. This is used by the 'per_worker_with_rotation' profile mode. """ with self.lock: pm_policy = policy.get('settings', {}).get('profile_management', {}) if not pm_policy: logger.error("Profile mode 'per_worker_with_rotation' requires 'settings.profile_management' configuration in the policy.") return f"error_profile_{worker_id}" prefix = pm_policy.get('prefix') if not prefix: logger.error("Profile management for 'per_worker_with_rotation' requires a 'prefix'.") return f"error_profile_{worker_id}" max_reqs = pm_policy.get('max_requests_per_profile') generations = self.state.setdefault('worker_profile_generations', {}) # worker_id is an int, but JSON keys must be strings worker_id_str = str(worker_id) current_gen = generations.get(worker_id_str, 0) profile_name = f"{prefix}_{worker_id}_{current_gen}" if not max_reqs: # No lifetime limit defined, so never rotate. return profile_name req_count = self.state.get('profile_cooldown_counts', {}).get(profile_name, 0) if req_count >= max_reqs: logger.info(f"Profile '{profile_name}' reached lifetime request limit ({req_count}/{max_reqs}). Rotating to new generation for worker {worker_id}.") new_gen = current_gen + 1 generations[worker_id_str] = new_gen # The request counts for the old profile are implicitly left behind. # The new profile will start with a count of 0. profile_name = f"{prefix}_{worker_id}_{new_gen}" return profile_name def record_profile_request(self, profile_name): """Increments the request counter for a profile for the cooldown policy.""" with self.lock: if not profile_name: return counts = self.state.setdefault('profile_cooldown_counts', {}) counts[profile_name] = counts.get(profile_name, 0) + 1 def record_proxy_usage(self, proxy_url): """Records a request timestamp for a given proxy URL for statistical purposes.""" if not proxy_url: return with self.lock: now = time.time() # Use a prefix to avoid collisions with profile names or other keys tracker_key = f"proxy_{proxy_url}" self.state['rate_limit_trackers'].setdefault(tracker_key, []).append(now) def check_and_update_download_rate_limit(self, proxy_url, policy): """Checks download rate limits. Returns True if allowed, False otherwise.""" with self.lock: now = time.time() d_policy = policy.get('download_policy', {}) rate_limits = d_policy.get('rate_limits', {}) # Check per-IP limit ip_limit = rate_limits.get('per_ip') if ip_limit: tracker_key = 'download_per_ip' # Use a distinct key max_req = ip_limit.get('max_requests') period_min = ip_limit.get('per_minutes') if max_req and period_min: timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) timestamps = [ts for ts in timestamps if now - ts < period_min * 60] if len(timestamps) >= max_req: logger.warning("Per-IP download rate limit reached. Skipping task.") return False self.state['rate_limit_trackers'][tracker_key] = timestamps # Check per-proxy limit proxy_limit = rate_limits.get('per_proxy') if proxy_limit and proxy_url: tracker_key = f"download_proxy_{proxy_url}" max_req = proxy_limit.get('max_requests') period_min = proxy_limit.get('per_minutes') if max_req and period_min: timestamps = self.state['rate_limit_trackers'].get(tracker_key, []) timestamps = [ts for ts in timestamps if now - ts < period_min * 60] if len(timestamps) >= max_req: logger.warning(f"Per-proxy download rate limit for '{proxy_url}' reached. Skipping task.") return False self.state['rate_limit_trackers'][tracker_key] = timestamps # If all checks pass, record the new request timestamp for all relevant trackers if ip_limit and ip_limit.get('max_requests'): self.state['rate_limit_trackers'].setdefault('download_per_ip', []).append(now) if proxy_limit and proxy_limit.get('max_requests') and proxy_url: self.state['rate_limit_trackers'].setdefault(f"download_proxy_{proxy_url}", []).append(now) return True def wait_for_proxy_cooldown(self, proxy_url, policy): """If a per-proxy sleep is defined, wait until the cooldown period has passed.""" with self.lock: d_policy = policy.get('download_policy', {}) sleep_duration = d_policy.get('sleep_per_proxy_seconds', 0) if not proxy_url or not sleep_duration > 0: return last_finish = self.state.setdefault('proxy_last_finish_time', {}).get(proxy_url, 0) elapsed = time.time() - last_finish if elapsed < sleep_duration: time_to_sleep = sleep_duration - elapsed logger.info(f"Proxy '{proxy_url}' was used recently. Sleeping for {time_to_sleep:.2f}s.") # Interruptible sleep sleep_end_time = time.time() + time_to_sleep while time.time() < sleep_end_time: if self.shutdown_event.is_set(): logger.info("Shutdown requested during proxy cooldown sleep.") break time.sleep(0.2) def update_proxy_finish_time(self, proxy_url): """Updates the last finish time for a proxy.""" with self.lock: if not proxy_url: return self.state.setdefault('proxy_last_finish_time', {})[proxy_url] = time.time() def print_summary(self, policy=None): """Print a summary of the test run.""" with self.lock: # --- Cumulative Stats from State --- now = time.time() rate_trackers = self.state.get('rate_limit_trackers', {}) if rate_trackers: logger.info("\n--- Cumulative Rate Summary (All Runs, updated at end of run) ---") logger.info("This shows the total number of requests/downloads over various time windows, including previous runs.") fetch_trackers = {k: v for k, v in rate_trackers.items() if not k.startswith('download_')} download_trackers = {k: v for k, v in rate_trackers.items() if k.startswith('download_')} def print_tracker_stats(trackers, tracker_type): if not trackers: logger.info(f"No historical {tracker_type} trackers found.") return logger.info(f"Historical {tracker_type} Trackers:") for key, timestamps in sorted(trackers.items()): windows = { 'last 10 min': 600, 'last 60 min': 3600, 'last 6 hours': 21600, 'last 24 hours': 86400 } rates_str_parts = [] for name, seconds in windows.items(): count = sum(1 for ts in timestamps if now - ts <= seconds) rate_rpm = (count / seconds) * 60 if seconds > 0 else 0 rates_str_parts.append(f"{count} in {name} ({rate_rpm:.2f}/min)") # Clean up key for display display_key = key.replace('download_', '').replace('per_ip', 'all_proxies/ips') logger.info(f" - Tracker '{display_key}': " + ", ".join(rates_str_parts)) print_tracker_stats(fetch_trackers, "Fetch Request") print_tracker_stats(download_trackers, "Download Attempt") if not self.events: logger.info("\nNo new events were recorded in this session.") return duration = time.time() - self.start_time fetch_events = [e for e in self.events if e.get('type') == 'fetch'] batch_fetch_events = [e for e in self.events if e.get('type') == 'fetch_batch'] download_events = [e for e in self.events if e.get('type') not in ['fetch', 'fetch_batch']] logger.info("\n--- Test Summary (This Run) ---") logger.info(f"Total duration: {duration:.2f} seconds") # Check for batch mode stats from state if self.state.get('total_batches_processed', 0) > 0: logger.info(f"Total batches processed (cumulative): {self.state['total_batches_processed']}") logger.info(f" - Successful: {self.state['successful_batches']}") logger.info(f" - Failed: {self.state['failed_batches']}") logger.info(f"Total videos processed (cumulative): {self.state['total_videos_processed']}") else: logger.info(f"Total info.json requests (cumulative): {self.get_request_count()}") if policy: logger.info("\n--- Test Configuration ---") settings = policy.get('settings', {}) d_policy = policy.get('download_policy', {}) if settings.get('urls_file'): logger.info(f"URL source file: {settings['urls_file']}") if settings.get('info_json_dir'): logger.info(f"Info.json source dir: {settings['info_json_dir']}") if d_policy: logger.info(f"Download formats: {d_policy.get('formats', 'N/A')}") if d_policy.get('downloader'): logger.info(f"Downloader: {d_policy.get('downloader')}") if d_policy.get('downloader_args'): logger.info(f"Downloader args: {d_policy.get('downloader_args')}") if d_policy.get('pause_before_download_seconds'): logger.info(f"Pause before download: {d_policy.get('pause_before_download_seconds')}s") if d_policy.get('sleep_between_formats'): sleep_cfg = d_policy.get('sleep_between_formats') logger.info(f"Sleep between formats: {sleep_cfg.get('min_seconds', 0)}-{sleep_cfg.get('max_seconds', 0)}s") if fetch_events: total_fetches = len(fetch_events) successful_fetches = sum(1 for e in fetch_events if e['success']) cancelled_fetches = sum(1 for e in fetch_events if e.get('error_type') == 'Cancelled') failed_fetches = total_fetches - successful_fetches - cancelled_fetches logger.info("\n--- Fetch Summary (This Run) ---") logger.info(f"Total info.json fetch attempts: {total_fetches}") logger.info(f" - Successful: {successful_fetches}") logger.info(f" - Failed: {failed_fetches}") if cancelled_fetches > 0: logger.info(f" - Cancelled: {cancelled_fetches}") completed_fetches = successful_fetches + failed_fetches if completed_fetches > 0: success_rate = (successful_fetches / completed_fetches) * 100 logger.info(f"Success rate (of completed): {success_rate:.2f}%") elif total_fetches > 0: logger.info("Success rate: N/A (no tasks completed)") if duration > 1 and total_fetches > 0: rpm = (total_fetches / duration) * 60 logger.info(f"Actual fetch rate: {rpm:.2f} requests/minute") if failed_fetches > 0: error_counts = collections.Counter( e.get('error_type', 'Unknown') for e in fetch_events if not e['success'] and e.get('error_type') != 'Cancelled' ) logger.info("Failure breakdown:") for error_type, count in sorted(error_counts.items()): logger.info(f" - {error_type}: {count}") profile_counts = collections.Counter(e.get('profile') for e in fetch_events if e.get('profile')) if profile_counts: logger.info("Requests per profile:") for profile, count in sorted(profile_counts.items()): logger.info(f" - {profile}: {count}") proxy_counts = collections.Counter(e.get('proxy_url') for e in fetch_events if e.get('proxy_url')) if proxy_counts: logger.info("Requests per proxy:") for proxy, count in sorted(proxy_counts.items()): logger.info(f" - {proxy}: {count}") if batch_fetch_events: total_batches = len(batch_fetch_events) successful_batches = sum(1 for e in batch_fetch_events if e['success']) failed_batches = total_batches - successful_batches total_videos_this_run = sum(e.get('video_count', 0) for e in batch_fetch_events) logger.info("\n--- Batch Fetch Summary (This Run) ---") logger.info(f"Total batches processed: {total_batches}") logger.info(f"Total videos processed: {total_videos_this_run}") logger.info(f" - Successful batches: {successful_batches}") logger.info(f" - Failed batches: {failed_batches}") profile_counts = collections.Counter(e.get('profile') for e in batch_fetch_events if e.get('profile')) if profile_counts: logger.info("Batches per profile:") for profile, count in sorted(profile_counts.items()): logger.info(f" - {profile}: {count}") proxy_counts = collections.Counter(e.get('proxy_url') for e in batch_fetch_events if e.get('proxy_url')) if proxy_counts: logger.info("Batches per proxy:") for proxy, count in sorted(proxy_counts.items()): logger.info(f" - {proxy}: {count}") if download_events: total_attempts = len(download_events) successes = sum(1 for e in download_events if e['success']) cancelled = sum(1 for e in download_events if e.get('error_type') == 'Cancelled') failures = total_attempts - successes - cancelled # --- Profile Association for Download Events --- download_profiles = [e.get('profile') for e in download_events] # For download_only mode, we might need to fall back to regex extraction # if the profile wasn't passed down (e.g., no profile grouping). profile_regex = None if policy: settings = policy.get('settings', {}) if settings.get('mode') == 'download_only': profile_regex = settings.get('profile_extraction_regex') if profile_regex: for i, e in enumerate(download_events): if not download_profiles[i]: # If profile wasn't set in the event path = Path(e.get('path', '')) match = re.search(profile_regex, path.name) if match and match.groups(): download_profiles[i] = match.group(1) # Replace any remaining Nones with 'unknown_profile' download_profiles = [p or 'unknown_profile' for p in download_profiles] num_profiles_used = len(set(p for p in download_profiles if p != 'unknown_profile')) logger.info("\n--- Download Summary (This Run) ---") if policy: workers = policy.get('execution_control', {}).get('workers', 'N/A') logger.info(f"Workers configured: {workers}") logger.info(f"Profiles utilized for downloads: {num_profiles_used}") logger.info(f"Total download attempts: {total_attempts}") logger.info(f" - Successful: {successes}") logger.info(f" - Failed: {failures}") if cancelled > 0: logger.info(f" - Cancelled: {cancelled}") completed_downloads = successes + failures if completed_downloads > 0: success_rate = (successes / completed_downloads) * 100 logger.info(f"Success rate (of completed): {success_rate:.2f}%") elif total_attempts > 0: logger.info("Success rate: N/A (no tasks completed)") duration_hours = duration / 3600.0 if duration > 1 and total_attempts > 0: dpm = (total_attempts / duration) * 60 logger.info(f"Actual overall download rate: {dpm:.2f} attempts/minute") total_bytes = sum(e.get('downloaded_bytes', 0) for e in download_events if e['success']) if total_bytes > 0: logger.info(f"Total data downloaded: {sp_utils.format_size(total_bytes)}") if failures > 0: error_counts = collections.Counter( e.get('error_type', 'Unknown') for e in download_events if not e['success'] and e.get('error_type') != 'Cancelled' ) logger.info("Failure breakdown:") for error_type, count in sorted(error_counts.items()): logger.info(f" - {error_type}: {count}") # Add profile to each download event for easier counting for i, e in enumerate(download_events): e['profile'] = download_profiles[i] profile_counts = collections.Counter(e.get('profile') for e in download_events if e.get('profile')) if profile_counts: logger.info("Downloads per profile:") for profile, count in sorted(profile_counts.items()): rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0 logger.info(f" - {profile}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)") proxy_counts = collections.Counter(e.get('proxy_url') for e in download_events if e.get('proxy_url')) if proxy_counts: logger.info("Downloads per proxy:") for proxy, count in sorted(proxy_counts.items()): rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0 logger.info(f" - {proxy}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)") logger.info("--------------------")