# NOTE(review): removed stray paste-artifact lines ("795 lines", "39 KiB",
# "Python") — they are file metadata, not valid Python source.

import collections
import collections.abc
import json
import logging
import re
import threading
import time
from datetime import datetime
from pathlib import Path
from . import utils as sp_utils
logger = logging.getLogger(__name__)
class StateManager:
"""Tracks statistics, manages rate limits, and persists state across runs."""
def __init__(self, policy_name, disable_log_writing=False, shutdown_event=None):
    """Initialize paths, counters, and persisted state for one policy run.

    Args:
        policy_name: Basename used to derive the on-disk state/stats file names.
        disable_log_writing: When True, nothing is read from or written to disk.
        shutdown_event: Optional threading.Event used to abort cooldown sleeps.
    """
    self.disable_log_writing = disable_log_writing
    # Persisted JSON state (counters/trackers) and append-only JSONL event log.
    self.state_file_path = Path(f"{policy_name}_state.json")
    self.stats_file_path = Path(f"{policy_name}_stats.jsonl")
    # RLock: public methods take the lock and may call other locked methods.
    self.lock = threading.RLock()
    self.start_time = time.time()
    self.shutdown_event = shutdown_event or threading.Event()
    # In-memory events for this session only (not reloaded from disk).
    self.events = []
    # Defaults; _load_state() merges any persisted values over these.
    self.state = {
        'global_request_count': 0,
        'rate_limit_trackers': {},  # e.g., {'per_ip': [ts1, ts2], 'profile_foo': [ts3, ts4]}
        'profile_request_counts': {},  # for client rotation
        'profile_last_refresh_time': {},  # for client rotation
        'proxy_last_finish_time': {},  # for per-proxy sleep
        'processed_files': [],  # For continuous download_only mode
        # For dynamic profile cooldown strategy
        'profile_cooldown_counts': {},
        'profile_cooldown_sleep_until': {},
        'profile_pool_size': 0,
        'profile_run_suffix': None,
        'worker_profile_generations': {},
        'last_url_index': 0,
        # For batch modes
        'total_batches_processed': 0,
        'successful_batches': 0,
        'failed_batches': 0,
        'total_videos_processed': 0,
    }
    self.stats_file_handle = None
    # Order matters: load persisted state, report it, then open the stats log.
    self._load_state()
    self.print_historical_summary()
    self._open_stats_log()
def _load_state(self):
if self.disable_log_writing:
logger.info("Log writing is disabled. State will not be loaded from disk.")
return
if not self.state_file_path.exists():
logger.info(f"State file not found at '{self.state_file_path}', starting fresh.")
return
try:
with open(self.state_file_path, 'r', encoding='utf-8') as f:
self.state = json.load(f)
# Ensure keys exist
self.state.setdefault('global_request_count', 0)
self.state.setdefault('rate_limit_trackers', {})
self.state.setdefault('profile_request_counts', {})
self.state.setdefault('profile_last_refresh_time', {})
self.state.setdefault('proxy_last_finish_time', {})
self.state.setdefault('processed_files', [])
# For dynamic profile cooldown strategy
self.state.setdefault('profile_cooldown_counts', {})
self.state.setdefault('profile_cooldown_sleep_until', {})
self.state.setdefault('profile_pool_size', 0)
self.state.setdefault('profile_run_suffix', None)
self.state.setdefault('worker_profile_generations', {})
self.state.setdefault('last_url_index', 0)
# For batch modes
self.state.setdefault('total_batches_processed', 0)
self.state.setdefault('successful_batches', 0)
self.state.setdefault('failed_batches', 0)
self.state.setdefault('total_videos_processed', 0)
logger.info(f"Loaded state from {self.state_file_path}")
except (IOError, json.JSONDecodeError) as e:
logger.error(f"Could not load or parse state file {self.state_file_path}: {e}. Starting fresh.")
def _save_state(self):
if self.disable_log_writing:
return
with self.lock:
try:
with open(self.state_file_path, 'w', encoding='utf-8') as f:
json.dump(self.state, f, indent=2)
logger.info(f"Saved state to {self.state_file_path}")
except IOError as e:
logger.error(f"Could not save state to {self.state_file_path}: {e}")
def _open_stats_log(self):
if self.disable_log_writing:
return
try:
self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8')
except IOError as e:
logger.error(f"Could not open stats file {self.stats_file_path}: {e}")
def close(self):
"""Saves state and closes file handles."""
self._save_state()
if self.stats_file_handle:
self.stats_file_handle.close()
self.stats_file_handle = None
def mark_file_as_processed(self, file_path):
"""Adds a file path to the list of processed files in the state."""
with self.lock:
# Using a list and checking for existence is fine for moderate numbers of files.
# A set isn't JSON serializable.
processed = self.state.setdefault('processed_files', [])
file_str = str(file_path)
if file_str not in processed:
processed.append(file_str)
def get_last_url_index(self):
"""Gets the last URL index to start from."""
with self.lock:
return self.state.get('last_url_index', 0)
def get_next_url_batch(self, count, urls_list):
"""Gets the next batch of URLs to process, updating the state."""
with self.lock:
start_index = self.state.get('last_url_index', 0)
if start_index >= len(urls_list):
return [], start_index # No more URLs
end_index = start_index + count
batch = urls_list[start_index:end_index]
# Update state with the index of the *next* URL to be processed.
self.state['last_url_index'] = end_index
return batch, start_index
def update_last_url_index(self, index, force=False):
"""Updates the last processed URL index in the state.
Args:
index: The index of the *next* URL to process.
force: If True, sets the index regardless of the current value.
"""
with self.lock:
if force or index > self.state.get('last_url_index', 0):
self.state['last_url_index'] = index
def get_processed_files(self):
"""Returns a set of file paths that have been processed."""
with self.lock:
return set(self.state.get('processed_files', []))
def record_batch_result(self, success, video_count, profile_name=None):
with self.lock:
self.state['total_batches_processed'] = self.state.get('total_batches_processed', 0) + 1
self.state['total_videos_processed'] = self.state.get('total_videos_processed', 0) + video_count
if success:
self.state['successful_batches'] = self.state.get('successful_batches', 0) + 1
else:
self.state['failed_batches'] = self.state.get('failed_batches', 0) + 1
# Print live counter
total = self.state['total_batches_processed']
ok = self.state['successful_batches']
fail = self.state['failed_batches']
profile_log = f" [{profile_name}]" if profile_name else ""
logger.info(f"Batch #{total} complete.{profile_log} (Total OK: {ok}, Total Fail: {fail})")
def print_historical_summary(self):
"""Prints a summary based on the state loaded from disk, before new events."""
with self.lock:
now = time.time()
rate_trackers = self.state.get('rate_limit_trackers', {})
total_requests = self.state.get('global_request_count', 0)
if not rate_trackers and not total_requests:
logger.info("No historical data found in state file.")
return
logger.info("\n--- Summary From Previous Runs ---")
logger.info(f"Total info.json requests (all previous runs): {total_requests}")
if rate_trackers:
for key, timestamps in sorted(rate_trackers.items()):
# Time windows in seconds
windows = {
'last 10 min': 600,
'last 60 min': 3600,
'last 6 hours': 21600,
'last 24 hours': 86400
}
rates_str_parts = []
for name, seconds in windows.items():
count = sum(1 for ts in timestamps if now - ts <= seconds)
# Calculate rate in requests per minute
rate_rpm = (count / seconds) * 60 if seconds > 0 else 0
rates_str_parts.append(f"{count} req in {name} ({rate_rpm:.2f} rpm)")
logger.info(f"Tracker '{key}': " + ", ".join(rates_str_parts))
logger.info("------------------------------------")
def log_event(self, event_data):
with self.lock:
event_data['timestamp'] = datetime.now().isoformat()
self.events.append(event_data)
if self.stats_file_handle:
self.stats_file_handle.write(json.dumps(event_data) + '\n')
self.stats_file_handle.flush()
def get_request_count(self):
with self.lock:
return self.state.get('global_request_count', 0)
def increment_request_count(self):
with self.lock:
self.state['global_request_count'] = self.state.get('global_request_count', 0) + 1
def check_cumulative_error_rate(self, max_errors, per_minutes, error_type=None):
"""
Checks if a cumulative error rate has been exceeded.
If error_type is None, checks for any failure.
Returns the number of errors found if the threshold is met, otherwise 0.
"""
with self.lock:
now = time.time()
window_seconds = per_minutes * 60
if error_type:
recent_errors = [
e for e in self.events
if e.get('error_type') == error_type and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds
]
else: # Generic failure check
recent_errors = [
e for e in self.events
# Only count failures that are not explicitly tolerated
if not e.get('success') and not e.get('is_tolerated_error') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds
]
if len(recent_errors) >= max_errors:
return len(recent_errors)
return 0
def check_quality_degradation_rate(self, max_triggers, per_minutes):
"""
Checks if the quality degradation trigger rate has been exceeded.
Returns the number of triggers found if the threshold is met, otherwise 0.
"""
with self.lock:
now = time.time()
window_seconds = per_minutes * 60
recent_triggers = [
e for e in self.events
if e.get('quality_degradation_trigger') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds
]
if len(recent_triggers) >= max_triggers:
return len(recent_triggers)
return 0
def check_and_update_rate_limit(self, profile_name, policy):
"""
Checks if a request is allowed based on policy rate limits.
If allowed, updates the internal state. Returns True if allowed, False otherwise.
"""
with self.lock:
now = time.time()
gen_policy = policy.get('info_json_generation_policy', {})
rate_limits = gen_policy.get('rate_limits', {})
# Check per-IP limit
ip_limit = rate_limits.get('per_ip')
if ip_limit:
tracker_key = 'per_ip'
max_req = ip_limit.get('max_requests')
period_min = ip_limit.get('per_minutes')
if max_req and period_min:
timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
# Filter out old timestamps
timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
if len(timestamps) >= max_req:
logger.warning("Per-IP rate limit reached. Skipping task.")
return False
self.state['rate_limit_trackers'][tracker_key] = timestamps
# Check per-profile limit
profile_limit = rate_limits.get('per_profile')
if profile_limit and profile_name:
tracker_key = f"profile_{profile_name}"
max_req = profile_limit.get('max_requests')
period_min = profile_limit.get('per_minutes')
if max_req and period_min:
timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
if len(timestamps) >= max_req:
logger.warning(f"Per-profile rate limit for '{profile_name}' reached. Skipping task.")
return False
self.state['rate_limit_trackers'][tracker_key] = timestamps
# If all checks pass, record the new request timestamp for all relevant trackers
if ip_limit and ip_limit.get('max_requests'):
self.state['rate_limit_trackers'].setdefault('per_ip', []).append(now)
if profile_limit and profile_limit.get('max_requests') and profile_name:
self.state['rate_limit_trackers'].setdefault(f"profile_{profile_name}", []).append(now)
return True
def get_client_for_request(self, profile_name, gen_policy):
"""
Determines which client to use based on the client_rotation_policy.
Returns a tuple: (client_name, request_params_dict).
"""
with self.lock:
rotation_policy = gen_policy.get('client_rotation_policy')
# If no rotation policy, use the simple 'client' key.
if not rotation_policy:
client = gen_policy.get('client')
logger.info(f"Using client '{client}' for profile '{profile_name}'.")
req_params = gen_policy.get('request_params')
return client, req_params
# --- Rotation logic ---
now = time.time()
major_client = rotation_policy.get('major_client')
refresh_client = rotation_policy.get('refresh_client')
refresh_every = rotation_policy.get('refresh_every', {})
if not refresh_client or not refresh_every:
return major_client, rotation_policy.get('major_client_params')
should_refresh = False
# Check time-based refresh
refresh_minutes = refresh_every.get('minutes')
last_refresh_time = self.state['profile_last_refresh_time'].get(profile_name, 0)
if refresh_minutes and (now - last_refresh_time) > (refresh_minutes * 60):
should_refresh = True
# Check request-count-based refresh
refresh_requests = refresh_every.get('requests')
request_count = self.state['profile_request_counts'].get(profile_name, 0)
if refresh_requests and request_count >= refresh_requests:
should_refresh = True
if should_refresh:
logger.info(f"Profile '{profile_name}' is due for a refresh. Using refresh client '{refresh_client}'.")
self.state['profile_last_refresh_time'][profile_name] = now
self.state['profile_request_counts'][profile_name] = 0 # Reset counter
return refresh_client, rotation_policy.get('refresh_client_params')
else:
# Not refreshing, so increment request count for this profile
self.state['profile_request_counts'][profile_name] = request_count + 1
return major_client, rotation_policy.get('major_client_params')
def get_next_available_profile(self, policy):
"""
Finds or creates an available profile based on the dynamic cooldown policy.
Returns a profile name, or None if no profile is available.
"""
with self.lock:
now = time.time()
settings = policy.get('settings', {})
pm_policy = settings.get('profile_management')
if not pm_policy:
return None
prefix = pm_policy.get('prefix')
if not prefix:
logger.error("Profile management policy requires 'prefix'.")
return None
# Determine and persist the suffix for this run to ensure profile names are stable
run_suffix = self.state.get('profile_run_suffix')
if not run_suffix:
suffix_config = pm_policy.get('suffix')
if suffix_config == 'auto':
run_suffix = datetime.now().strftime('%Y%m%d%H%M')
else:
run_suffix = suffix_config or ''
self.state['profile_run_suffix'] = run_suffix
# Initialize pool size from policy if not already in state
if self.state.get('profile_pool_size', 0) == 0:
self.state['profile_pool_size'] = pm_policy.get('initial_pool_size', 1)
max_reqs = pm_policy.get('max_requests_per_profile')
sleep_mins = pm_policy.get('sleep_minutes_on_exhaustion')
# Loop until a profile is found or we decide we can't find one
while True:
# Try to find an existing, available profile
for i in range(self.state['profile_pool_size']):
profile_name = f"{prefix}_{run_suffix}_{i}" if run_suffix else f"{prefix}_{i}"
# Check if sleeping
sleep_until = self.state['profile_cooldown_sleep_until'].get(profile_name, 0)
if now < sleep_until:
continue # Still sleeping
# Check if it needs to be put to sleep
req_count = self.state['profile_cooldown_counts'].get(profile_name, 0)
if max_reqs and req_count >= max_reqs:
sleep_duration_seconds = (sleep_mins or 0) * 60
self.state['profile_cooldown_sleep_until'][profile_name] = now + sleep_duration_seconds
self.state['profile_cooldown_counts'][profile_name] = 0 # Reset count for next time
logger.info(f"Profile '{profile_name}' reached request limit ({req_count}/{max_reqs}). Putting to sleep for {sleep_mins} minutes.")
continue # Now sleeping, try next profile
# This profile is available
logger.info(f"Selected available profile '{profile_name}' (request count: {req_count}/{max_reqs if max_reqs else 'unlimited'}).")
return profile_name
# If we get here, no existing profile was available
if pm_policy.get('auto_expand_pool'):
new_profile_index = self.state['profile_pool_size']
self.state['profile_pool_size'] += 1
profile_name = f"{prefix}_{run_suffix}_{new_profile_index}" if run_suffix else f"{prefix}_{new_profile_index}"
logger.info(f"Profile pool exhausted. Expanding pool to size {self.state['profile_pool_size']}. New profile: '{profile_name}'")
return profile_name
else:
# No available profiles and pool expansion is disabled
return None
def get_or_rotate_worker_profile(self, worker_id, policy):
"""
Gets the current profile for a worker, rotating to a new generation if the lifetime limit is met.
This is used by the 'per_worker_with_rotation' profile mode.
"""
with self.lock:
pm_policy = policy.get('settings', {}).get('profile_management', {})
if not pm_policy:
logger.error("Profile mode 'per_worker_with_rotation' requires 'settings.profile_management' configuration in the policy.")
return f"error_profile_{worker_id}"
prefix = pm_policy.get('prefix')
if not prefix:
logger.error("Profile management for 'per_worker_with_rotation' requires a 'prefix'.")
return f"error_profile_{worker_id}"
max_reqs = pm_policy.get('max_requests_per_profile')
generations = self.state.setdefault('worker_profile_generations', {})
# worker_id is an int, but JSON keys must be strings
worker_id_str = str(worker_id)
current_gen = generations.get(worker_id_str, 0)
profile_name = f"{prefix}_{worker_id}_{current_gen}"
if not max_reqs: # No lifetime limit defined, so never rotate.
return profile_name
req_count = self.state.get('profile_cooldown_counts', {}).get(profile_name, 0)
if req_count >= max_reqs:
logger.info(f"Profile '{profile_name}' reached lifetime request limit ({req_count}/{max_reqs}). Rotating to new generation for worker {worker_id}.")
new_gen = current_gen + 1
generations[worker_id_str] = new_gen
# The request counts for the old profile are implicitly left behind.
# The new profile will start with a count of 0.
profile_name = f"{prefix}_{worker_id}_{new_gen}"
return profile_name
def record_profile_request(self, profile_name):
"""Increments the request counter for a profile for the cooldown policy."""
with self.lock:
if not profile_name:
return
counts = self.state.setdefault('profile_cooldown_counts', {})
counts[profile_name] = counts.get(profile_name, 0) + 1
def record_proxy_usage(self, proxy_url):
"""Records a request timestamp for a given proxy URL for statistical purposes."""
if not proxy_url:
return
with self.lock:
now = time.time()
# Use a prefix to avoid collisions with profile names or other keys
tracker_key = f"proxy_{proxy_url}"
self.state['rate_limit_trackers'].setdefault(tracker_key, []).append(now)
def check_and_update_download_rate_limit(self, proxy_url, policy):
"""Checks download rate limits. Returns True if allowed, False otherwise."""
with self.lock:
now = time.time()
d_policy = policy.get('download_policy', {})
rate_limits = d_policy.get('rate_limits', {})
# Check per-IP limit
ip_limit = rate_limits.get('per_ip')
if ip_limit:
tracker_key = 'download_per_ip' # Use a distinct key
max_req = ip_limit.get('max_requests')
period_min = ip_limit.get('per_minutes')
if max_req and period_min:
timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
if len(timestamps) >= max_req:
logger.warning("Per-IP download rate limit reached. Skipping task.")
return False
self.state['rate_limit_trackers'][tracker_key] = timestamps
# Check per-proxy limit
proxy_limit = rate_limits.get('per_proxy')
if proxy_limit and proxy_url:
tracker_key = f"download_proxy_{proxy_url}"
max_req = proxy_limit.get('max_requests')
period_min = proxy_limit.get('per_minutes')
if max_req and period_min:
timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
if len(timestamps) >= max_req:
logger.warning(f"Per-proxy download rate limit for '{proxy_url}' reached. Skipping task.")
return False
self.state['rate_limit_trackers'][tracker_key] = timestamps
# If all checks pass, record the new request timestamp for all relevant trackers
if ip_limit and ip_limit.get('max_requests'):
self.state['rate_limit_trackers'].setdefault('download_per_ip', []).append(now)
if proxy_limit and proxy_limit.get('max_requests') and proxy_url:
self.state['rate_limit_trackers'].setdefault(f"download_proxy_{proxy_url}", []).append(now)
return True
def wait_for_proxy_cooldown(self, proxy_url, policy):
"""If a per-proxy sleep is defined, wait until the cooldown period has passed."""
with self.lock:
d_policy = policy.get('download_policy', {})
sleep_duration = d_policy.get('sleep_per_proxy_seconds', 0)
if not proxy_url or not sleep_duration > 0:
return
last_finish = self.state.setdefault('proxy_last_finish_time', {}).get(proxy_url, 0)
elapsed = time.time() - last_finish
if elapsed < sleep_duration:
time_to_sleep = sleep_duration - elapsed
logger.info(f"Proxy '{proxy_url}' was used recently. Sleeping for {time_to_sleep:.2f}s.")
# Interruptible sleep
sleep_end_time = time.time() + time_to_sleep
while time.time() < sleep_end_time:
if self.shutdown_event.is_set():
logger.info("Shutdown requested during proxy cooldown sleep.")
break
time.sleep(0.2)
def update_proxy_finish_time(self, proxy_url):
"""Updates the last finish time for a proxy."""
with self.lock:
if not proxy_url:
return
self.state.setdefault('proxy_last_finish_time', {})[proxy_url] = time.time()
def print_summary(self, policy=None):
    """Print a summary of the test run.

    Logs, in order: cumulative tracker rates from state (all runs), this
    session's fetch / batch-fetch / download event breakdowns, and (when
    `policy` is given) the relevant configuration.  Returns early when no new
    events were recorded this session.  Everything goes through `logger`; no
    state is persisted here, though download events gain a 'profile' key.
    """
    with self.lock:
        # --- Cumulative Stats from State ---
        now = time.time()
        rate_trackers = self.state.get('rate_limit_trackers', {})
        if rate_trackers:
            logger.info("\n--- Cumulative Rate Summary (All Runs, updated at end of run) ---")
            logger.info("This shows the total number of requests/downloads over various time windows, including previous runs.")
            # Fetch vs. download trackers are distinguished by key prefix.
            fetch_trackers = {k: v for k, v in rate_trackers.items() if not k.startswith('download_')}
            download_trackers = {k: v for k, v in rate_trackers.items() if k.startswith('download_')}
            def print_tracker_stats(trackers, tracker_type):
                # Log per-tracker counts/rates over fixed look-back windows.
                if not trackers:
                    logger.info(f"No historical {tracker_type} trackers found.")
                    return
                logger.info(f"Historical {tracker_type} Trackers:")
                for key, timestamps in sorted(trackers.items()):
                    windows = {
                        'last 10 min': 600, 'last 60 min': 3600,
                        'last 6 hours': 21600, 'last 24 hours': 86400
                    }
                    rates_str_parts = []
                    for name, seconds in windows.items():
                        count = sum(1 for ts in timestamps if now - ts <= seconds)
                        # Average events-per-minute over the whole window.
                        rate_rpm = (count / seconds) * 60 if seconds > 0 else 0
                        rates_str_parts.append(f"{count} in {name} ({rate_rpm:.2f}/min)")
                    # Clean up key for display
                    display_key = key.replace('download_', '').replace('per_ip', 'all_proxies/ips')
                    logger.info(f" - Tracker '{display_key}': " + ", ".join(rates_str_parts))
            print_tracker_stats(fetch_trackers, "Fetch Request")
            print_tracker_stats(download_trackers, "Download Attempt")
        if not self.events:
            logger.info("\nNo new events were recorded in this session.")
            return
        duration = time.time() - self.start_time
        # Partition this session's events by type; anything that is not a
        # fetch or batch-fetch is treated as a download event.
        fetch_events = [e for e in self.events if e.get('type') == 'fetch']
        batch_fetch_events = [e for e in self.events if e.get('type') == 'fetch_batch']
        download_events = [e for e in self.events if e.get('type') not in ['fetch', 'fetch_batch']]
        logger.info("\n--- Test Summary (This Run) ---")
        logger.info(f"Total duration: {duration:.2f} seconds")
        # Check for batch mode stats from state
        if self.state.get('total_batches_processed', 0) > 0:
            logger.info(f"Total batches processed (cumulative): {self.state['total_batches_processed']}")
            logger.info(f" - Successful: {self.state['successful_batches']}")
            logger.info(f" - Failed: {self.state['failed_batches']}")
            logger.info(f"Total videos processed (cumulative): {self.state['total_videos_processed']}")
        else:
            logger.info(f"Total info.json requests (cumulative): {self.get_request_count()}")
        if policy:
            # --- Configuration echo, for run reproducibility ---
            logger.info("\n--- Test Configuration ---")
            settings = policy.get('settings', {})
            d_policy = policy.get('download_policy', {})
            if settings.get('urls_file'):
                logger.info(f"URL source file: {settings['urls_file']}")
            if settings.get('info_json_dir'):
                logger.info(f"Info.json source dir: {settings['info_json_dir']}")
            if d_policy:
                logger.info(f"Download formats: {d_policy.get('formats', 'N/A')}")
                if d_policy.get('downloader'):
                    logger.info(f"Downloader: {d_policy.get('downloader')}")
                if d_policy.get('downloader_args'):
                    logger.info(f"Downloader args: {d_policy.get('downloader_args')}")
                if d_policy.get('pause_before_download_seconds'):
                    logger.info(f"Pause before download: {d_policy.get('pause_before_download_seconds')}s")
                if d_policy.get('sleep_between_formats'):
                    sleep_cfg = d_policy.get('sleep_between_formats')
                    logger.info(f"Sleep between formats: {sleep_cfg.get('min_seconds', 0)}-{sleep_cfg.get('max_seconds', 0)}s")
        if fetch_events:
            # --- Per-fetch breakdown: cancelled tasks are excluded from the
            # success-rate denominator ---
            total_fetches = len(fetch_events)
            successful_fetches = sum(1 for e in fetch_events if e['success'])
            cancelled_fetches = sum(1 for e in fetch_events if e.get('error_type') == 'Cancelled')
            failed_fetches = total_fetches - successful_fetches - cancelled_fetches
            logger.info("\n--- Fetch Summary (This Run) ---")
            logger.info(f"Total info.json fetch attempts: {total_fetches}")
            logger.info(f" - Successful: {successful_fetches}")
            logger.info(f" - Failed: {failed_fetches}")
            if cancelled_fetches > 0:
                logger.info(f" - Cancelled: {cancelled_fetches}")
            completed_fetches = successful_fetches + failed_fetches
            if completed_fetches > 0:
                success_rate = (successful_fetches / completed_fetches) * 100
                logger.info(f"Success rate (of completed): {success_rate:.2f}%")
            elif total_fetches > 0:
                logger.info("Success rate: N/A (no tasks completed)")
            if duration > 1 and total_fetches > 0:
                rpm = (total_fetches / duration) * 60
                logger.info(f"Actual fetch rate: {rpm:.2f} requests/minute")
            if failed_fetches > 0:
                error_counts = collections.Counter(
                    e.get('error_type', 'Unknown')
                    for e in fetch_events if not e['success'] and e.get('error_type') != 'Cancelled'
                )
                logger.info("Failure breakdown:")
                for error_type, count in sorted(error_counts.items()):
                    logger.info(f" - {error_type}: {count}")
            profile_counts = collections.Counter(e.get('profile') for e in fetch_events if e.get('profile'))
            if profile_counts:
                logger.info("Requests per profile:")
                for profile, count in sorted(profile_counts.items()):
                    logger.info(f" - {profile}: {count}")
            proxy_counts = collections.Counter(e.get('proxy_url') for e in fetch_events if e.get('proxy_url'))
            if proxy_counts:
                logger.info("Requests per proxy:")
                for proxy, count in sorted(proxy_counts.items()):
                    logger.info(f" - {proxy}: {count}")
        if batch_fetch_events:
            # --- Batch-fetch breakdown for this session ---
            total_batches = len(batch_fetch_events)
            successful_batches = sum(1 for e in batch_fetch_events if e['success'])
            failed_batches = total_batches - successful_batches
            total_videos_this_run = sum(e.get('video_count', 0) for e in batch_fetch_events)
            logger.info("\n--- Batch Fetch Summary (This Run) ---")
            logger.info(f"Total batches processed: {total_batches}")
            logger.info(f"Total videos processed: {total_videos_this_run}")
            logger.info(f" - Successful batches: {successful_batches}")
            logger.info(f" - Failed batches: {failed_batches}")
            profile_counts = collections.Counter(e.get('profile') for e in batch_fetch_events if e.get('profile'))
            if profile_counts:
                logger.info("Batches per profile:")
                for profile, count in sorted(profile_counts.items()):
                    logger.info(f" - {profile}: {count}")
            proxy_counts = collections.Counter(e.get('proxy_url') for e in batch_fetch_events if e.get('proxy_url'))
            if proxy_counts:
                logger.info("Batches per proxy:")
                for proxy, count in sorted(proxy_counts.items()):
                    logger.info(f" - {proxy}: {count}")
        if download_events:
            total_attempts = len(download_events)
            successes = sum(1 for e in download_events if e['success'])
            cancelled = sum(1 for e in download_events if e.get('error_type') == 'Cancelled')
            failures = total_attempts - successes - cancelled
            # --- Profile Association for Download Events ---
            download_profiles = [e.get('profile') for e in download_events]
            # For download_only mode, we might need to fall back to regex extraction
            # if the profile wasn't passed down (e.g., no profile grouping).
            profile_regex = None
            if policy:
                settings = policy.get('settings', {})
                if settings.get('mode') == 'download_only':
                    profile_regex = settings.get('profile_extraction_regex')
            if profile_regex:
                for i, e in enumerate(download_events):
                    if not download_profiles[i]:  # If profile wasn't set in the event
                        # First capture group of the regex, matched against the
                        # file name, is taken as the profile name.
                        path = Path(e.get('path', ''))
                        match = re.search(profile_regex, path.name)
                        if match and match.groups():
                            download_profiles[i] = match.group(1)
            # Replace any remaining Nones with 'unknown_profile'
            download_profiles = [p or 'unknown_profile' for p in download_profiles]
            num_profiles_used = len(set(p for p in download_profiles if p != 'unknown_profile'))
            logger.info("\n--- Download Summary (This Run) ---")
            if policy:
                workers = policy.get('execution_control', {}).get('workers', 'N/A')
                logger.info(f"Workers configured: {workers}")
            logger.info(f"Profiles utilized for downloads: {num_profiles_used}")
            logger.info(f"Total download attempts: {total_attempts}")
            logger.info(f" - Successful: {successes}")
            logger.info(f" - Failed: {failures}")
            if cancelled > 0:
                logger.info(f" - Cancelled: {cancelled}")
            completed_downloads = successes + failures
            if completed_downloads > 0:
                success_rate = (successes / completed_downloads) * 100
                logger.info(f"Success rate (of completed): {success_rate:.2f}%")
            elif total_attempts > 0:
                logger.info("Success rate: N/A (no tasks completed)")
            duration_hours = duration / 3600.0
            if duration > 1 and total_attempts > 0:
                dpm = (total_attempts / duration) * 60
                logger.info(f"Actual overall download rate: {dpm:.2f} attempts/minute")
            total_bytes = sum(e.get('downloaded_bytes', 0) for e in download_events if e['success'])
            if total_bytes > 0:
                logger.info(f"Total data downloaded: {sp_utils.format_size(total_bytes)}")
            if failures > 0:
                error_counts = collections.Counter(
                    e.get('error_type', 'Unknown')
                    for e in download_events if not e['success'] and e.get('error_type') != 'Cancelled'
                )
                logger.info("Failure breakdown:")
                for error_type, count in sorted(error_counts.items()):
                    logger.info(f" - {error_type}: {count}")
            # Add profile to each download event for easier counting
            for i, e in enumerate(download_events):
                e['profile'] = download_profiles[i]
            profile_counts = collections.Counter(e.get('profile') for e in download_events if e.get('profile'))
            if profile_counts:
                logger.info("Downloads per profile:")
                for profile, count in sorted(profile_counts.items()):
                    rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0
                    logger.info(f" - {profile}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)")
            proxy_counts = collections.Counter(e.get('proxy_url') for e in download_events if e.get('proxy_url'))
            if proxy_counts:
                logger.info("Downloads per proxy:")
                for proxy, count in sorted(proxy_counts.items()):
                    rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0
                    logger.info(f" - {proxy}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)")
        logger.info("--------------------")