795 lines
39 KiB
Python
795 lines
39 KiB
Python
import collections
|
|
import collections.abc
|
|
import json
|
|
import logging
|
|
import re
|
|
import threading
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
from . import utils as sp_utils
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class StateManager:
|
|
"""Tracks statistics, manages rate limits, and persists state across runs."""
|
|
    def __init__(self, policy_name, disable_log_writing=False, shutdown_event=None):
        """Set up file paths, in-memory state, then load persisted state and open logs.

        Args:
            policy_name: Base name used to derive the on-disk state/stats filenames
                (``<policy_name>_state.json`` and ``<policy_name>_stats.jsonl``).
            disable_log_writing: When True, nothing is read from or written to disk.
            shutdown_event: Optional threading.Event used to interrupt long sleeps;
                a private Event is created if none is given.
        """
        self.disable_log_writing = disable_log_writing
        self.state_file_path = Path(f"{policy_name}_state.json")
        self.stats_file_path = Path(f"{policy_name}_stats.jsonl")
        # RLock: public methods may call other locking methods re-entrantly.
        self.lock = threading.RLock()
        self.start_time = time.time()
        self.shutdown_event = shutdown_event or threading.Event()
        # In-memory event log for this run only (also mirrored to the stats file).
        self.events = []
        # Persisted state; every key here is also backfilled by _load_state().
        self.state = {
            'global_request_count': 0,
            'rate_limit_trackers': {},  # e.g., {'per_ip': [ts1, ts2], 'profile_foo': [ts3, ts4]}
            'profile_request_counts': {},  # for client rotation
            'profile_last_refresh_time': {},  # for client rotation
            'proxy_last_finish_time': {},  # for per-proxy sleep
            'processed_files': [],  # For continuous download_only mode
            # For dynamic profile cooldown strategy
            'profile_cooldown_counts': {},
            'profile_cooldown_sleep_until': {},
            'profile_pool_size': 0,
            'profile_run_suffix': None,
            'worker_profile_generations': {},
            'last_url_index': 0,
            # For batch modes
            'total_batches_processed': 0,
            'successful_batches': 0,
            'failed_batches': 0,
            'total_videos_processed': 0,
        }
        self.stats_file_handle = None
        # Order matters: load persisted state, report on it, then open the
        # append-mode stats log for this run.
        self._load_state()
        self.print_historical_summary()
        self._open_stats_log()
|
|
|
|
def _load_state(self):
|
|
if self.disable_log_writing:
|
|
logger.info("Log writing is disabled. State will not be loaded from disk.")
|
|
return
|
|
if not self.state_file_path.exists():
|
|
logger.info(f"State file not found at '{self.state_file_path}', starting fresh.")
|
|
return
|
|
try:
|
|
with open(self.state_file_path, 'r', encoding='utf-8') as f:
|
|
self.state = json.load(f)
|
|
# Ensure keys exist
|
|
self.state.setdefault('global_request_count', 0)
|
|
self.state.setdefault('rate_limit_trackers', {})
|
|
self.state.setdefault('profile_request_counts', {})
|
|
self.state.setdefault('profile_last_refresh_time', {})
|
|
self.state.setdefault('proxy_last_finish_time', {})
|
|
self.state.setdefault('processed_files', [])
|
|
# For dynamic profile cooldown strategy
|
|
self.state.setdefault('profile_cooldown_counts', {})
|
|
self.state.setdefault('profile_cooldown_sleep_until', {})
|
|
self.state.setdefault('profile_pool_size', 0)
|
|
self.state.setdefault('profile_run_suffix', None)
|
|
self.state.setdefault('worker_profile_generations', {})
|
|
self.state.setdefault('last_url_index', 0)
|
|
# For batch modes
|
|
self.state.setdefault('total_batches_processed', 0)
|
|
self.state.setdefault('successful_batches', 0)
|
|
self.state.setdefault('failed_batches', 0)
|
|
self.state.setdefault('total_videos_processed', 0)
|
|
logger.info(f"Loaded state from {self.state_file_path}")
|
|
except (IOError, json.JSONDecodeError) as e:
|
|
logger.error(f"Could not load or parse state file {self.state_file_path}: {e}. Starting fresh.")
|
|
|
|
def _save_state(self):
|
|
if self.disable_log_writing:
|
|
return
|
|
with self.lock:
|
|
try:
|
|
with open(self.state_file_path, 'w', encoding='utf-8') as f:
|
|
json.dump(self.state, f, indent=2)
|
|
logger.info(f"Saved state to {self.state_file_path}")
|
|
except IOError as e:
|
|
logger.error(f"Could not save state to {self.state_file_path}: {e}")
|
|
|
|
def _open_stats_log(self):
|
|
if self.disable_log_writing:
|
|
return
|
|
try:
|
|
self.stats_file_handle = open(self.stats_file_path, 'a', encoding='utf-8')
|
|
except IOError as e:
|
|
logger.error(f"Could not open stats file {self.stats_file_path}: {e}")
|
|
|
|
def close(self):
|
|
"""Saves state and closes file handles."""
|
|
self._save_state()
|
|
if self.stats_file_handle:
|
|
self.stats_file_handle.close()
|
|
self.stats_file_handle = None
|
|
|
|
def mark_file_as_processed(self, file_path):
|
|
"""Adds a file path to the list of processed files in the state."""
|
|
with self.lock:
|
|
# Using a list and checking for existence is fine for moderate numbers of files.
|
|
# A set isn't JSON serializable.
|
|
processed = self.state.setdefault('processed_files', [])
|
|
file_str = str(file_path)
|
|
if file_str not in processed:
|
|
processed.append(file_str)
|
|
|
|
def get_last_url_index(self):
|
|
"""Gets the last URL index to start from."""
|
|
with self.lock:
|
|
return self.state.get('last_url_index', 0)
|
|
|
|
def get_next_url_batch(self, count, urls_list):
|
|
"""Gets the next batch of URLs to process, updating the state."""
|
|
with self.lock:
|
|
start_index = self.state.get('last_url_index', 0)
|
|
if start_index >= len(urls_list):
|
|
return [], start_index # No more URLs
|
|
|
|
end_index = start_index + count
|
|
batch = urls_list[start_index:end_index]
|
|
|
|
# Update state with the index of the *next* URL to be processed.
|
|
self.state['last_url_index'] = end_index
|
|
return batch, start_index
|
|
|
|
def update_last_url_index(self, index, force=False):
|
|
"""Updates the last processed URL index in the state.
|
|
|
|
Args:
|
|
index: The index of the *next* URL to process.
|
|
force: If True, sets the index regardless of the current value.
|
|
"""
|
|
with self.lock:
|
|
if force or index > self.state.get('last_url_index', 0):
|
|
self.state['last_url_index'] = index
|
|
|
|
def get_processed_files(self):
|
|
"""Returns a set of file paths that have been processed."""
|
|
with self.lock:
|
|
return set(self.state.get('processed_files', []))
|
|
|
|
def record_batch_result(self, success, video_count, profile_name=None):
|
|
with self.lock:
|
|
self.state['total_batches_processed'] = self.state.get('total_batches_processed', 0) + 1
|
|
self.state['total_videos_processed'] = self.state.get('total_videos_processed', 0) + video_count
|
|
if success:
|
|
self.state['successful_batches'] = self.state.get('successful_batches', 0) + 1
|
|
else:
|
|
self.state['failed_batches'] = self.state.get('failed_batches', 0) + 1
|
|
|
|
# Print live counter
|
|
total = self.state['total_batches_processed']
|
|
ok = self.state['successful_batches']
|
|
fail = self.state['failed_batches']
|
|
profile_log = f" [{profile_name}]" if profile_name else ""
|
|
logger.info(f"Batch #{total} complete.{profile_log} (Total OK: {ok}, Total Fail: {fail})")
|
|
|
|
    def print_historical_summary(self):
        """Prints a summary based on the state loaded from disk, before new events.

        Called from __init__ right after _load_state(), so the counts reflect
        previous runs only. Output goes to the logger; nothing is returned.
        """
        with self.lock:
            now = time.time()
            rate_trackers = self.state.get('rate_limit_trackers', {})
            total_requests = self.state.get('global_request_count', 0)

            # Fresh state file (or first run): nothing to report.
            if not rate_trackers and not total_requests:
                logger.info("No historical data found in state file.")
                return

            logger.info("\n--- Summary From Previous Runs ---")
            logger.info(f"Total info.json requests (all previous runs): {total_requests}")

            if rate_trackers:
                for key, timestamps in sorted(rate_trackers.items()):
                    # Time windows in seconds
                    windows = {
                        'last 10 min': 600,
                        'last 60 min': 3600,
                        'last 6 hours': 21600,
                        'last 24 hours': 86400
                    }

                    rates_str_parts = []
                    for name, seconds in windows.items():
                        count = sum(1 for ts in timestamps if now - ts <= seconds)
                        # Calculate rate in requests per minute
                        # (averaged over the whole window, not just the active span).
                        rate_rpm = (count / seconds) * 60 if seconds > 0 else 0
                        rates_str_parts.append(f"{count} req in {name} ({rate_rpm:.2f} rpm)")

                    logger.info(f"Tracker '{key}': " + ", ".join(rates_str_parts))
            logger.info("------------------------------------")
|
|
|
|
def log_event(self, event_data):
|
|
with self.lock:
|
|
event_data['timestamp'] = datetime.now().isoformat()
|
|
self.events.append(event_data)
|
|
if self.stats_file_handle:
|
|
self.stats_file_handle.write(json.dumps(event_data) + '\n')
|
|
self.stats_file_handle.flush()
|
|
|
|
def get_request_count(self):
|
|
with self.lock:
|
|
return self.state.get('global_request_count', 0)
|
|
|
|
def increment_request_count(self):
|
|
with self.lock:
|
|
self.state['global_request_count'] = self.state.get('global_request_count', 0) + 1
|
|
|
|
def check_cumulative_error_rate(self, max_errors, per_minutes, error_type=None):
|
|
"""
|
|
Checks if a cumulative error rate has been exceeded.
|
|
If error_type is None, checks for any failure.
|
|
Returns the number of errors found if the threshold is met, otherwise 0.
|
|
"""
|
|
with self.lock:
|
|
now = time.time()
|
|
window_seconds = per_minutes * 60
|
|
|
|
if error_type:
|
|
recent_errors = [
|
|
e for e in self.events
|
|
if e.get('error_type') == error_type and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds
|
|
]
|
|
else: # Generic failure check
|
|
recent_errors = [
|
|
e for e in self.events
|
|
# Only count failures that are not explicitly tolerated
|
|
if not e.get('success') and not e.get('is_tolerated_error') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds
|
|
]
|
|
|
|
if len(recent_errors) >= max_errors:
|
|
return len(recent_errors)
|
|
return 0
|
|
|
|
def check_quality_degradation_rate(self, max_triggers, per_minutes):
|
|
"""
|
|
Checks if the quality degradation trigger rate has been exceeded.
|
|
Returns the number of triggers found if the threshold is met, otherwise 0.
|
|
"""
|
|
with self.lock:
|
|
now = time.time()
|
|
window_seconds = per_minutes * 60
|
|
|
|
recent_triggers = [
|
|
e for e in self.events
|
|
if e.get('quality_degradation_trigger') and (now - datetime.fromisoformat(e['timestamp']).timestamp()) <= window_seconds
|
|
]
|
|
|
|
if len(recent_triggers) >= max_triggers:
|
|
return len(recent_triggers)
|
|
return 0
|
|
|
|
    def check_and_update_rate_limit(self, profile_name, policy):
        """
        Checks if a request is allowed based on policy rate limits.
        If allowed, updates the internal state. Returns True if allowed, False otherwise.

        Limits are read from policy['info_json_generation_policy']['rate_limits']:
        'per_ip' applies globally, 'per_profile' to this profile. A limit is only
        enforced when both 'max_requests' and 'per_minutes' are set and truthy.
        Note: trackers are pruned as a side effect even when the request is
        ultimately denied by a later check.
        """
        with self.lock:
            now = time.time()
            gen_policy = policy.get('info_json_generation_policy', {})
            rate_limits = gen_policy.get('rate_limits', {})

            # Check per-IP limit
            ip_limit = rate_limits.get('per_ip')
            if ip_limit:
                tracker_key = 'per_ip'
                max_req = ip_limit.get('max_requests')
                period_min = ip_limit.get('per_minutes')
                if max_req and period_min:
                    timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
                    # Filter out old timestamps
                    timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
                    if len(timestamps) >= max_req:
                        logger.warning("Per-IP rate limit reached. Skipping task.")
                        return False
                    # Write the pruned list back so the tracker doesn't grow forever.
                    self.state['rate_limit_trackers'][tracker_key] = timestamps

            # Check per-profile limit
            profile_limit = rate_limits.get('per_profile')
            if profile_limit and profile_name:
                tracker_key = f"profile_{profile_name}"
                max_req = profile_limit.get('max_requests')
                period_min = profile_limit.get('per_minutes')
                if max_req and period_min:
                    timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
                    timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
                    if len(timestamps) >= max_req:
                        logger.warning(f"Per-profile rate limit for '{profile_name}' reached. Skipping task.")
                        return False
                    self.state['rate_limit_trackers'][tracker_key] = timestamps

            # If all checks pass, record the new request timestamp for all relevant trackers
            if ip_limit and ip_limit.get('max_requests'):
                self.state['rate_limit_trackers'].setdefault('per_ip', []).append(now)
            if profile_limit and profile_limit.get('max_requests') and profile_name:
                self.state['rate_limit_trackers'].setdefault(f"profile_{profile_name}", []).append(now)

            return True
|
|
|
|
    def get_client_for_request(self, profile_name, gen_policy):
        """
        Determines which client to use based on the client_rotation_policy.
        Returns a tuple: (client_name, request_params_dict).

        Without a rotation policy, returns gen_policy['client'] and
        gen_policy['request_params']. With one, returns the 'refresh_client'
        whenever either the time-based or request-count-based refresh condition
        is met (resetting the profile's counters), otherwise the 'major_client'
        (incrementing the profile's request counter as a side effect).
        """
        with self.lock:
            rotation_policy = gen_policy.get('client_rotation_policy')

            # If no rotation policy, use the simple 'client' key.
            if not rotation_policy:
                client = gen_policy.get('client')
                logger.info(f"Using client '{client}' for profile '{profile_name}'.")
                req_params = gen_policy.get('request_params')
                return client, req_params

            # --- Rotation logic ---
            now = time.time()
            major_client = rotation_policy.get('major_client')
            refresh_client = rotation_policy.get('refresh_client')
            refresh_every = rotation_policy.get('refresh_every', {})

            # Rotation configured but incomplete: always use the major client.
            if not refresh_client or not refresh_every:
                return major_client, rotation_policy.get('major_client_params')

            should_refresh = False

            # Check time-based refresh
            refresh_minutes = refresh_every.get('minutes')
            last_refresh_time = self.state['profile_last_refresh_time'].get(profile_name, 0)
            if refresh_minutes and (now - last_refresh_time) > (refresh_minutes * 60):
                should_refresh = True

            # Check request-count-based refresh
            refresh_requests = refresh_every.get('requests')
            request_count = self.state['profile_request_counts'].get(profile_name, 0)
            if refresh_requests and request_count >= refresh_requests:
                should_refresh = True

            if should_refresh:
                logger.info(f"Profile '{profile_name}' is due for a refresh. Using refresh client '{refresh_client}'.")
                self.state['profile_last_refresh_time'][profile_name] = now
                self.state['profile_request_counts'][profile_name] = 0  # Reset counter
                return refresh_client, rotation_policy.get('refresh_client_params')
            else:
                # Not refreshing, so increment request count for this profile
                self.state['profile_request_counts'][profile_name] = request_count + 1
                return major_client, rotation_policy.get('major_client_params')
|
|
|
|
    def get_next_available_profile(self, policy):
        """
        Finds or creates an available profile based on the dynamic cooldown policy.
        Returns a profile name, or None if no profile is available.

        Reads policy['settings']['profile_management']: requires 'prefix';
        honors 'suffix' ('auto' derives a timestamp, persisted for the run),
        'initial_pool_size', 'max_requests_per_profile',
        'sleep_minutes_on_exhaustion' and 'auto_expand_pool'.
        Side effects: may persist the run suffix / pool size, put exhausted
        profiles to sleep, and grow the pool.
        """
        with self.lock:
            now = time.time()
            settings = policy.get('settings', {})
            pm_policy = settings.get('profile_management')

            if not pm_policy:
                return None

            prefix = pm_policy.get('prefix')
            if not prefix:
                logger.error("Profile management policy requires 'prefix'.")
                return None

            # Determine and persist the suffix for this run to ensure profile names are stable
            run_suffix = self.state.get('profile_run_suffix')
            if not run_suffix:
                suffix_config = pm_policy.get('suffix')
                if suffix_config == 'auto':
                    run_suffix = datetime.now().strftime('%Y%m%d%H%M')
                else:
                    run_suffix = suffix_config or ''
                self.state['profile_run_suffix'] = run_suffix

            # Initialize pool size from policy if not already in state
            if self.state.get('profile_pool_size', 0) == 0:
                self.state['profile_pool_size'] = pm_policy.get('initial_pool_size', 1)

            max_reqs = pm_policy.get('max_requests_per_profile')
            sleep_mins = pm_policy.get('sleep_minutes_on_exhaustion')

            # Loop until a profile is found or we decide we can't find one
            while True:
                # Try to find an existing, available profile
                for i in range(self.state['profile_pool_size']):
                    profile_name = f"{prefix}_{run_suffix}_{i}" if run_suffix else f"{prefix}_{i}"

                    # Check if sleeping
                    sleep_until = self.state['profile_cooldown_sleep_until'].get(profile_name, 0)
                    if now < sleep_until:
                        continue  # Still sleeping

                    # Check if it needs to be put to sleep
                    req_count = self.state['profile_cooldown_counts'].get(profile_name, 0)
                    if max_reqs and req_count >= max_reqs:
                        # sleep_mins may be unset; 0 means the profile wakes immediately
                        # on the next call (count is reset below either way).
                        sleep_duration_seconds = (sleep_mins or 0) * 60
                        self.state['profile_cooldown_sleep_until'][profile_name] = now + sleep_duration_seconds
                        self.state['profile_cooldown_counts'][profile_name] = 0  # Reset count for next time
                        logger.info(f"Profile '{profile_name}' reached request limit ({req_count}/{max_reqs}). Putting to sleep for {sleep_mins} minutes.")
                        continue  # Now sleeping, try next profile

                    # This profile is available
                    logger.info(f"Selected available profile '{profile_name}' (request count: {req_count}/{max_reqs if max_reqs else 'unlimited'}).")
                    return profile_name

                # If we get here, no existing profile was available
                if pm_policy.get('auto_expand_pool'):
                    new_profile_index = self.state['profile_pool_size']
                    self.state['profile_pool_size'] += 1
                    profile_name = f"{prefix}_{run_suffix}_{new_profile_index}" if run_suffix else f"{prefix}_{new_profile_index}"
                    logger.info(f"Profile pool exhausted. Expanding pool to size {self.state['profile_pool_size']}. New profile: '{profile_name}'")
                    return profile_name
                else:
                    # No available profiles and pool expansion is disabled
                    return None
|
|
|
|
def get_or_rotate_worker_profile(self, worker_id, policy):
|
|
"""
|
|
Gets the current profile for a worker, rotating to a new generation if the lifetime limit is met.
|
|
This is used by the 'per_worker_with_rotation' profile mode.
|
|
"""
|
|
with self.lock:
|
|
pm_policy = policy.get('settings', {}).get('profile_management', {})
|
|
if not pm_policy:
|
|
logger.error("Profile mode 'per_worker_with_rotation' requires 'settings.profile_management' configuration in the policy.")
|
|
return f"error_profile_{worker_id}"
|
|
|
|
prefix = pm_policy.get('prefix')
|
|
if not prefix:
|
|
logger.error("Profile management for 'per_worker_with_rotation' requires a 'prefix'.")
|
|
return f"error_profile_{worker_id}"
|
|
|
|
max_reqs = pm_policy.get('max_requests_per_profile')
|
|
|
|
generations = self.state.setdefault('worker_profile_generations', {})
|
|
# worker_id is an int, but JSON keys must be strings
|
|
worker_id_str = str(worker_id)
|
|
current_gen = generations.get(worker_id_str, 0)
|
|
|
|
profile_name = f"{prefix}_{worker_id}_{current_gen}"
|
|
|
|
if not max_reqs: # No lifetime limit defined, so never rotate.
|
|
return profile_name
|
|
|
|
req_count = self.state.get('profile_cooldown_counts', {}).get(profile_name, 0)
|
|
|
|
if req_count >= max_reqs:
|
|
logger.info(f"Profile '{profile_name}' reached lifetime request limit ({req_count}/{max_reqs}). Rotating to new generation for worker {worker_id}.")
|
|
new_gen = current_gen + 1
|
|
generations[worker_id_str] = new_gen
|
|
# The request counts for the old profile are implicitly left behind.
|
|
# The new profile will start with a count of 0.
|
|
profile_name = f"{prefix}_{worker_id}_{new_gen}"
|
|
|
|
return profile_name
|
|
|
|
def record_profile_request(self, profile_name):
|
|
"""Increments the request counter for a profile for the cooldown policy."""
|
|
with self.lock:
|
|
if not profile_name:
|
|
return
|
|
counts = self.state.setdefault('profile_cooldown_counts', {})
|
|
counts[profile_name] = counts.get(profile_name, 0) + 1
|
|
|
|
def record_proxy_usage(self, proxy_url):
|
|
"""Records a request timestamp for a given proxy URL for statistical purposes."""
|
|
if not proxy_url:
|
|
return
|
|
with self.lock:
|
|
now = time.time()
|
|
# Use a prefix to avoid collisions with profile names or other keys
|
|
tracker_key = f"proxy_{proxy_url}"
|
|
self.state['rate_limit_trackers'].setdefault(tracker_key, []).append(now)
|
|
|
|
    def check_and_update_download_rate_limit(self, proxy_url, policy):
        """Checks download rate limits. Returns True if allowed, False otherwise.

        Mirrors check_and_update_rate_limit() but reads
        policy['download_policy']['rate_limits'] ('per_ip' / 'per_proxy') and
        uses 'download_'-prefixed tracker keys so download counts never mix
        with info.json fetch counts.
        """
        with self.lock:
            now = time.time()
            d_policy = policy.get('download_policy', {})
            rate_limits = d_policy.get('rate_limits', {})

            # Check per-IP limit
            ip_limit = rate_limits.get('per_ip')
            if ip_limit:
                tracker_key = 'download_per_ip'  # Use a distinct key
                max_req = ip_limit.get('max_requests')
                period_min = ip_limit.get('per_minutes')
                if max_req and period_min:
                    timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
                    # Drop entries older than the window before counting.
                    timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
                    if len(timestamps) >= max_req:
                        logger.warning("Per-IP download rate limit reached. Skipping task.")
                        return False
                    self.state['rate_limit_trackers'][tracker_key] = timestamps

            # Check per-proxy limit
            proxy_limit = rate_limits.get('per_proxy')
            if proxy_limit and proxy_url:
                tracker_key = f"download_proxy_{proxy_url}"
                max_req = proxy_limit.get('max_requests')
                period_min = proxy_limit.get('per_minutes')
                if max_req and period_min:
                    timestamps = self.state['rate_limit_trackers'].get(tracker_key, [])
                    timestamps = [ts for ts in timestamps if now - ts < period_min * 60]
                    if len(timestamps) >= max_req:
                        logger.warning(f"Per-proxy download rate limit for '{proxy_url}' reached. Skipping task.")
                        return False
                    self.state['rate_limit_trackers'][tracker_key] = timestamps

            # If all checks pass, record the new request timestamp for all relevant trackers
            if ip_limit and ip_limit.get('max_requests'):
                self.state['rate_limit_trackers'].setdefault('download_per_ip', []).append(now)
            if proxy_limit and proxy_limit.get('max_requests') and proxy_url:
                self.state['rate_limit_trackers'].setdefault(f"download_proxy_{proxy_url}", []).append(now)

            return True
|
|
|
|
def wait_for_proxy_cooldown(self, proxy_url, policy):
|
|
"""If a per-proxy sleep is defined, wait until the cooldown period has passed."""
|
|
with self.lock:
|
|
d_policy = policy.get('download_policy', {})
|
|
sleep_duration = d_policy.get('sleep_per_proxy_seconds', 0)
|
|
if not proxy_url or not sleep_duration > 0:
|
|
return
|
|
|
|
last_finish = self.state.setdefault('proxy_last_finish_time', {}).get(proxy_url, 0)
|
|
elapsed = time.time() - last_finish
|
|
|
|
if elapsed < sleep_duration:
|
|
time_to_sleep = sleep_duration - elapsed
|
|
logger.info(f"Proxy '{proxy_url}' was used recently. Sleeping for {time_to_sleep:.2f}s.")
|
|
# Interruptible sleep
|
|
sleep_end_time = time.time() + time_to_sleep
|
|
while time.time() < sleep_end_time:
|
|
if self.shutdown_event.is_set():
|
|
logger.info("Shutdown requested during proxy cooldown sleep.")
|
|
break
|
|
time.sleep(0.2)
|
|
|
|
def update_proxy_finish_time(self, proxy_url):
|
|
"""Updates the last finish time for a proxy."""
|
|
with self.lock:
|
|
if not proxy_url:
|
|
return
|
|
self.state.setdefault('proxy_last_finish_time', {})[proxy_url] = time.time()
|
|
|
|
    def print_summary(self, policy=None):
        """Print a summary of the test run.

        Emits (via the logger) a cumulative rate section built from persisted
        trackers, then per-run fetch / batch-fetch / download sections derived
        from self.events. Passing the policy adds a configuration section.
        Side effect: download events get their 'profile' key filled in.
        """
        with self.lock:
            # --- Cumulative Stats from State ---
            now = time.time()
            rate_trackers = self.state.get('rate_limit_trackers', {})
            if rate_trackers:
                logger.info("\n--- Cumulative Rate Summary (All Runs, updated at end of run) ---")
                logger.info("This shows the total number of requests/downloads over various time windows, including previous runs.")

                # Download trackers are distinguished by their 'download_' key prefix.
                fetch_trackers = {k: v for k, v in rate_trackers.items() if not k.startswith('download_')}
                download_trackers = {k: v for k, v in rate_trackers.items() if k.startswith('download_')}

                def print_tracker_stats(trackers, tracker_type):
                    # Log per-tracker counts/rates over fixed lookback windows.
                    if not trackers:
                        logger.info(f"No historical {tracker_type} trackers found.")
                        return

                    logger.info(f"Historical {tracker_type} Trackers:")
                    for key, timestamps in sorted(trackers.items()):
                        windows = {
                            'last 10 min': 600, 'last 60 min': 3600,
                            'last 6 hours': 21600, 'last 24 hours': 86400
                        }
                        rates_str_parts = []
                        for name, seconds in windows.items():
                            count = sum(1 for ts in timestamps if now - ts <= seconds)
                            # Rate averaged over the whole window.
                            rate_rpm = (count / seconds) * 60 if seconds > 0 else 0
                            rates_str_parts.append(f"{count} in {name} ({rate_rpm:.2f}/min)")

                        # Clean up key for display
                        display_key = key.replace('download_', '').replace('per_ip', 'all_proxies/ips')
                        logger.info(f" - Tracker '{display_key}': " + ", ".join(rates_str_parts))

                print_tracker_stats(fetch_trackers, "Fetch Request")
                print_tracker_stats(download_trackers, "Download Attempt")

            # Nothing happened this session: stop after the cumulative section.
            if not self.events:
                logger.info("\nNo new events were recorded in this session.")
                return

            duration = time.time() - self.start_time
            # Partition this run's events by type; anything that isn't a fetch
            # or batch fetch is treated as a download attempt.
            fetch_events = [e for e in self.events if e.get('type') == 'fetch']
            batch_fetch_events = [e for e in self.events if e.get('type') == 'fetch_batch']
            download_events = [e for e in self.events if e.get('type') not in ['fetch', 'fetch_batch']]

            logger.info("\n--- Test Summary (This Run) ---")
            logger.info(f"Total duration: {duration:.2f} seconds")

            # Check for batch mode stats from state
            if self.state.get('total_batches_processed', 0) > 0:
                logger.info(f"Total batches processed (cumulative): {self.state['total_batches_processed']}")
                logger.info(f" - Successful: {self.state['successful_batches']}")
                logger.info(f" - Failed: {self.state['failed_batches']}")
                logger.info(f"Total videos processed (cumulative): {self.state['total_videos_processed']}")
            else:
                logger.info(f"Total info.json requests (cumulative): {self.get_request_count()}")

            if policy:
                logger.info("\n--- Test Configuration ---")
                settings = policy.get('settings', {})
                d_policy = policy.get('download_policy', {})

                if settings.get('urls_file'):
                    logger.info(f"URL source file: {settings['urls_file']}")
                if settings.get('info_json_dir'):
                    logger.info(f"Info.json source dir: {settings['info_json_dir']}")

                if d_policy:
                    logger.info(f"Download formats: {d_policy.get('formats', 'N/A')}")
                    if d_policy.get('downloader'):
                        logger.info(f"Downloader: {d_policy.get('downloader')}")
                    if d_policy.get('downloader_args'):
                        logger.info(f"Downloader args: {d_policy.get('downloader_args')}")
                    if d_policy.get('pause_before_download_seconds'):
                        logger.info(f"Pause before download: {d_policy.get('pause_before_download_seconds')}s")
                    if d_policy.get('sleep_between_formats'):
                        sleep_cfg = d_policy.get('sleep_between_formats')
                        logger.info(f"Sleep between formats: {sleep_cfg.get('min_seconds', 0)}-{sleep_cfg.get('max_seconds', 0)}s")

            if fetch_events:
                total_fetches = len(fetch_events)
                successful_fetches = sum(1 for e in fetch_events if e['success'])
                cancelled_fetches = sum(1 for e in fetch_events if e.get('error_type') == 'Cancelled')
                # Cancelled tasks are neither successes nor failures.
                failed_fetches = total_fetches - successful_fetches - cancelled_fetches

                logger.info("\n--- Fetch Summary (This Run) ---")
                logger.info(f"Total info.json fetch attempts: {total_fetches}")
                logger.info(f" - Successful: {successful_fetches}")
                logger.info(f" - Failed: {failed_fetches}")
                if cancelled_fetches > 0:
                    logger.info(f" - Cancelled: {cancelled_fetches}")

                completed_fetches = successful_fetches + failed_fetches
                if completed_fetches > 0:
                    success_rate = (successful_fetches / completed_fetches) * 100
                    logger.info(f"Success rate (of completed): {success_rate:.2f}%")
                elif total_fetches > 0:
                    logger.info("Success rate: N/A (no tasks completed)")

                if duration > 1 and total_fetches > 0:
                    rpm = (total_fetches / duration) * 60
                    logger.info(f"Actual fetch rate: {rpm:.2f} requests/minute")

                if failed_fetches > 0:
                    error_counts = collections.Counter(
                        e.get('error_type', 'Unknown')
                        for e in fetch_events if not e['success'] and e.get('error_type') != 'Cancelled'
                    )
                    logger.info("Failure breakdown:")
                    for error_type, count in sorted(error_counts.items()):
                        logger.info(f" - {error_type}: {count}")

                profile_counts = collections.Counter(e.get('profile') for e in fetch_events if e.get('profile'))
                if profile_counts:
                    logger.info("Requests per profile:")
                    for profile, count in sorted(profile_counts.items()):
                        logger.info(f" - {profile}: {count}")

                proxy_counts = collections.Counter(e.get('proxy_url') for e in fetch_events if e.get('proxy_url'))
                if proxy_counts:
                    logger.info("Requests per proxy:")
                    for proxy, count in sorted(proxy_counts.items()):
                        logger.info(f" - {proxy}: {count}")

            if batch_fetch_events:
                total_batches = len(batch_fetch_events)
                successful_batches = sum(1 for e in batch_fetch_events if e['success'])
                failed_batches = total_batches - successful_batches
                total_videos_this_run = sum(e.get('video_count', 0) for e in batch_fetch_events)

                logger.info("\n--- Batch Fetch Summary (This Run) ---")
                logger.info(f"Total batches processed: {total_batches}")
                logger.info(f"Total videos processed: {total_videos_this_run}")
                logger.info(f" - Successful batches: {successful_batches}")
                logger.info(f" - Failed batches: {failed_batches}")

                profile_counts = collections.Counter(e.get('profile') for e in batch_fetch_events if e.get('profile'))
                if profile_counts:
                    logger.info("Batches per profile:")
                    for profile, count in sorted(profile_counts.items()):
                        logger.info(f" - {profile}: {count}")

                proxy_counts = collections.Counter(e.get('proxy_url') for e in batch_fetch_events if e.get('proxy_url'))
                if proxy_counts:
                    logger.info("Batches per proxy:")
                    for proxy, count in sorted(proxy_counts.items()):
                        logger.info(f" - {proxy}: {count}")

            if download_events:
                total_attempts = len(download_events)
                successes = sum(1 for e in download_events if e['success'])
                cancelled = sum(1 for e in download_events if e.get('error_type') == 'Cancelled')
                failures = total_attempts - successes - cancelled

                # --- Profile Association for Download Events ---
                download_profiles = [e.get('profile') for e in download_events]

                # For download_only mode, we might need to fall back to regex extraction
                # if the profile wasn't passed down (e.g., no profile grouping).
                profile_regex = None
                if policy:
                    settings = policy.get('settings', {})
                    if settings.get('mode') == 'download_only':
                        profile_regex = settings.get('profile_extraction_regex')

                if profile_regex:
                    for i, e in enumerate(download_events):
                        if not download_profiles[i]:  # If profile wasn't set in the event
                            path = Path(e.get('path', ''))
                            match = re.search(profile_regex, path.name)
                            if match and match.groups():
                                download_profiles[i] = match.group(1)

                # Replace any remaining Nones with 'unknown_profile'
                download_profiles = [p or 'unknown_profile' for p in download_profiles]

                num_profiles_used = len(set(p for p in download_profiles if p != 'unknown_profile'))

                logger.info("\n--- Download Summary (This Run) ---")
                if policy:
                    workers = policy.get('execution_control', {}).get('workers', 'N/A')
                    logger.info(f"Workers configured: {workers}")

                logger.info(f"Profiles utilized for downloads: {num_profiles_used}")
                logger.info(f"Total download attempts: {total_attempts}")
                logger.info(f" - Successful: {successes}")
                logger.info(f" - Failed: {failures}")
                if cancelled > 0:
                    logger.info(f" - Cancelled: {cancelled}")

                completed_downloads = successes + failures
                if completed_downloads > 0:
                    success_rate = (successes / completed_downloads) * 100
                    logger.info(f"Success rate (of completed): {success_rate:.2f}%")
                elif total_attempts > 0:
                    logger.info("Success rate: N/A (no tasks completed)")

                duration_hours = duration / 3600.0
                if duration > 1 and total_attempts > 0:
                    dpm = (total_attempts / duration) * 60
                    logger.info(f"Actual overall download rate: {dpm:.2f} attempts/minute")

                total_bytes = sum(e.get('downloaded_bytes', 0) for e in download_events if e['success'])
                if total_bytes > 0:
                    logger.info(f"Total data downloaded: {sp_utils.format_size(total_bytes)}")

                if failures > 0:
                    error_counts = collections.Counter(
                        e.get('error_type', 'Unknown')
                        for e in download_events if not e['success'] and e.get('error_type') != 'Cancelled'
                    )
                    logger.info("Failure breakdown:")
                    for error_type, count in sorted(error_counts.items()):
                        logger.info(f" - {error_type}: {count}")

                # Add profile to each download event for easier counting
                for i, e in enumerate(download_events):
                    e['profile'] = download_profiles[i]

                profile_counts = collections.Counter(e.get('profile') for e in download_events if e.get('profile'))
                if profile_counts:
                    logger.info("Downloads per profile:")
                    for profile, count in sorted(profile_counts.items()):
                        rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0
                        logger.info(f" - {profile}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)")

                proxy_counts = collections.Counter(e.get('proxy_url') for e in download_events if e.get('proxy_url'))
                if proxy_counts:
                    logger.info("Downloads per proxy:")
                    for proxy, count in sorted(proxy_counts.items()):
                        rate_per_hour = (count / duration_hours) if duration_hours > 0 else 0
                        logger.info(f" - {proxy}: {count} attempts (avg this run: {rate_per_hour:.2f}/hour)")

            logger.info("--------------------")
|