510 lines
20 KiB
Python

import collections.abc
import json
import logging
import os
import random
import re
import shlex
import sys
import time
from copy import deepcopy
from pathlib import Path
from urllib.parse import urlparse, parse_qs
try:
import yaml
except ImportError:
print("PyYAML is not installed. Please install it with: pip install PyYAML", file=sys.stderr)
sys.exit(1)
# Absolute directory containing this module; anchor for locating project files.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# This makes the project root the parent directory of 'ytops_client'
# (i.e. two levels above this file).
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
def get_video_id(url: str) -> str:
    """Extracts a YouTube video ID from a URL.

    Accepts watch URLs (?v=...), youtu.be short links, or a bare 11-char ID.
    Returns "unknown_video_id" when nothing matches.
    """
    # Try the two common URL shapes first.
    for pattern in (r"v=([0-9A-Za-z_-]{11})", r"youtu\.be\/([0-9A-Za-z_-]{11})"):
        found = re.search(pattern, url)
        if found:
            return found.group(1)
    # Finally, accept the input itself if it already is a bare video ID.
    return url if re.fullmatch(r'[0-9A-Za-z_-]{11}', url) else "unknown_video_id"
def get_display_name(path_or_url):
    """Returns a clean name for logging, either a filename or a video ID."""
    # Path objects already carry a clean basename.
    if isinstance(path_or_url, Path):
        return path_or_url.name
    as_text = str(path_or_url)
    extracted_id = get_video_id(as_text)
    # Prefer the video ID when one was recognized, else fall back to basename.
    return extracted_id if extracted_id != "unknown_video_id" else Path(as_text).name
def format_size(b):
    """Format size in bytes to human-readable string.

    Returns 'N/A' for None; bare bytes below 1 KiB; otherwise two decimal
    places with a KiB/MiB/GiB suffix (GiB is the largest unit used).
    """
    if b is None:
        return 'N/A'
    if b < 1024:
        return f"{b}B"
    # Walk up the binary-prefix units; GiB is the terminal bucket.
    for unit, power in (('KiB', 1), ('MiB', 2), ('GiB', 3)):
        if unit == 'GiB' or b < 1024 ** (power + 1):
            return f"{b / 1024 ** power:.2f}{unit}"
def flatten_dict(d, parent_key='', sep='.'):
    """Flattens a nested dictionary.

    Nested mapping keys are joined with `sep` into dotted compound keys;
    non-mapping values are kept as-is.
    """
    flat = {}
    for key, value in d.items():
        compound_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, collections.abc.MutableMapping):
            # Recurse into nested mappings, carrying the compound prefix.
            flat.update(flatten_dict(value, compound_key, sep=sep))
        else:
            flat[compound_key] = value
    return flat
def print_policy_overrides(policy):
    """Prints all policy values as a single-line of --set arguments."""
    # Work on a copy so the caller's policy is untouched; 'name' is not an
    # override, so it is dropped before flattening.
    scrubbed = deepcopy(policy)
    scrubbed.pop('name', None)

    def _render(v):
        # Serialize a value the way the --set parser expects to read it back.
        if v is None:
            return 'null'
        if isinstance(v, bool):
            return str(v).lower()
        if isinstance(v, (list, dict)):
            # Use compact JSON for lists/dicts
            return json.dumps(v, separators=(',', ':'))
        return str(v)

    # Use shlex.quote to handle spaces and special characters safely.
    rendered = [
        f"--set {shlex.quote(f'{key}={_render(value)}')}"
        for key, value in sorted(flatten_dict(scrubbed).items())
    ]
    print(' '.join(rendered))
def _config_dict_to_flags_file_content(config_dict: dict) -> str:
"""Converts a dictionary of yt-dlp options to a string for a config file."""
config_lines = []
for key, value in config_dict.items():
flag = f'--{key.replace("_", "-")}'
if isinstance(value, bool):
if value:
config_lines.append(flag)
elif isinstance(value, list):
# Special case for --use-extractors which takes a comma-separated list
if key == 'use-extractors':
config_lines.append(flag)
config_lines.append(','.join(map(str, value)))
else: # Assume other lists mean repeated flags
for item in value:
config_lines.append(flag)
config_lines.append(str(item))
elif isinstance(value, dict): # Primarily for extractor-args
for sub_key, sub_value in value.items():
if isinstance(sub_value, str) and ';' in sub_value:
# Support user-friendly format: semicolon-separated values
items = [item.strip() for item in sub_value.split(';')]
for item in items:
if item: # Avoid empty strings
config_lines.append(flag)
config_lines.append(f"{sub_key}:{item}")
elif isinstance(sub_value, list):
for item in sub_value:
config_lines.append(flag)
config_lines.append(f"{sub_key}:{item}")
else:
config_lines.append(flag)
config_lines.append(f"{sub_key}:{sub_value}")
else:
config_lines.append(flag)
value_str = str(value)
# yt-dlp config files support quoting arguments.
# Let's quote any string that contains spaces to be safe.
if isinstance(value, str) and ' ' in value_str:
value_str = f'"{value_str}"'
config_lines.append(value_str)
return '\n'.join(config_lines)
def _config_dict_to_cli_flags(config_dict: dict) -> list:
"""Converts a dictionary of yt-dlp options to a list of command-line arguments."""
args = []
for key, value in config_dict.items():
flag = f'--{key.replace("_", "-")}'
if isinstance(value, bool):
if value:
args.append(flag)
elif isinstance(value, list):
if key == 'use-extractors':
args.append(flag)
args.append(','.join(map(str, value)))
else:
for item in value:
args.append(flag)
args.append(str(item))
elif isinstance(value, dict):
for sub_key, sub_value in value.items():
if isinstance(sub_value, str) and ';' in sub_value:
items = [item.strip() for item in sub_value.split(';')]
for item in items:
if item:
args.append(flag)
args.append(f"{sub_key}:{item}")
elif isinstance(sub_value, list):
for item in sub_value:
args.append(flag)
args.append(f"{sub_key}:{item}")
else:
args.append(flag)
args.append(f"{sub_key}:{sub_value}")
else:
args.append(flag)
args.append(str(value))
return args
def _parse_config_file_to_cli_args(content: str) -> list:
"""
Parses yt-dlp config file content into a list of command-line arguments.
This is a best-effort parser for logging purposes.
"""
args = []
lines = content.splitlines()
for line in lines:
line = line.strip()
if not line or line.startswith('#'):
continue
# yt-dlp config files can have options and values on separate lines.
# This simple parser assumes one argument per line (e.g., '--proxy', 'http://...').
# shlex.split is good for handling quoted arguments on a single line.
try:
parts = shlex.split(line)
args.extend(parts)
except ValueError:
# Fallback for unterminated quotes or other shlex errors
args.extend(line.split())
return args
def check_url_expiry(url: str, time_shift_minutes: int):
    """
    Checks a single URL for expiration, considering a time shift.
    Returns a tuple: (status, time_left_seconds)
    status can be 'valid', 'expired', or 'no_expiry_info'.
    A URL is considered 'expired' if it has expired or will expire within the time_shift_minutes.
    """
    query = parse_qs(urlparse(url).query)
    raw_expiry = query.get('expire', [None])[0]
    # Without a numeric 'expire' query parameter we cannot judge freshness.
    if not raw_expiry or not raw_expiry.isdigit():
        return 'no_expiry_info', float('inf')
    remaining = int(raw_expiry) - time.time()
    # Treat anything inside the shift window as already expired.
    if remaining <= time_shift_minutes * 60:
        return 'expired', remaining
    return 'valid', remaining
def generate_user_agent_from_policy(policy):
    """
    Generates a User-Agent string based on settings in the policy.
    Checks 'direct_docker_cli_policy' and 'direct_batch_cli_policy'.
    Falls back to a default if no policy is provided.
    """
    # Either policy key may carry the UA settings; the first non-empty wins.
    cli_policy = policy.get('direct_docker_cli_policy', {}) or policy.get('direct_batch_cli_policy', {})
    template = cli_policy.get('user_agent_template')
    bounds = cli_policy.get('user_agent_version_range')
    if template and bounds and isinstance(bounds, list) and len(bounds) == 2:
        # Randomize the browser major version within the configured range.
        chosen_version = random.randint(bounds[0], bounds[1])
        return template.format(major_version=chosen_version)
    # Fallback to a generic UA if policy is not configured
    return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
def update_dict(d, u):
    """Recursively update a dictionary.

    Values from `u` are written into `d` in place; nested mappings are
    merged rather than replaced. Returns the mutated `d`.
    """
    for key, incoming in u.items():
        if isinstance(incoming, collections.abc.Mapping):
            # Merge nested mappings instead of overwriting them wholesale.
            d[key] = update_dict(d.get(key, {}), incoming)
        else:
            d[key] = incoming
    return d
def load_policy(policy_file, policy_name=None):
    """Load a policy from a YAML file.

    When `policy_name` is given, scans every YAML document in the file for
    one whose 'name' matches; otherwise returns the first document.
    Exits the process with status 1 on I/O, parse, or lookup failure.
    """
    logger = logging.getLogger(__name__)
    try:
        with open(policy_file, 'r', encoding='utf-8') as f:
            if policy_name is None:
                # No name requested: the first document is the policy.
                return yaml.safe_load(f)
            # A name was requested: scan all documents for a match.
            for doc in yaml.safe_load_all(f):
                if isinstance(doc, dict) and doc.get('name') == policy_name:
                    return doc
            raise ValueError(f"Policy '{policy_name}' not found in {policy_file}")
    except (IOError, yaml.YAMLError, ValueError) as e:
        logger.error(f"Failed to load policy file {policy_file}: {e}")
        sys.exit(1)
def apply_overrides(policy, overrides):
    """Apply command-line overrides to the policy.

    Each override is a 'dotted.key=value' string; intermediate dicts are
    created as needed. Exits with status 1 on a malformed override.
    Returns the mutated policy.
    """
    logger = logging.getLogger(__name__)

    def _coerce(raw):
        # Bracketed values are parsed as YAML lists/dicts; other values get
        # scalar coercion (true/false/null/int/float) and fall back to str.
        if (raw.startswith('[') and raw.endswith(']')) or \
           (raw.startswith('{') and raw.endswith('}')):
            try:
                return yaml.safe_load(raw)
            except yaml.YAMLError:
                logger.warning(f"Could not parse override value '{raw}' as YAML. Treating as a string.")
                return raw
        lowered = raw.lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False
        if lowered == 'null':
            return None
        for converter in (int, float):
            try:
                return converter(raw)
            except ValueError:
                continue
        return raw  # Keep as string

    for override in overrides:
        try:
            dotted_key, raw_value = override.split('=', 1)
        except ValueError:
            logger.error(f"Invalid override format: '{override}'. Use 'key.subkey=value'.")
            sys.exit(1)
        key_path = dotted_key.split('.')
        target = policy
        # Walk/create intermediate dicts down to the final key's parent.
        for segment in key_path[:-1]:
            target = target.setdefault(segment, {})
        target[key_path[-1]] = _coerce(raw_value)
    return policy
def display_effective_policy(policy, name, args, sources=None, profile_names=None, original_workers_setting=None):
    """Prints a human-readable summary of the effective policy.

    Args:
        policy: Fully-resolved policy dict (after overrides).
        name: Display name used in the log header.
        args: Parsed CLI namespace; only 'profile_prefix' is read here.
        sources: Optional list of source URLs/tasks; enables rate estimation.
        profile_names: Optional list of profile names to summarize.
        original_workers_setting: Pass 'auto' when workers were auto-derived
            so the display shows both the keyword and the computed value.

    Side effects: logs the summary and sleeps 2 seconds at the end so the
    user can read it before the run starts.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"--- Effective Policy: {name} ---")
    settings = policy.get('settings', {})
    exec_control = policy.get('execution_control', {})
    orchestration_mode = settings.get('orchestration_mode')
    logger.info(f"Mode: {settings.get('mode', 'full_stack')}")
    if args and args.profile_prefix:
        logger.info(f"Profile Prefix (from CLI): {args.profile_prefix}")
    if profile_names:
        num_profiles = len(profile_names)
        logger.info(f"Profiles found: {num_profiles}")
        if num_profiles > 0:
            # Sort profiles for consistent display, show top 10
            sorted_profiles = sorted(profile_names)
            profiles_to_show = sorted_profiles[:10]
            logger.info(f" (e.g., {', '.join(profiles_to_show)}{'...' if num_profiles > 10 else ''})")
    workers_display = str(exec_control.get('workers', 1))
    if original_workers_setting == 'auto':
        workers_display = f"auto (calculated: {workers_display})"
    logger.info(f"Workers: {workers_display}")
    sleep_cfg = exec_control.get('sleep_between_tasks', {})
    sleep_min = sleep_cfg.get('min_seconds')
    if sleep_min is not None:
        sleep_max = sleep_cfg.get('max_seconds')
        # Missing max means a fixed (non-random) sleep equal to min.
        if sleep_max is None:
            sleep_max = sleep_min
        if sleep_max < sleep_min:
            logger.info(f"Sleep between tasks (per worker): {sleep_max}s (fixed; max < min)")
        elif sleep_max > sleep_min:
            logger.info(f"Sleep between tasks (per worker): {sleep_min}-{sleep_max}s (random)")
        else:
            logger.info(f"Sleep between tasks (per worker): {sleep_min}s")
    run_until = exec_control.get('run_until', {})
    run_conditions = []
    if 'minutes' in run_until:
        run_conditions.append(f"for {run_until['minutes']} minutes")
    if 'requests' in run_until:
        run_conditions.append(f"until {run_until['requests']} total requests")
    if 'cycles' in run_until:
        run_conditions.append(f"for {run_until['cycles']} cycles")
    if run_conditions:
        logger.info(f"Run condition: Stop after running {' or '.join(run_conditions)}.")
        if 'minutes' in run_until and 'cycles' not in run_until:
            logger.info("Will continuously cycle through sources until time limit is reached.")
    elif orchestration_mode in ['direct_batch_cli', 'direct_download_cli', 'direct_docker_cli']:
        # Direct CLI modes implicitly stop after one pass over the sources.
        logger.info("Run condition: Stop after all source URLs/tasks have been processed once.")
    else:
        logger.warning("WARNING: No 'run_until' condition is set. This test will run forever unless stopped manually.")
        logger.info("Run condition: No stop condition defined, will run indefinitely (until Ctrl+C).")
    # --- Rate Calculation ---
    if sources:
        workers = exec_control.get('workers', 1)
        num_sources = len(profile_names) if profile_names else len(sources)
        min_sleep = sleep_cfg.get('min_seconds', 0)
        max_sleep = sleep_cfg.get('max_seconds') or min_sleep
        avg_sleep_per_task = (min_sleep + max_sleep) / 2
        # Assume an average task duration. This is a major assumption.
        mode = settings.get('mode', 'full_stack')
        assumptions = exec_control.get('assumptions', {})
        assumed_fetch_duration = 0
        if mode in ['full_stack', 'fetch_only']:
            assumed_fetch_duration = assumptions.get('fetch_task_duration', 12 if mode == 'full_stack' else 3)
        assumed_download_duration = 0
        if mode in ['full_stack', 'download_only']:
            # This assumes the total time to download all formats for a single source.
            assumed_download_duration = assumptions.get('download_task_duration', 60)
        total_assumed_task_duration = assumed_fetch_duration + assumed_download_duration
        if workers > 0 and total_assumed_task_duration > 0:
            total_time_per_task = total_assumed_task_duration + avg_sleep_per_task
            tasks_per_minute_per_worker = 60 / total_time_per_task
            total_tasks_per_minute = tasks_per_minute_per_worker * workers
            logger.info("--- Rate Estimation ---")
            logger.info(f"Source count: {num_sources}")
            if mode in ['full_stack', 'fetch_only']:
                logger.info(f"Est. fetch time per source: {assumed_fetch_duration}s (override via execution_control.assumptions.fetch_task_duration)")
            if mode in ['full_stack', 'download_only']:
                logger.info(f"Est. download time per source: {assumed_download_duration}s (override via execution_control.assumptions.download_task_duration)")
                logger.info(" (Note: This assumes total time for all formats per source)")
            logger.info(f"Est. sleep per task: {avg_sleep_per_task:.1f}s")
            logger.info(f"==> Expected task rate: ~{total_tasks_per_minute:.2f} tasks/minute ({workers} workers * {tasks_per_minute_per_worker:.2f} tasks/min/worker)")
            target_rate_cfg = exec_control.get('target_rate', {})
            target_reqs = target_rate_cfg.get('requests')
            target_mins = target_rate_cfg.get('per_minutes')
            if target_reqs and target_mins:
                target_rpm = target_reqs / target_mins
                logger.info(f"Target rate: {target_rpm:.2f} tasks/minute")
                # 0.8 factor: warn only when the shortfall is significant.
                if total_tasks_per_minute < target_rpm * 0.8:
                    logger.warning("Warning: Expected rate is significantly lower than target rate.")
                    logger.warning("Consider increasing workers, reducing sleep, or checking task performance.")
    logger.info("---------------------------------")
    time.sleep(2)  # Give user time to read
def list_policies():
    """Scans the policies directory and prints a list of available policies.

    Looks under <project root>/policies for *.yaml files, splits each into
    YAML documents, and for every document with a 'name:' key prints the
    name, a description recovered from the '#' comment block immediately
    above the name line, and a ready-to-copy usage hint.

    Returns:
        0 on success (including "nothing found"), 1 if the policies
        directory does not exist.
    """
    policies_dir = os.path.join(_PROJECT_ROOT, 'policies')
    if not os.path.isdir(policies_dir):
        print(f"Error: Policies directory not found at '{policies_dir}'", file=sys.stderr)
        return 1
    print("Available Policies:")
    print("=" * 20)
    policy_files = sorted(Path(policies_dir).glob('*.yaml'))
    if not policy_files:
        print("No policy files (.yaml) found.")
        return 0
    for policy_file in policy_files:
        print(f"\n--- File: {policy_file.relative_to(_PROJECT_ROOT)} ---")
        try:
            with open(policy_file, 'r', encoding='utf-8') as f:
                content = f.read()
            # Split into documents. The separator is a line that is exactly '---'.
            documents = re.split(r'^\-\-\-$', content, flags=re.MULTILINE)
            found_any_in_file = False
            for doc in documents:
                doc = doc.strip()
                if not doc:
                    continue
                lines = doc.split('\n')
                policy_name = None
                description_lines = []
                # Find name and description
                for i, line in enumerate(lines):
                    if line.strip().startswith('name:'):
                        policy_name = line.split(':', 1)[1].strip()
                        # Look backwards for comments: the contiguous '#' block
                        # directly above 'name:' is treated as the description.
                        j = i - 1
                        current_desc_block = []
                        while j >= 0 and lines[j].strip().startswith('#'):
                            comment = lines[j].strip().lstrip('#').strip()
                            current_desc_block.insert(0, comment)
                            j -= 1
                        if current_desc_block:
                            description_lines = current_desc_block
                        break
                if policy_name:
                    found_any_in_file = True
                    print(f" - Name: {policy_name}")
                    if description_lines:
                        # Heuristic to clean up "Policy: " prefix
                        if description_lines[0].lower().startswith('policy:'):
                            description_lines[0] = description_lines[0][len('policy:'):].strip()
                        print(f" Description: {description_lines[0]}")
                        for desc_line in description_lines[1:]:
                            print(f" {desc_line}")
                    else:
                        print(" Description: (No description found)")
                    relative_path = policy_file.relative_to(_PROJECT_ROOT)
                    print(f" Usage: --policy {relative_path} --policy-name {policy_name}")
            if not found_any_in_file:
                print(" (No named policies found in this file)")
        except Exception as e:
            # Best-effort listing: one bad file should not abort the scan.
            print(f" Error parsing {policy_file.name}: {e}")
    return 0