import collections.abc
import json
import logging
import os
import random
import re
import shlex
import sys
import time
from copy import deepcopy
from pathlib import Path
from urllib.parse import urlparse, parse_qs

try:
    import yaml
except ImportError:
    print("PyYAML is not installed. Please install it with: pip install PyYAML", file=sys.stderr)
    sys.exit(1)

_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# This makes the project root the parent directory of 'ytops_client'
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))


def get_video_id(url: str) -> str:
    """Extracts a YouTube video ID from a URL."""
    match = re.search(r"v=([0-9A-Za-z_-]{11})", url)
    if match:
        return match.group(1)
    match = re.search(r"youtu\.be\/([0-9A-Za-z_-]{11})", url)
    if match:
        return match.group(1)
    if re.fullmatch(r'[0-9A-Za-z_-]{11}', url):
        return url
    return "unknown_video_id"

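# Illustrative examples, derived from the patterns above:
#   get_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") -> "dQw4w9WgXcQ"
#   get_video_id("https://youtu.be/dQw4w9WgXcQ")                -> "dQw4w9WgXcQ"
#   get_video_id("dQw4w9WgXcQ")                                 -> "dQw4w9WgXcQ"
#   get_video_id("not-a-url")                                   -> "unknown_video_id"
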
def get_display_name(path_or_url):
    """Returns a clean name for logging, either a filename or a video ID."""
    if isinstance(path_or_url, Path):
        return path_or_url.name

    path_str = str(path_or_url)
    video_id = get_video_id(path_str)
    if video_id != "unknown_video_id":
        return video_id

    return Path(path_str).name


def format_size(b):
    """Format a size in bytes as a human-readable string."""
    if b is None:
        return 'N/A'
    if b < 1024:
        return f"{b}B"
    elif b < 1024**2:
        return f"{b/1024:.2f}KiB"
    elif b < 1024**3:
        return f"{b/1024**2:.2f}MiB"
    else:
        return f"{b/1024**3:.2f}GiB"

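# Illustrative: format_size(512) -> "512B", format_size(1536) -> "1.50KiB",
# format_size(5 * 1024**2) -> "5.00MiB", format_size(None) -> "N/A".
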
def flatten_dict(d, parent_key='', sep='.'):
    """Flattens a nested dictionary."""
    items = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.abc.MutableMapping):
            items.update(flatten_dict(v, new_key, sep=sep))
        else:
            items[new_key] = v
    return items

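# Illustrative: flatten_dict({'a': {'b': 1}, 'c': 2}) -> {'a.b': 1, 'c': 2}.
# Nested keys are joined with the separator ('.' by default).
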
def print_policy_overrides(policy):
    """Prints all policy values as a single line of --set arguments."""
    # We don't want to include the 'name' key in the overrides.
    policy_copy = deepcopy(policy)
    policy_copy.pop('name', None)

    flat_policy = flatten_dict(policy_copy)

    set_args = []
    for key, value in sorted(flat_policy.items()):
        if value is None:
            value_str = 'null'
        elif isinstance(value, bool):
            value_str = str(value).lower()
        elif isinstance(value, (list, dict)):
            # Use compact JSON for lists/dicts
            value_str = json.dumps(value, separators=(',', ':'))
        else:
            value_str = str(value)

        # Use shlex.quote to handle spaces and special characters safely
        set_args.append(f"--set {shlex.quote(f'{key}={value_str}')}")

    print(' '.join(set_args))

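# Illustrative: a policy of {'name': 'x', 'limits': {'rate': 5}} prints
#   --set limits.rate=5
# ('name' is dropped; shlex.quote only adds quoting when a value needs it).
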
def _config_dict_to_flags_file_content(config_dict: dict) -> str:
    """Converts a dictionary of yt-dlp options to a string for a config file."""
    config_lines = []
    for key, value in config_dict.items():
        flag = f'--{key.replace("_", "-")}'
        if isinstance(value, bool):
            if value:
                config_lines.append(flag)
        elif isinstance(value, list):
            # Special case for --use-extractors, which takes a comma-separated list.
            # Match either spelling, since underscores are normalized into dashes above.
            if key.replace('_', '-') == 'use-extractors':
                config_lines.append(flag)
                config_lines.append(','.join(map(str, value)))
            else:  # Assume other lists mean repeated flags
                for item in value:
                    config_lines.append(flag)
                    config_lines.append(str(item))
        elif isinstance(value, dict):  # Primarily for extractor-args
            for sub_key, sub_value in value.items():
                if isinstance(sub_value, str) and ';' in sub_value:
                    # Support user-friendly format: semicolon-separated values
                    items = [item.strip() for item in sub_value.split(';')]
                    for item in items:
                        if item:  # Avoid empty strings
                            config_lines.append(flag)
                            config_lines.append(f"{sub_key}:{item}")
                elif isinstance(sub_value, list):
                    for item in sub_value:
                        config_lines.append(flag)
                        config_lines.append(f"{sub_key}:{item}")
                else:
                    config_lines.append(flag)
                    config_lines.append(f"{sub_key}:{sub_value}")
        else:
            config_lines.append(flag)
            value_str = str(value)
            # yt-dlp config files support quoting arguments.
            # Quote any string that contains spaces to be safe.
            if isinstance(value, str) and ' ' in value_str:
                value_str = f'"{value_str}"'
            config_lines.append(value_str)
    return '\n'.join(config_lines)

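# Illustrative (hypothetical option values):
#   {'proxy': 'http://127.0.0.1:8080', 'no-part': True,
#    'extractor-args': {'youtube': 'player_client=web'}}
# becomes the config-file lines:
#   --proxy
#   http://127.0.0.1:8080
#   --no-part
#   --extractor-args
#   youtube:player_client=web
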
def _config_dict_to_cli_flags(config_dict: dict) -> list:
    """Converts a dictionary of yt-dlp options to a list of command-line arguments."""
    args = []
    for key, value in config_dict.items():
        flag = f'--{key.replace("_", "-")}'
        if isinstance(value, bool):
            if value:
                args.append(flag)
        elif isinstance(value, list):
            # Match either spelling, since underscores are normalized into dashes above.
            if key.replace('_', '-') == 'use-extractors':
                args.append(flag)
                args.append(','.join(map(str, value)))
            else:
                for item in value:
                    args.append(flag)
                    args.append(str(item))
        elif isinstance(value, dict):
            for sub_key, sub_value in value.items():
                if isinstance(sub_value, str) and ';' in sub_value:
                    items = [item.strip() for item in sub_value.split(';')]
                    for item in items:
                        if item:
                            args.append(flag)
                            args.append(f"{sub_key}:{item}")
                elif isinstance(sub_value, list):
                    for item in sub_value:
                        args.append(flag)
                        args.append(f"{sub_key}:{item}")
                else:
                    args.append(flag)
                    args.append(f"{sub_key}:{sub_value}")
        else:
            args.append(flag)
            args.append(str(value))
    return args

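# Illustrative: the same dict as above yields the argv list
#   ['--proxy', 'http://127.0.0.1:8080', '--no-part',
#    '--extractor-args', 'youtube:player_client=web']
# (no shell quoting here, since the list is passed to the process directly).
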
def _parse_config_file_to_cli_args(content: str) -> list:
    """
    Parses yt-dlp config file content into a list of command-line arguments.
    This is a best-effort parser for logging purposes.
    """
    args = []
    lines = content.splitlines()
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # yt-dlp config files can have options and values on separate lines.
        # This simple parser assumes one argument per line (e.g., '--proxy', 'http://...').
        # shlex.split is good for handling quoted arguments on a single line.
        try:
            parts = shlex.split(line)
            args.extend(parts)
        except ValueError:
            # Fallback for unterminated quotes or other shlex errors
            args.extend(line.split())
    return args

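# Illustrative: the file content (hypothetical values)
#   --proxy "http://127.0.0.1:8080"
#   # retries
#   --retries 3
# parses to ['--proxy', 'http://127.0.0.1:8080', '--retries', '3'].
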
def check_url_expiry(url: str, time_shift_minutes: int):
    """
    Checks a single URL for expiration, considering a time shift.
    Returns a tuple: (status, time_left_seconds)
    status can be 'valid', 'expired', or 'no_expiry_info'.
    A URL is considered 'expired' if it has already expired or will expire
    within the next time_shift_minutes.
    """
    now = time.time()
    parsed = urlparse(url)
    query_params = parse_qs(parsed.query)
    expire_ts_str = query_params.get('expire', [None])[0]

    if not expire_ts_str or not expire_ts_str.isdigit():
        return 'no_expiry_info', float('inf')

    expire_ts = int(expire_ts_str)
    time_left = expire_ts - now

    if time_left <= time_shift_minutes * 60:
        return 'expired', time_left

    return 'valid', time_left

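# Illustrative usage:
#   status, left = check_url_expiry(url, time_shift_minutes=10)
# 'expired' if the URL's 'expire' timestamp is within 10 minutes (or already
# past); 'valid' otherwise; ('no_expiry_info', inf) when the URL carries no
# numeric 'expire' query parameter.
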
def generate_user_agent_from_policy(policy):
    """
    Generates a User-Agent string based on settings in the policy.
    Checks 'direct_docker_cli_policy' and 'direct_batch_cli_policy'.
    Falls back to a default if no policy is provided.
    """
    # Tolerate a missing policy, as promised by the docstring.
    policy = policy or {}
    # Check both possible policy keys for the settings.
    direct_policy = policy.get('direct_docker_cli_policy', {}) or policy.get('direct_batch_cli_policy', {})
    template = direct_policy.get('user_agent_template')
    version_range = direct_policy.get('user_agent_version_range')

    if template and version_range and isinstance(version_range, list) and len(version_range) == 2:
        major_version = random.randint(version_range[0], version_range[1])
        return template.format(major_version=major_version)

    # Fallback to a generic UA if policy is not configured
    return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'

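# Illustrative policy snippet (YAML, hypothetical values) consumed by this
# function; the template must contain a '{major_version}' placeholder:
#   direct_docker_cli_policy:
#     user_agent_template: "Mozilla/5.0 (...) Chrome/{major_version}.0.0.0 Safari/537.36"
#     user_agent_version_range: [120, 126]
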
def update_dict(d, u):
    """Recursively update a dictionary."""
    for k, v in u.items():
        if isinstance(v, collections.abc.Mapping):
            d[k] = update_dict(d.get(k, {}), v)
        else:
            d[k] = v
    return d

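# Illustrative: unlike dict.update, nested mappings are merged rather than
# replaced: update_dict({'a': {'x': 1}}, {'a': {'y': 2}}) -> {'a': {'x': 1, 'y': 2}}.
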
def load_policy(policy_file, policy_name=None):
    """Load a policy from a YAML file."""
    logger = logging.getLogger(__name__)
    try:
        with open(policy_file, 'r', encoding='utf-8') as f:
            # If a policy name is given, look for that specific document
            if policy_name:
                docs = list(yaml.safe_load_all(f))
                for doc in docs:
                    if isinstance(doc, dict) and doc.get('name') == policy_name:
                        return doc
                raise ValueError(f"Policy '{policy_name}' not found in {policy_file}")
            # Otherwise, load the first document
            return yaml.safe_load(f)
    except (IOError, yaml.YAMLError, ValueError) as e:
        logger.error(f"Failed to load policy file {policy_file}: {e}")
        sys.exit(1)

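# Illustrative multi-document policy file (YAML, hypothetical names), selected
# by policy_name:
#   name: fetch-smoke
#   settings:
#     mode: fetch_only
#   ---
#   name: full-run
#   settings:
#     mode: full_stack
# load_policy(path, 'full-run') returns the second document; with no
# policy_name, only the first document is loaded.
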
def apply_overrides(policy, overrides):
    """Apply command-line overrides to the policy."""
    logger = logging.getLogger(__name__)
    for override in overrides:
        try:
            key, value = override.split('=', 1)
            keys = key.split('.')

            # Try to parse as JSON/YAML if it looks like a list or dict, otherwise treat as scalar
            if (value.startswith('[') and value.endswith(']')) or \
               (value.startswith('{') and value.endswith('}')):
                try:
                    value = yaml.safe_load(value)
                except yaml.YAMLError:
                    logger.warning(f"Could not parse override value '{value}' as YAML. Treating as a string.")
            else:
                # Try to auto-convert scalar value type
                if value.lower() == 'true':
                    value = True
                elif value.lower() == 'false':
                    value = False
                elif value.lower() == 'null':
                    value = None
                else:
                    try:
                        value = int(value)
                    except ValueError:
                        try:
                            value = float(value)
                        except ValueError:
                            pass  # Keep as string

            d = policy
            for k in keys[:-1]:
                d = d.setdefault(k, {})
            d[keys[-1]] = value
        except ValueError:
            logger.error(f"Invalid override format: '{override}'. Use 'key.subkey=value'.")
            sys.exit(1)
    return policy

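# Illustrative --set override strings, as parsed above (hypothetical keys):
#   'execution_control.workers=4'                -> int 4
#   'settings.mode=fetch_only'                   -> kept as a string
#   'execution_control.run_until={minutes: 30}'  -> dict, via YAML parsing
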
def display_effective_policy(policy, name, args, sources=None, profile_names=None, original_workers_setting=None):
    """Prints a human-readable summary of the effective policy."""
    logger = logging.getLogger(__name__)
    logger.info(f"--- Effective Policy: {name} ---")
    settings = policy.get('settings', {})
    exec_control = policy.get('execution_control', {})
    orchestration_mode = settings.get('orchestration_mode')

    logger.info(f"Mode: {settings.get('mode', 'full_stack')}")
    if args and args.profile_prefix:
        logger.info(f"Profile Prefix (from CLI): {args.profile_prefix}")
    if profile_names:
        num_profiles = len(profile_names)
        logger.info(f"Profiles found: {num_profiles}")
        if num_profiles > 0:
            # Sort profiles for consistent display, show top 10
            sorted_profiles = sorted(profile_names)
            profiles_to_show = sorted_profiles[:10]
            logger.info(f"  (e.g., {', '.join(profiles_to_show)}{'...' if num_profiles > 10 else ''})")

    workers_display = str(exec_control.get('workers', 1))
    if original_workers_setting == 'auto':
        workers_display = f"auto (calculated: {workers_display})"
    logger.info(f"Workers: {workers_display}")

    sleep_cfg = exec_control.get('sleep_between_tasks', {})
    sleep_min = sleep_cfg.get('min_seconds')
    if sleep_min is not None:
        sleep_max = sleep_cfg.get('max_seconds')
        if sleep_max is None:
            sleep_max = sleep_min

        if sleep_max < sleep_min:
            logger.info(f"Sleep between tasks (per worker): {sleep_max}s (fixed; max < min)")
        elif sleep_max > sleep_min:
            logger.info(f"Sleep between tasks (per worker): {sleep_min}-{sleep_max}s (random)")
        else:
            logger.info(f"Sleep between tasks (per worker): {sleep_min}s")

    run_until = exec_control.get('run_until', {})
    run_conditions = []
    if 'minutes' in run_until:
        run_conditions.append(f"for {run_until['minutes']} minutes")
    if 'requests' in run_until:
        run_conditions.append(f"until {run_until['requests']} total requests")
    if 'cycles' in run_until:
        run_conditions.append(f"for {run_until['cycles']} cycles")

    if run_conditions:
        logger.info(f"Run condition: Stop after running {' or '.join(run_conditions)}.")
        if 'minutes' in run_until and 'cycles' not in run_until:
            logger.info("Will continuously cycle through sources until the time limit is reached.")
    elif orchestration_mode in ['direct_batch_cli', 'direct_download_cli', 'direct_docker_cli']:
        logger.info("Run condition: Stop after all source URLs/tasks have been processed once.")
    else:
        logger.warning("WARNING: No 'run_until' condition is set. This test will run forever unless stopped manually.")
        logger.info("Run condition: No stop condition defined, will run indefinitely (until Ctrl+C).")

    # --- Rate Calculation ---
    if sources:
        workers = exec_control.get('workers', 1)
        num_sources = len(profile_names) if profile_names else len(sources)

        min_sleep = sleep_cfg.get('min_seconds', 0)
        max_sleep = sleep_cfg.get('max_seconds') or min_sleep
        avg_sleep_per_task = (min_sleep + max_sleep) / 2

        # Assume an average task duration. This is a major assumption.
        mode = settings.get('mode', 'full_stack')
        assumptions = exec_control.get('assumptions', {})

        assumed_fetch_duration = 0
        if mode in ['full_stack', 'fetch_only']:
            assumed_fetch_duration = assumptions.get('fetch_task_duration', 12 if mode == 'full_stack' else 3)

        assumed_download_duration = 0
        if mode in ['full_stack', 'download_only']:
            # This assumes the total time to download all formats for a single source.
            assumed_download_duration = assumptions.get('download_task_duration', 60)

        total_assumed_task_duration = assumed_fetch_duration + assumed_download_duration

        if workers > 0 and total_assumed_task_duration > 0:
            total_time_per_task = total_assumed_task_duration + avg_sleep_per_task
            tasks_per_minute_per_worker = 60 / total_time_per_task
            total_tasks_per_minute = tasks_per_minute_per_worker * workers

            logger.info("--- Rate Estimation ---")
            logger.info(f"Source count: {num_sources}")
            if mode in ['full_stack', 'fetch_only']:
                logger.info(f"Est. fetch time per source: {assumed_fetch_duration}s (override via execution_control.assumptions.fetch_task_duration)")
            if mode in ['full_stack', 'download_only']:
                logger.info(f"Est. download time per source: {assumed_download_duration}s (override via execution_control.assumptions.download_task_duration)")
                logger.info("  (Note: This assumes total time for all formats per source)")

            logger.info(f"Est. sleep per task: {avg_sleep_per_task:.1f}s")
            logger.info(f"==> Expected task rate: ~{total_tasks_per_minute:.2f} tasks/minute ({workers} workers * {tasks_per_minute_per_worker:.2f} tasks/min/worker)")

            target_rate_cfg = exec_control.get('target_rate', {})
            target_reqs = target_rate_cfg.get('requests')
            target_mins = target_rate_cfg.get('per_minutes')
            if target_reqs and target_mins:
                target_rpm = target_reqs / target_mins
                logger.info(f"Target rate: {target_rpm:.2f} tasks/minute")
                if total_tasks_per_minute < target_rpm * 0.8:
                    logger.warning("Warning: Expected rate is significantly lower than target rate.")
                    logger.warning("Consider increasing workers, reducing sleep, or checking task performance.")

    logger.info("---------------------------------")
    time.sleep(2)  # Give user time to read

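# Illustrative rate estimate (full_stack defaults): with 4 workers, 12s fetch +
# 60s download + 5s average sleep per task, each worker completes
# 60 / (72 + 5) ~= 0.78 tasks/minute, so ~3.12 tasks/minute overall.
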
def list_policies():
    """Scans the policies directory and prints a list of available policies."""
    policies_dir = os.path.join(_PROJECT_ROOT, 'policies')

    if not os.path.isdir(policies_dir):
        print(f"Error: Policies directory not found at '{policies_dir}'", file=sys.stderr)
        return 1

    print("Available Policies:")
    print("=" * 20)

    policy_files = sorted(Path(policies_dir).glob('*.yaml'))
    if not policy_files:
        print("No policy files (.yaml) found.")
        return 0

    for policy_file in policy_files:
        print(f"\n--- File: {policy_file.relative_to(_PROJECT_ROOT)} ---")
        try:
            with open(policy_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Split into documents. The separator is a line that is exactly '---'.
            documents = re.split(r'^\-\-\-$', content, flags=re.MULTILINE)

            found_any_in_file = False
            for doc in documents:
                doc = doc.strip()
                if not doc:
                    continue

                lines = doc.split('\n')
                policy_name = None
                description_lines = []

                # Find name and description
                for i, line in enumerate(lines):
                    if line.strip().startswith('name:'):
                        policy_name = line.split(':', 1)[1].strip()

                        # Look backwards for comments
                        j = i - 1
                        current_desc_block = []
                        while j >= 0 and lines[j].strip().startswith('#'):
                            comment = lines[j].strip().lstrip('#').strip()
                            current_desc_block.insert(0, comment)
                            j -= 1

                        if current_desc_block:
                            description_lines = current_desc_block
                        break

                if policy_name:
                    found_any_in_file = True
                    print(f"  - Name: {policy_name}")
                    if description_lines:
                        # Heuristic to clean up "Policy: " prefix
                        if description_lines[0].lower().startswith('policy:'):
                            description_lines[0] = description_lines[0][len('policy:'):].strip()

                        print(f"    Description: {description_lines[0]}")
                        for desc_line in description_lines[1:]:
                            print(f"      {desc_line}")
                    else:
                        print("    Description: (No description found)")

                    relative_path = policy_file.relative_to(_PROJECT_ROOT)
                    print(f"    Usage: --policy {relative_path} --policy-name {policy_name}")

            if not found_any_in_file:
                print("  (No named policies found in this file)")

        except Exception as e:
            print(f"  Error parsing {policy_file.name}: {e}")

    return 0

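# Illustrative: a policy document whose 'name:' line is preceded by comments,
#   # Policy: Quick smoke test
#   # Cycles each source once.
#   name: smoke
# is listed as 'smoke' with the two comment lines as its description, plus a
# ready-to-paste '--policy <file> --policy-name smoke' usage hint.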