510 lines
20 KiB
Python

import collections.abc
import json
import logging
import os
import random
import re
import shlex
import sys
import time
from copy import deepcopy
from pathlib import Path
from urllib.parse import urlparse, parse_qs
try:
import yaml
except ImportError:
print("PyYAML is not installed. Please install it with: pip install PyYAML", file=sys.stderr)
sys.exit(1)
# Absolute directory containing this module; anchor for locating project files.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# This makes the project root the parent directory of 'ytops_client'
# (i.e. two levels above this file).
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
def get_video_id(url: str) -> str:
    """Extracts a YouTube video ID from a URL.

    Accepts watch URLs (?v=...), youtu.be short links, or a bare 11-char ID.
    Returns "unknown_video_id" when nothing matches.
    """
    # Try the two common URL shapes first.
    for pattern in (r"v=([0-9A-Za-z_-]{11})", r"youtu\.be\/([0-9A-Za-z_-]{11})"):
        found = re.search(pattern, url)
        if found:
            return found.group(1)
    # Finally, accept the input itself if it already is a bare video ID.
    return url if re.fullmatch(r'[0-9A-Za-z_-]{11}', url) else "unknown_video_id"
def get_display_name(path_or_url):
    """Returns a clean name for logging, either a filename or a video ID."""
    # Path objects already carry a clean basename.
    if isinstance(path_or_url, Path):
        return path_or_url.name
    as_text = str(path_or_url)
    extracted_id = get_video_id(as_text)
    # Prefer the video ID when one was recognized, else fall back to basename.
    return extracted_id if extracted_id != "unknown_video_id" else Path(as_text).name
def format_size(b):
    """Format size in bytes to human-readable string.

    Returns 'N/A' for None; bare bytes below 1 KiB; otherwise two decimal
    places with a KiB/MiB/GiB suffix (GiB is the largest unit used).
    """
    if b is None:
        return 'N/A'
    if b < 1024:
        return f"{b}B"
    # Walk up the binary-prefix units; GiB is the terminal bucket.
    for unit, power in (('KiB', 1), ('MiB', 2), ('GiB', 3)):
        if unit == 'GiB' or b < 1024 ** (power + 1):
            return f"{b / 1024 ** power:.2f}{unit}"
def flatten_dict(d, parent_key='', sep='.'):
    """Flattens a nested dictionary.

    Nested mapping keys are joined with `sep` into dotted compound keys;
    non-mapping values are kept as-is.
    """
    flat = {}
    for key, value in d.items():
        compound_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, collections.abc.MutableMapping):
            # Recurse into nested mappings, carrying the compound prefix.
            flat.update(flatten_dict(value, compound_key, sep=sep))
        else:
            flat[compound_key] = value
    return flat
def print_policy_overrides(policy):
    """Prints all policy values as a single-line of --set arguments."""
    # Work on a copy so the caller's policy is untouched; 'name' is not an
    # override, so it is dropped before flattening.
    scrubbed = deepcopy(policy)
    scrubbed.pop('name', None)

    def _render(v):
        # Serialize a value the way the --set parser expects to read it back.
        if v is None:
            return 'null'
        if isinstance(v, bool):
            return str(v).lower()
        if isinstance(v, (list, dict)):
            # Use compact JSON for lists/dicts
            return json.dumps(v, separators=(',', ':'))
        return str(v)

    # Use shlex.quote to handle spaces and special characters safely.
    rendered = [
        f"--set {shlex.quote(f'{key}={_render(value)}')}"
        for key, value in sorted(flatten_dict(scrubbed).items())
    ]
    print(' '.join(rendered))
def _config_dict_to_flags_file_content(config_dict: dict) -> str:
"""Converts a dictionary of yt-dlp options to a string for a config file."""
config_lines = []
for key, value in config_dict.items():
flag = f'--{key.replace("_", "-")}'
if isinstance(value, bool):
if value:
config_lines.append(flag)
elif isinstance(value, list):
# Special case for --use-extractors which takes a comma-separated list
if key == 'use-extractors':
config_lines.append(flag)
config_lines.append(','.join(map(str, value)))
else: # Assume other lists mean repeated flags
for item in value:
config_lines.append(flag)
config_lines.append(str(item))
elif isinstance(value, dict): # Primarily for extractor-args
for sub_key, sub_value in value.items():
if isinstance(sub_value, str) and ';' in sub_value:
# Support user-friendly format: semicolon-separated values
items = [item.strip() for item in sub_value.split(';')]
for item in items:
if item: # Avoid empty strings
config_lines.append(flag)
config_lines.append(f"{sub_key}:{item}")
elif isinstance(sub_value, list):
for item in sub_value:
config_lines.append(flag)
config_lines.append(f"{sub_key}:{item}")
else:
config_lines.append(flag)
config_lines.append(f"{sub_key}:{sub_value}")
else:
config_lines.append(flag)
value_str = str(value)
# yt-dlp config files support quoting arguments.
# Let's quote any string that contains spaces to be safe.
if isinstance(value, str) and ' ' in value_str:
value_str = f'"{value_str}"'
config_lines.append(value_str)
return '\n'.join(config_lines)
def _config_dict_to_cli_flags(config_dict: dict) -> list:
"""Converts a dictionary of yt-dlp options to a list of command-line arguments."""
args = []
for key, value in config_dict.items():
flag = f'--{key.replace("_", "-")}'
if isinstance(value, bool):
if value:
args.append(flag)
elif isinstance(value, list):
if key == 'use-extractors':
args.append(flag)
args.append(','.join(map(str, value)))
else:
for item in value:
args.append(flag)
args.append(str(item))
elif isinstance(value, dict):
for sub_key, sub_value in value.items():
if isinstance(sub_value, str) and ';' in sub_value:
items = [item.strip() for item in sub_value.split(';')]
for item in items:
if item:
args.append(flag)
args.append(f"{sub_key}:{item}")
elif isinstance(sub_value, list):
for item in sub_value:
args.append(flag)
args.append(f"{sub_key}:{item}")
else:
args.append(flag)
args.append(f"{sub_key}:{sub_value}")
else:
args.append(flag)
args.append(str(value))
return args
def _parse_config_file_to_cli_args(content: str) -> list:
"""
Parses yt-dlp config file content into a list of command-line arguments.
This is a best-effort parser for logging purposes.
"""
args = []
lines = content.splitlines()
for line in lines:
line = line.strip()
if not line or line.startswith('#'):
continue
# yt-dlp config files can have options and values on separate lines.
# This simple parser assumes one argument per line (e.g., '--proxy', 'http://...').
# shlex.split is good for handling quoted arguments on a single line.
try:
parts = shlex.split(line)
args.extend(parts)
except ValueError:
# Fallback for unterminated quotes or other shlex errors
args.extend(line.split())
return args
def check_url_expiry(url: str, time_shift_minutes: int):
    """
    Checks a single URL for expiration, considering a time shift.
    Returns a tuple: (status, time_left_seconds)
    status can be 'valid', 'expired', or 'no_expiry_info'.
    A URL is considered 'expired' if it has expired or will expire within the time_shift_minutes.
    """
    query = parse_qs(urlparse(url).query)
    raw_expiry = query.get('expire', [None])[0]
    # Without a numeric 'expire' query parameter we cannot judge freshness.
    if not raw_expiry or not raw_expiry.isdigit():
        return 'no_expiry_info', float('inf')
    remaining = int(raw_expiry) - time.time()
    # Treat anything inside the shift window as already expired.
    if remaining <= time_shift_minutes * 60:
        return 'expired', remaining
    return 'valid', remaining
def generate_user_agent_from_policy(policy):
    """
    Generates a User-Agent string based on settings in the policy.
    Checks 'direct_docker_cli_policy' and 'direct_batch_cli_policy'.
    Falls back to a default if no policy is provided.
    """
    # Either policy key may carry the UA settings; the first non-empty wins.
    cli_policy = policy.get('direct_docker_cli_policy', {}) or policy.get('direct_batch_cli_policy', {})
    template = cli_policy.get('user_agent_template')
    bounds = cli_policy.get('user_agent_version_range')
    if template and bounds and isinstance(bounds, list) and len(bounds) == 2:
        # Randomize the browser major version within the configured range.
        chosen_version = random.randint(bounds[0], bounds[1])
        return template.format(major_version=chosen_version)
    # Fallback to a generic UA if policy is not configured
    return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
def update_dict(d, u):
    """Recursively update a dictionary.

    Values from `u` are written into `d` in place; nested mappings are
    merged rather than replaced. Returns the mutated `d`.
    """
    for key, incoming in u.items():
        if isinstance(incoming, collections.abc.Mapping):
            # Merge nested mappings instead of overwriting them wholesale.
            d[key] = update_dict(d.get(key, {}), incoming)
        else:
            d[key] = incoming
    return d
def load_policy(policy_file, policy_name=None):
    """Load a policy from a YAML file.

    When `policy_name` is given, scans every YAML document in the file for
    one whose 'name' matches; otherwise returns the first document.
    Exits the process with status 1 on I/O, parse, or lookup failure.
    """
    logger = logging.getLogger(__name__)
    try:
        with open(policy_file, 'r', encoding='utf-8') as f:
            if policy_name is None:
                # No name requested: the first document is the policy.
                return yaml.safe_load(f)
            # A name was requested: scan all documents for a match.
            for doc in yaml.safe_load_all(f):
                if isinstance(doc, dict) and doc.get('name') == policy_name:
                    return doc
            raise ValueError(f"Policy '{policy_name}' not found in {policy_file}")
    except (IOError, yaml.YAMLError, ValueError) as e:
        logger.error(f"Failed to load policy file {policy_file}: {e}")
        sys.exit(1)
def apply_overrides(policy, overrides):
    """Apply command-line overrides to the policy.

    Each override is a 'dotted.key=value' string; intermediate dicts are
    created as needed. Exits with status 1 on a malformed override.
    Returns the mutated policy.
    """
    logger = logging.getLogger(__name__)

    def _coerce(raw):
        # Bracketed values are parsed as YAML lists/dicts; other values get
        # scalar coercion (true/false/null/int/float) and fall back to str.
        if (raw.startswith('[') and raw.endswith(']')) or \
           (raw.startswith('{') and raw.endswith('}')):
            try:
                return yaml.safe_load(raw)
            except yaml.YAMLError:
                logger.warning(f"Could not parse override value '{raw}' as YAML. Treating as a string.")
                return raw
        lowered = raw.lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False
        if lowered == 'null':
            return None
        for converter in (int, float):
            try:
                return converter(raw)
            except ValueError:
                continue
        return raw  # Keep as string

    for override in overrides:
        try:
            dotted_key, raw_value = override.split('=', 1)
        except ValueError:
            logger.error(f"Invalid override format: '{override}'. Use 'key.subkey=value'.")
            sys.exit(1)
        key_path = dotted_key.split('.')
        target = policy
        # Walk/create intermediate dicts down to the final key's parent.
        for segment in key_path[:-1]:
            target = target.setdefault(segment, {})
        target[key_path[-1]] = _coerce(raw_value)
    return policy
def display_effective_policy(policy, name, args, sources=None, profile_names=None, original_workers_setting=None):
    """Prints a human-readable summary of the effective policy.

    Args:
        policy: Fully-resolved policy dict (after overrides).
        name: Display name used in the log header.
        args: Parsed CLI namespace; only 'profile_prefix' is read here.
        sources: Optional list of source URLs/tasks; enables rate estimation.
        profile_names: Optional list of profile names to summarize.
        original_workers_setting: Pass 'auto' when workers were auto-derived
            so the display shows both the keyword and the computed value.

    Side effects: logs the summary and sleeps 2 seconds at the end so the
    user can read it before the run starts.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"--- Effective Policy: {name} ---")
    settings = policy.get('settings', {})
    exec_control = policy.get('execution_control', {})
    orchestration_mode = settings.get('orchestration_mode')
    logger.info(f"Mode: {settings.get('mode', 'full_stack')}")
    if args and args.profile_prefix:
        logger.info(f"Profile Prefix (from CLI): {args.profile_prefix}")
    if profile_names:
        num_profiles = len(profile_names)
        logger.info(f"Profiles found: {num_profiles}")
        if num_profiles > 0:
            # Sort profiles for consistent display, show top 10
            sorted_profiles = sorted(profile_names)
            profiles_to_show = sorted_profiles[:10]
            logger.info(f" (e.g., {', '.join(profiles_to_show)}{'...' if num_profiles > 10 else ''})")
    workers_display = str(exec_control.get('workers', 1))
    if original_workers_setting == 'auto':
        workers_display = f"auto (calculated: {workers_display})"
    logger.info(f"Workers: {workers_display}")
    sleep_cfg = exec_control.get('sleep_between_tasks', {})
    sleep_min = sleep_cfg.get('min_seconds')
    if sleep_min is not None:
        sleep_max = sleep_cfg.get('max_seconds')
        # Missing max means a fixed (non-random) sleep equal to min.
        if sleep_max is None:
            sleep_max = sleep_min
        if sleep_max < sleep_min:
            logger.info(f"Sleep between tasks (per worker): {sleep_max}s (fixed; max < min)")
        elif sleep_max > sleep_min:
            logger.info(f"Sleep between tasks (per worker): {sleep_min}-{sleep_max}s (random)")
        else:
            logger.info(f"Sleep between tasks (per worker): {sleep_min}s")
    run_until = exec_control.get('run_until', {})
    run_conditions = []
    if 'minutes' in run_until:
        run_conditions.append(f"for {run_until['minutes']} minutes")
    if 'requests' in run_until:
        run_conditions.append(f"until {run_until['requests']} total requests")
    if 'cycles' in run_until:
        run_conditions.append(f"for {run_until['cycles']} cycles")
    if run_conditions:
        logger.info(f"Run condition: Stop after running {' or '.join(run_conditions)}.")
        if 'minutes' in run_until and 'cycles' not in run_until:
            logger.info("Will continuously cycle through sources until time limit is reached.")
    elif orchestration_mode in ['direct_batch_cli', 'direct_download_cli', 'direct_docker_cli']:
        # Direct CLI modes implicitly stop after one pass over the sources.
        logger.info("Run condition: Stop after all source URLs/tasks have been processed once.")
    else:
        logger.warning("WARNING: No 'run_until' condition is set. This test will run forever unless stopped manually.")
        logger.info("Run condition: No stop condition defined, will run indefinitely (until Ctrl+C).")
    # --- Rate Calculation ---
    if sources:
        workers = exec_control.get('workers', 1)
        num_sources = len(profile_names) if profile_names else len(sources)
        min_sleep = sleep_cfg.get('min_seconds', 0)
        max_sleep = sleep_cfg.get('max_seconds') or min_sleep
        avg_sleep_per_task = (min_sleep + max_sleep) / 2
        # Assume an average task duration. This is a major assumption.
        mode = settings.get('mode', 'full_stack')
        assumptions = exec_control.get('assumptions', {})
        assumed_fetch_duration = 0
        if mode in ['full_stack', 'fetch_only']:
            assumed_fetch_duration = assumptions.get('fetch_task_duration', 12 if mode == 'full_stack' else 3)
        assumed_download_duration = 0
        if mode in ['full_stack', 'download_only']:
            # This assumes the total time to download all formats for a single source.
            assumed_download_duration = assumptions.get('download_task_duration', 60)
        total_assumed_task_duration = assumed_fetch_duration + assumed_download_duration
        if workers > 0 and total_assumed_task_duration > 0:
            total_time_per_task = total_assumed_task_duration + avg_sleep_per_task
            tasks_per_minute_per_worker = 60 / total_time_per_task
            total_tasks_per_minute = tasks_per_minute_per_worker * workers
            logger.info("--- Rate Estimation ---")
            logger.info(f"Source count: {num_sources}")
            if mode in ['full_stack', 'fetch_only']:
                logger.info(f"Est. fetch time per source: {assumed_fetch_duration}s (override via execution_control.assumptions.fetch_task_duration)")
            if mode in ['full_stack', 'download_only']:
                logger.info(f"Est. download time per source: {assumed_download_duration}s (override via execution_control.assumptions.download_task_duration)")
                logger.info(" (Note: This assumes total time for all formats per source)")
            logger.info(f"Est. sleep per task: {avg_sleep_per_task:.1f}s")
            logger.info(f"==> Expected task rate: ~{total_tasks_per_minute:.2f} tasks/minute ({workers} workers * {tasks_per_minute_per_worker:.2f} tasks/min/worker)")
            target_rate_cfg = exec_control.get('target_rate', {})
            target_reqs = target_rate_cfg.get('requests')
            target_mins = target_rate_cfg.get('per_minutes')
            if target_reqs and target_mins:
                target_rpm = target_reqs / target_mins
                logger.info(f"Target rate: {target_rpm:.2f} tasks/minute")
                # 0.8 factor: warn only when the shortfall is significant.
                if total_tasks_per_minute < target_rpm * 0.8:
                    logger.warning("Warning: Expected rate is significantly lower than target rate.")
                    logger.warning("Consider increasing workers, reducing sleep, or checking task performance.")
    logger.info("---------------------------------")
    time.sleep(2)  # Give user time to read
def list_policies():
    """Scans the policies directory and prints a list of available policies.

    Looks under <project root>/policies for *.yaml files, splits each into
    YAML documents, and for every document with a 'name:' key prints the
    name, a description recovered from the '#' comment block immediately
    above the name line, and a ready-to-copy usage hint.

    Returns:
        0 on success (including "nothing found"), 1 if the policies
        directory does not exist.
    """
    policies_dir = os.path.join(_PROJECT_ROOT, 'policies')
    if not os.path.isdir(policies_dir):
        print(f"Error: Policies directory not found at '{policies_dir}'", file=sys.stderr)
        return 1
    print("Available Policies:")
    print("=" * 20)
    policy_files = sorted(Path(policies_dir).glob('*.yaml'))
    if not policy_files:
        print("No policy files (.yaml) found.")
        return 0
    for policy_file in policy_files:
        print(f"\n--- File: {policy_file.relative_to(_PROJECT_ROOT)} ---")
        try:
            with open(policy_file, 'r', encoding='utf-8') as f:
                content = f.read()
            # Split into documents. The separator is a line that is exactly '---'.
            documents = re.split(r'^\-\-\-$', content, flags=re.MULTILINE)
            found_any_in_file = False
            for doc in documents:
                doc = doc.strip()
                if not doc:
                    continue
                lines = doc.split('\n')
                policy_name = None
                description_lines = []
                # Find name and description
                for i, line in enumerate(lines):
                    if line.strip().startswith('name:'):
                        policy_name = line.split(':', 1)[1].strip()
                        # Look backwards for comments: the contiguous '#' block
                        # directly above 'name:' is treated as the description.
                        j = i - 1
                        current_desc_block = []
                        while j >= 0 and lines[j].strip().startswith('#'):
                            comment = lines[j].strip().lstrip('#').strip()
                            current_desc_block.insert(0, comment)
                            j -= 1
                        if current_desc_block:
                            description_lines = current_desc_block
                        break
                if policy_name:
                    found_any_in_file = True
                    print(f" - Name: {policy_name}")
                    if description_lines:
                        # Heuristic to clean up "Policy: " prefix
                        if description_lines[0].lower().startswith('policy:'):
                            description_lines[0] = description_lines[0][len('policy:'):].strip()
                        print(f" Description: {description_lines[0]}")
                        for desc_line in description_lines[1:]:
                            print(f" {desc_line}")
                    else:
                        print(" Description: (No description found)")
                    relative_path = policy_file.relative_to(_PROJECT_ROOT)
                    print(f" Usage: --policy {relative_path} --policy-name {policy_name}")
            if not found_any_in_file:
                print(" (No named policies found in this file)")
        except Exception as e:
            # Best-effort listing: one bad file should not abort the scan.
            print(f" Error parsing {policy_file.name}: {e}")
    return 0