yt-dlp-dags/tools/generate-profile-setup-policy.py

#!/usr/bin/env python3
"""
Generates the profile setup policy YAML from the main cluster configuration file.
This script reads the worker configurations from a cluster.yml file, aggregates
all profile definitions, and generates a policy file that can be used by the
`ytops-client setup-profiles` command. This centralizes profile management
in the cluster configuration file.
"""
import yaml
import sys
import os
from collections import OrderedDict
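
# The script expects a cluster config roughly shaped like the sketch below
# (illustrative only; the field names match what this script reads, but the
# service/worker names and values are hypothetical):
#
#   shadowsocks_proxies:
#     ss-proxy-1:
#       local_port: 1080
#   workers:
#     worker-1:
#       profile_pools:
#         - prefixes: ["profile_a", "profile_b"]
#           proxy_service: "ss-proxy-1"
#           count: 4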

# To ensure YAML dumps dicts in the order they are created
def represent_ordereddict(dumper, data):
    value = []
    for item_key, item_value in data.items():
        node_key = dumper.represent_data(item_key)
        node_value = dumper.represent_data(item_value)
        value.append((node_key, node_value))
    return yaml.nodes.MappingNode(u'tag:yaml.org,2002:map', value)

yaml.add_representer(OrderedDict, represent_ordereddict)

# Custom list type and representer to achieve flow style for inner lists
class FlowList(list):
    pass

def flow_style_list_representer(dumper, data):
    return dumper.represent_sequence(u'tag:yaml.org,2002:seq', data, flow_style=True)

yaml.add_representer(FlowList, flow_style_list_representer)

# Custom string type and representer for double-quoted strings
class QuotedString(str):
    pass

def quoted_string_representer(dumper, data):
    return dumper.represent_scalar(u'tag:yaml.org,2002:str', data, style='"')

yaml.add_representer(QuotedString, quoted_string_representer)

def load_cluster_config(config_path):
    """Load cluster configuration from YAML file"""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)

def generate_policy(cluster_config, output_path):
    """Generate the profile setup policy file using common pools."""
    shadowsocks_proxies = cluster_config.get('shadowsocks_proxies', {})
    all_workers = cluster_config.get('workers', {})
    common_pools = []

    # Aggregate profile pools from all workers
    for worker_name, worker_config in all_workers.items():
        for pool in worker_config.get('profile_pools', []):
            proxy_service = pool['proxy_service']
            if proxy_service not in shadowsocks_proxies:
                print(f"Warning: Proxy service '{proxy_service}' for profile pool '{pool['prefixes']}' on worker '{worker_name}' not found in global shadowsocks_proxies. Skipping.", file=sys.stderr)
                continue
            proxy_port = shadowsocks_proxies[proxy_service]['local_port']
            proxy_string = f"{proxy_service}:{proxy_port}"
            pool_entry = OrderedDict([
                ('prefixes', sorted(pool['prefixes'])),
                ('proxy', proxy_string),
                ('count', pool['count'])
            ])
            common_pools.append(pool_entry)

    # Sort the pools by the first prefix in each pool for consistent file output
    sorted_common_pools = sorted(common_pools, key=lambda x: x['prefixes'][0])

    # Write the policy file manually to ensure exact formatting and comments
    with open(output_path, 'w') as f:
        f.write("# Configuration for setting up profiles for a simulation or test run.\n")
        f.write("# This file is used by the `bin/ytops-client setup-profiles` command.\n")
        f.write("# It uses a common pool definition to avoid repetition.\n\n")
        f.write("# !!! THIS FILE IS AUTO-GENERATED by tools/generate-profile-setup-policy.py !!!\n")
        f.write("# !!! DO NOT EDIT. Your changes will be overwritten. !!!\n")
        f.write("# !!! Edit cluster.green.yml and re-run the generator instead. !!!\n\n")
        f.write("simulation_parameters:\n")
        f.write("  # --- Common Redis settings for all tools ---\n")
        f.write("  # The environment name ('env') is now specified in each setup block below.\n")
        f.write('  env_file: ".env" # Optional: path to a .env file.\n')
        f.write("\n# --- Common Pool Definitions ---\n")
        f.write("# Define the profile pools once. They will be created in both\n")
        f.write("# the auth and download simulation environments.\n")
        f.write("# The `setup-profiles` tool must be updated to support this format.\n")
        f.write("common_pools:\n")
        for pool in sorted_common_pools:
            prefixes_str = ", ".join([f'"{p}"' for p in pool['prefixes']])
            f.write(f'  - prefixes: [{prefixes_str}]\n')
            f.write(f'    proxy: "{pool["proxy"]}"\n')
            f.write(f'    count: {pool["count"]}\n')
        f.write("\n# --- Profile setup for the AUTHENTICATION simulation ---\n")
        f.write("auth_profile_setup:\n")
        f.write('  env: "sim_auth"\n')
        f.write("  cleanup_before_run: true\n")
        f.write("  # The setup tool will use the 'common_pools' defined above.\n")
        f.write("  use_common_pools: true\n")
        f.write("\n# --- Profile setup for the DOWNLOAD simulation ---\n")
        f.write("download_profile_setup:\n")
        f.write('  env: "sim_download"\n')
        f.write("  cleanup_before_run: true\n")
        f.write("  # The setup tool will also use the 'common_pools' defined above.\n")
        f.write("  use_common_pools: true\n")

    print(f"Successfully generated profile setup policy at: {output_path}")
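
# For reference, the common_pools section emitted above looks roughly like this
# (values are hypothetical):
#
#   common_pools:
#     - prefixes: ["profile_a", "profile_b"]
#       proxy: "ss-proxy-1:1080"
#       count: 4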

def generate_enforcer_policy(cluster_config, output_path):
    """Generate the enforcer policy file."""
    all_workers = cluster_config.get('workers', {})
    enforcement_pools = []
    for worker_name, worker_config in sorted(all_workers.items()):
        all_prefixes = []
        for pool in worker_config.get('profile_pools', []):
            all_prefixes.extend(pool.get('prefixes', []))
        if not all_prefixes:
            continue
        pool_entry = OrderedDict([
            ('name', f"server_{worker_name}"),
            ('profile_group_patterns', sorted(list(set(all_prefixes)))),
            ('max_active_profiles', 1)
        ])
        enforcement_pools.append(pool_entry)

    with open(output_path, 'w') as f:
        f.write("# Policy for the unified simulation enforcer.\n")
        f.write("# This file is used by `bin/ytops-client policy-enforcer --live` to manage\n")
        f.write("# both the authentication and download simulation environments from a single process.\n\n")
        f.write("# !!! THIS FILE IS AUTO-GENERATED by tools/generate-profile-setup-policy.py !!!\n")
        f.write("# !!! DO NOT EDIT. Your changes will be overwritten. !!!\n")
        f.write("# !!! Edit cluster.green.yml and re-run the generator instead. !!!\n\n")
        f.write("simulation_parameters:\n")
        f.write("  # --- Common Redis settings for all tools ---\n")
        f.write("  # The enforcer will connect to two different Redis environments (key prefixes)\n")
        f.write("  # based on these settings, applying the corresponding policies to each.\n")
        f.write('  env_file: ".env"\n')
        f.write('  auth_env: "sim_auth"\n')
        f.write('  download_env: "sim_download"\n')
        f.write("\n")
        f.write("  # How often the enforcer should wake up and apply all policies.\n")
        f.write("  interval_seconds: 2\n\n")
        f.write("# --- Common & Pool-specific Settings ---\n")
        f.write("# Common settings are applied to all profile groups discovered via the pools below.\n")
        f.write("# A pool can optionally override these settings by defining its own 'group_settings' block.\n")
        f.write("common_group_settings:\n")
        f.write("  auth:\n")
        f.write("    max_active_profiles: 1\n")
        f.write("    rotate_after_requests: 5\n")
        f.write("    rest_duration_minutes_on_rotation: 0.20\n")
        f.write("    wait_download_finish_per_group: true\n")
        f.write("    max_wait_for_downloads_minutes: 240\n")
        f.write("  download:\n")
        f.write("    max_active_profiles: 1\n")
        f.write("    rotate_after_requests: 0\n")
        f.write("    rest_duration_minutes_on_rotation: 0.2\n\n")
        f.write("# Defines pools of profile groups with their own concurrency limits.\n")
        f.write("enforcement_pools:\n")
        for pool in enforcement_pools:
            f.write(f'  - name: "{pool["name"]}"\n')
            patterns_str = ", ".join([f'"{p}"' for p in pool['profile_group_patterns']])
            f.write(f'    profile_group_patterns: [{patterns_str}]\n')
            f.write(f'    max_active_profiles: {pool["max_active_profiles"]}\n')

    rest_of_file = """
# --- Policies for the Authentication Simulation ---
auth_policy_enforcer_config:
  # Ban if 2 failures occur within a 1-minute window.
  #ban_on_failures: 2
  #ban_on_failures_window_minutes: 1

  # The standard rest policy is disabled, as rotation is handled by the profile group.
  # New rate limit policy to enforce requests-per-hour limits.
  # For guest sessions, the limit is ~300 videos/hour.
  rate_limit_requests: 0
  rate_limit_window_minutes: 60
  rate_limit_rest_duration_minutes: 5
  rest_after_requests: 0
  rest_duration_minutes: 10

  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
  # sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
  # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
  # The settings below should be configured to respect these limits.

  # New setting for load balancing across profile groups.
  # "longest_idle": Activates the profile that has been idle the longest across all groups (based on last_used time).
  #                 This is a global FIFO strategy that effectively cycles through profiles regardless of their group.
  # "least_loaded": Prioritizes activating a profile from the group with the fewest pending downloads.
  #                 If multiple groups have zero pending downloads, it acts as a FIFO queue, activating
  #                 the one that finished its last download batch the earliest. This is useful when you want
  #                 to ensure a group finishes its entire workload before another group starts.
  profile_selection_strategy: "longest_idle"

  # The 'global_max_active_profiles' setting is now superseded by the per-pool limits
  # defined in the 'enforcement_pools' section.
  # The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
  # The enforcer logic should be updated to read from there.
  proxy_work_minutes: 0
  proxy_rest_duration_minutes: 0

  # Global maximum time a proxy can be active before being rested, regardless of
  # other rules. Acts as a safety net. Set to 0 to disable.
  max_global_proxy_active_minutes: 0
  rest_duration_on_max_active: 10

  # Proxy-level ban on failure burst is disabled.
  proxy_ban_on_failures: 0
  proxy_ban_window_minutes: 2

  # Clean up locks held for more than 16 minutes (960s) to prevent stuck workers.
  # This should be longer than the docker container timeout (15m).
  unlock_stale_locks_after_seconds: 960

  # A short post-task cooldown for auth simulation profiles. When a batch is finished,
  # the profile is put into COOLDOWN briefly. This prevents a worker from immediately
  # re-locking the same profile, giving the policy enforcer a window to perform rotation.
  unlock_cooldown_seconds: 0

# --- Cross-simulation synchronization ---
# This section is simplified because the link between auth and download profiles
# is now defined in the `profile_group_definitions`.
cross_simulation_sync:
  # Which states to synchronize from auth to download.
  sync_states:
    - "BANNED"
  # If true, a BANNED state on an auth profile will force the download profile to also be BANNED.
  enforce_auth_lead: true
  # CRITICAL: Ensures the correct download profile GROUP is active.
  sync_active_profile: true
  # When an auth profile is in the 'waiting_downloads' state, ensure the matching download profile is active.
  sync_waiting_downloads: true

# --- Policies for the Download Simulation ---
download_policy_enforcer_config:
  # Ban if 1 failure occurs within a 1-minute window.
  ban_on_failures: 1
  ban_on_failures_window_minutes: 1

  # Standard rest policy is disabled in favor of group rotation.
  # New rate limit policy to enforce requests-per-hour limits.
  # For guest sessions, the limit is ~300 videos/hour. We set it slightly lower to be safe.
  rate_limit_requests: 280
  rate_limit_window_minutes: 60
  rate_limit_rest_duration_minutes: 5
  rest_after_requests: 0
  rest_duration_minutes: 20

  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
  # sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
  # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
  # The settings below should be configured to respect these limits.

  # The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
  # The enforcer logic should be updated to read from there.

  # Time-based proxy rules are disabled.
  proxy_work_minutes: 0
  proxy_rest_duration_minutes: 10

  # Global maximum time a proxy can be active before being rested, regardless of
  # other rules. Acts as a safety net. Set to 0 to disable.
  max_global_proxy_active_minutes: 0
  rest_duration_on_max_active: 10

  # Ban a proxy if 3 failures occur within a 1-minute window.
  proxy_ban_on_failures: 3
  proxy_ban_window_minutes: 1

  # Clean up download locks held for more than 16 minutes (960s) to allow for long downloads.
  # This should be longer than the docker container timeout (15m).
  unlock_stale_locks_after_seconds: 960

  # After a profile is used for a download, unlock it but put it in COOLDOWN
  # state for 2-3s. This is enforced by the worker, which reads this config from Redis.
  unlock_cooldown_seconds: [2, 3]
"""
    with open(output_path, 'a') as f:
        f.write(rest_of_file)
    print(f"Successfully generated enforcer policy at: {output_path}")

def main():
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print("Usage: ./tools/generate-profile-setup-policy.py <cluster-config-file> <output-profile-policy-file> [<output-enforcer-policy-file>]")
        sys.exit(1)

    config_path = sys.argv[1]
    profile_output_path = sys.argv[2]

    if not os.path.exists(config_path):
        print(f"Error: Cluster configuration file not found at '{config_path}'", file=sys.stderr)
        sys.exit(1)

    cluster_config = load_cluster_config(config_path)
    generate_policy(cluster_config, profile_output_path)

    if len(sys.argv) == 4:
        enforcer_output_path = sys.argv[3]
        generate_enforcer_policy(cluster_config, enforcer_output_path)

if __name__ == "__main__":
    main()