yt-dlp-dags/tools/generate-profile-setup-policy.py

#!/usr/bin/env python3
"""
Generates the profile setup policy YAML from the main cluster configuration file.
This script reads the worker configurations from a cluster.yml file, aggregates
all profile definitions, and generates a policy file that can be used by the
`ytops-client setup-profiles` command. This centralizes profile management
in the cluster configuration file.
"""
import yaml
import sys
import os
from collections import OrderedDict
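
# The script expects a cluster config roughly shaped like the sketch below
# (illustrative only; the field names match what this script reads, but the
# service/worker names and values are hypothetical):
#
#   shadowsocks_proxies:
#     ss-proxy-1:
#       local_port: 1080
#   workers:
#     worker-1:
#       profile_pools:
#         - prefixes: ["profile_a", "profile_b"]
#           proxy_service: "ss-proxy-1"
#           count: 4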

# To ensure YAML dumps dicts in the order they are created
def represent_ordereddict(dumper, data):
    value = []
    for item_key, item_value in data.items():
        node_key = dumper.represent_data(item_key)
        node_value = dumper.represent_data(item_value)
        value.append((node_key, node_value))
    return yaml.nodes.MappingNode(u'tag:yaml.org,2002:map', value)

yaml.add_representer(OrderedDict, represent_ordereddict)

# Custom list type and representer to achieve flow style for inner lists
class FlowList(list):
    pass

def flow_style_list_representer(dumper, data):
    return dumper.represent_sequence(u'tag:yaml.org,2002:seq', data, flow_style=True)

yaml.add_representer(FlowList, flow_style_list_representer)

# Custom string type and representer for double-quoted strings
class QuotedString(str):
    pass

def quoted_string_representer(dumper, data):
    return dumper.represent_scalar(u'tag:yaml.org,2002:str', data, style='"')

yaml.add_representer(QuotedString, quoted_string_representer)

def load_cluster_config(config_path):
    """Load cluster configuration from YAML file"""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)

def generate_policy(cluster_config, output_path):
    """Generate the profile setup policy file using common pools."""
    shadowsocks_proxies = cluster_config.get('shadowsocks_proxies', {})
    all_workers = cluster_config.get('workers', {})
    common_pools = []

    # Aggregate profile pools from all workers
    for worker_name, worker_config in all_workers.items():
        for pool in worker_config.get('profile_pools', []):
            proxy_service = pool['proxy_service']
            if proxy_service not in shadowsocks_proxies:
                print(f"Warning: Proxy service '{proxy_service}' for profile pool '{pool['prefixes']}' on worker '{worker_name}' not found in global shadowsocks_proxies. Skipping.", file=sys.stderr)
                continue
            proxy_port = shadowsocks_proxies[proxy_service]['local_port']
            proxy_string = f"{proxy_service}:{proxy_port}"
            pool_entry = OrderedDict([
                ('prefixes', sorted(pool['prefixes'])),
                ('proxy', proxy_string),
                ('count', pool['count'])
            ])
            common_pools.append(pool_entry)

    # Sort the pools by the first prefix in each pool for consistent file output
    sorted_common_pools = sorted(common_pools, key=lambda x: x['prefixes'][0])

    # Write the policy file manually to ensure exact formatting and comments
    with open(output_path, 'w') as f:
        f.write("# Configuration for setting up profiles for a simulation or test run.\n")
        f.write("# This file is used by the `bin/ytops-client setup-profiles` command.\n")
        f.write("# It uses a common pool definition to avoid repetition.\n\n")
        f.write("# !!! THIS FILE IS AUTO-GENERATED by tools/generate-profile-setup-policy.py !!!\n")
        f.write("# !!! DO NOT EDIT. Your changes will be overwritten. !!!\n")
        f.write("# !!! Edit cluster.green.yml and re-run the generator instead. !!!\n\n")
        f.write("simulation_parameters:\n")
        f.write("  # --- Common Redis settings for all tools ---\n")
        f.write("  # The environment name ('env') is now specified in each setup block below.\n")
        f.write('  env_file: ".env" # Optional: path to a .env file.\n')
        f.write("\n# --- Common Pool Definitions ---\n")
        f.write("# Define the profile pools once. They will be created in both\n")
        f.write("# the auth and download simulation environments.\n")
        f.write("# The `setup-profiles` tool must be updated to support this format.\n")
        f.write("common_pools:\n")
        for pool in sorted_common_pools:
            prefixes_str = ", ".join([f'"{p}"' for p in pool['prefixes']])
            f.write(f'  - prefixes: [{prefixes_str}]\n')
            f.write(f'    proxy: "{pool["proxy"]}"\n')
            f.write(f'    count: {pool["count"]}\n')
        f.write("\n# --- Profile setup for the AUTHENTICATION simulation ---\n")
        f.write("auth_profile_setup:\n")
        f.write('  env: "sim_auth"\n')
        f.write("  cleanup_before_run: true\n")
        f.write("  # The setup tool will use the 'common_pools' defined above.\n")
        f.write("  use_common_pools: true\n")
        f.write("\n# --- Profile setup for the DOWNLOAD simulation ---\n")
        f.write("download_profile_setup:\n")
        f.write('  env: "sim_download"\n')
        f.write("  cleanup_before_run: true\n")
        f.write("  # The setup tool will also use the 'common_pools' defined above.\n")
        f.write("  use_common_pools: true\n")

    print(f"Successfully generated profile setup policy at: {output_path}")
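
# For reference, the common_pools section emitted above looks roughly like this
# (values are hypothetical):
#
#   common_pools:
#     - prefixes: ["profile_a", "profile_b"]
#       proxy: "ss-proxy-1:1080"
#       count: 4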

def generate_enforcer_policy(cluster_config, output_path):
    """Generate the enforcer policy file."""
    all_workers = cluster_config.get('workers', {})
    enforcement_pools = []
    for worker_name, worker_config in sorted(all_workers.items()):
        all_prefixes = []
        for pool in worker_config.get('profile_pools', []):
            all_prefixes.extend(pool.get('prefixes', []))
        if not all_prefixes:
            continue
        pool_entry = OrderedDict([
            ('name', f"server_{worker_name}"),
            ('profile_group_patterns', sorted(list(set(all_prefixes)))),
            ('max_active_profiles', 1)
        ])
        enforcement_pools.append(pool_entry)

    with open(output_path, 'w') as f:
        f.write("# Policy for the unified simulation enforcer.\n")
        f.write("# This file is used by `bin/ytops-client policy-enforcer --live` to manage\n")
        f.write("# both the authentication and download simulation environments from a single process.\n\n")
        f.write("# !!! THIS FILE IS AUTO-GENERATED by tools/generate-profile-setup-policy.py !!!\n")
        f.write("# !!! DO NOT EDIT. Your changes will be overwritten. !!!\n")
        f.write("# !!! Edit cluster.green.yml and re-run the generator instead. !!!\n\n")
        f.write("simulation_parameters:\n")
        f.write("  # --- Common Redis settings for all tools ---\n")
        f.write("  # The enforcer will connect to two different Redis environments (key prefixes)\n")
        f.write("  # based on these settings, applying the corresponding policies to each.\n")
        f.write('  env_file: ".env"\n')
        f.write('  auth_env: "sim_auth"\n')
        f.write('  download_env: "sim_download"\n')
        f.write("\n")
        f.write("  # How often the enforcer should wake up and apply all policies.\n")
        f.write("  interval_seconds: 2\n\n")
        f.write("# --- Common & Pool-specific Settings ---\n")
        f.write("# Common settings are applied to all profile groups discovered via the pools below.\n")
        f.write("# A pool can optionally override these settings by defining its own 'group_settings' block.\n")
        f.write("common_group_settings:\n")
        f.write("  auth:\n")
        f.write("    max_active_profiles: 1\n")
        f.write("    rotate_after_requests: 5\n")
        f.write("    rest_duration_minutes_on_rotation: 0.20\n")
        f.write("    wait_download_finish_per_group: true\n")
        f.write("    max_wait_for_downloads_minutes: 240\n")
        f.write("  download:\n")
        f.write("    max_active_profiles: 1\n")
        f.write("    rotate_after_requests: 0\n")
        f.write("    rest_duration_minutes_on_rotation: 0.2\n\n")
        f.write("# Defines pools of profile groups with their own concurrency limits.\n")
        f.write("enforcement_pools:\n")
        for pool in enforcement_pools:
            f.write(f'  - name: "{pool["name"]}"\n')
            patterns_str = ", ".join([f'"{p}"' for p in pool['profile_group_patterns']])
            f.write(f'    profile_group_patterns: [{patterns_str}]\n')
            f.write(f'    max_active_profiles: {pool["max_active_profiles"]}\n')

    rest_of_file = """
# --- Policies for the Authentication Simulation ---
auth_policy_enforcer_config:
  # Ban if 2 failures occur within a 1-minute window.
  #ban_on_failures: 2
  #ban_on_failures_window_minutes: 1

  # The standard rest policy is disabled, as rotation is handled by the profile group.
  # New rate limit policy to enforce requests-per-hour limits.
  # For guest sessions, the limit is ~300 videos/hour.
  rate_limit_requests: 0
  rate_limit_window_minutes: 60
  rate_limit_rest_duration_minutes: 5
  rest_after_requests: 0
  rest_duration_minutes: 10

  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
  # sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
  # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
  # The settings below should be configured to respect these limits.

  # New setting for load balancing across profile groups.
  # "longest_idle": Activates the profile that has been idle the longest across all groups (based on last_used time).
  #                 This is a global FIFO strategy that effectively cycles through profiles regardless of their group.
  # "least_loaded": Prioritizes activating a profile from the group with the fewest pending downloads.
  #                 If multiple groups have zero pending downloads, it acts as a FIFO queue, activating
  #                 the one that finished its last download batch the earliest. This is useful when you want
  #                 to ensure a group finishes its entire workload before another group starts.
  profile_selection_strategy: "longest_idle"

  # The 'global_max_active_profiles' setting is now superseded by the per-pool limits
  # defined in the 'enforcement_pools' section.
  # The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
  # The enforcer logic should be updated to read from there.
  proxy_work_minutes: 0
  proxy_rest_duration_minutes: 0

  # Global maximum time a proxy can be active before being rested, regardless of
  # other rules. Acts as a safety net. Set to 0 to disable.
  max_global_proxy_active_minutes: 0
  rest_duration_on_max_active: 10

  # Proxy-level ban on failure burst is disabled.
  proxy_ban_on_failures: 0
  proxy_ban_window_minutes: 2

  # Clean up locks held for more than 16 minutes (960s) to prevent stuck workers.
  # This should be longer than the docker container timeout (15m).
  unlock_stale_locks_after_seconds: 960

  # A short post-task cooldown for auth simulation profiles. When a batch is finished,
  # the profile is put into COOLDOWN briefly. This prevents a worker from immediately
  # re-locking the same profile, giving the policy enforcer a window to perform rotation.
  unlock_cooldown_seconds: 0

# --- Cross-simulation synchronization ---
# This section is simplified because the link between auth and download profiles
# is now defined in the `profile_group_definitions`.
cross_simulation_sync:
  # Which states to synchronize from auth to download.
  sync_states:
    - "BANNED"
  # If true, a BANNED state on an auth profile will force the download profile to also be BANNED.
  enforce_auth_lead: true
  # CRITICAL: Ensures the correct download profile GROUP is active.
  sync_active_profile: true
  # When an auth profile is in the 'waiting_downloads' state, ensure the matching download profile is active.
  sync_waiting_downloads: true

# --- Policies for the Download Simulation ---
download_policy_enforcer_config:
  # Ban if 1 failure occurs within a 1-minute window.
  ban_on_failures: 1
  ban_on_failures_window_minutes: 1

  # Standard rest policy is disabled in favor of group rotation.
  # New rate limit policy to enforce requests-per-hour limits.
  # For guest sessions, the limit is ~300 videos/hour. We set it slightly lower to be safe.
  rate_limit_requests: 280
  rate_limit_window_minutes: 60
  rate_limit_rest_duration_minutes: 5
  rest_after_requests: 0
  rest_duration_minutes: 20

  # NOTE on Rate Limits: With the default yt-dlp settings, the rate limit for guest
  # sessions is ~300 videos/hour (~1000 webpage/player requests per hour).
  # For accounts, it is ~2000 videos/hour (~4000 webpage/player requests per hour).
  # The settings below should be configured to respect these limits.

  # The 'profile_groups' section is now inherited from 'profile_group_definitions' above.
  # The enforcer logic should be updated to read from there.

  # Time-based proxy rules are disabled.
  proxy_work_minutes: 0
  proxy_rest_duration_minutes: 10

  # Global maximum time a proxy can be active before being rested, regardless of
  # other rules. Acts as a safety net. Set to 0 to disable.
  max_global_proxy_active_minutes: 0
  rest_duration_on_max_active: 10

  # Ban a proxy if 3 failures occur within a 1-minute window.
  proxy_ban_on_failures: 3
  proxy_ban_window_minutes: 1

  # Clean up download locks held for more than 16 minutes (960s) to allow for long downloads.
  # This should be longer than the docker container timeout (15m).
  unlock_stale_locks_after_seconds: 960

  # After a profile is used for a download, unlock it but put it in COOLDOWN
  # state for 2-3s. This is enforced by the worker, which reads this config from Redis.
  unlock_cooldown_seconds: [2, 3]
"""
    with open(output_path, 'a') as f:
        f.write(rest_of_file)
    print(f"Successfully generated enforcer policy at: {output_path}")

def main():
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print("Usage: ./tools/generate-profile-setup-policy.py <cluster-config-file> <output-profile-policy-file> [<output-enforcer-policy-file>]")
        sys.exit(1)

    config_path = sys.argv[1]
    profile_output_path = sys.argv[2]

    if not os.path.exists(config_path):
        print(f"Error: Cluster configuration file not found at '{config_path}'", file=sys.stderr)
        sys.exit(1)

    cluster_config = load_cluster_config(config_path)
    generate_policy(cluster_config, profile_output_path)

    if len(sys.argv) == 4:
        enforcer_output_path = sys.argv[3]
        generate_enforcer_policy(cluster_config, enforcer_output_path)

if __name__ == "__main__":
    main()