yt-dlp-dags/airflow/generate_envoy_config.py

#!/usr/bin/env python3
import os
import sys
import json
import re
import logging
import ipaddress
from typing import Optional

try:
    from jinja2 import Environment, FileSystemLoader
except ImportError:
    print("FATAL: jinja2 is not installed. Please run 'pip install jinja2'.", file=sys.stderr)
    sys.exit(1)

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def is_ip_address(address: str) -> bool:
    """Checks if a given string is a valid IP address (IPv4 or IPv6)."""
    if not address:
        return False
    try:
        ipaddress.ip_address(address)
        return True
    except ValueError:
        return False


def load_dotenv(dotenv_path):
    """
    Loads environment variables from a .env file.
    Does not override existing environment variables from the system.
    """
    if not os.path.exists(dotenv_path):
        logging.warning(f".env file not found at {dotenv_path}. Using system environment variables or defaults.")
        return
    try:
        with open(dotenv_path) as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#') and '=' in line:
                    key, value = line.split('=', 1)
                    key = key.strip()
                    value = value.strip()
                    # Remove surrounding quotes which are common in .env files
                    if (value.startswith('"') and value.endswith('"')) or \
                       (value.startswith("'") and value.endswith("'")):
                        value = value[1:-1]
                    # os.environ only takes strings
                    value = str(value)
                    if key not in os.environ:
                        os.environ[key] = value
        logging.info(f"Successfully loaded variables from {dotenv_path}")
    except Exception as e:
        logging.error(f"Failed to read or parse {dotenv_path}: {e}")
        # Continue, will use defaults or system env vars
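
# Example .env snippet this loader understands (hypothetical values, shown only
# for illustration): quotes are stripped, '#' lines are ignored, and variables
# already present in the system environment are NOT overridden.
#
#   YTDLP_WORKERS=3
#   VNC_PASSWORD="supersecret"
#   # CAMOUFOX_PROXIES is a comma-separated list of proxy URLs (see below)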


def _get_port_from_proxy_url(url: str) -> Optional[str]:
    """Extracts the port from a proxy URL string."""
    if not url or not isinstance(url, str):
        return None
    match = re.search(r':(\d+)$', url.strip())
    return match.group(1) if match else None
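
# Illustrative behaviour (the URLs are assumed examples, not real config):
#   _get_port_from_proxy_url("socks5://gateway:1080")  -> "1080"
#   _get_port_from_proxy_url("socks5://gateway")        -> None (no trailing port)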


def expand_env_vars(value: str) -> str:
    """
    Expands environment variables in a string, including default values.
    Supports ${VAR} and ${VAR:-default}.
    """
    if not isinstance(value, str):
        return value
    # Regex to find ${VAR:-default} or ${VAR}
    pattern = re.compile(r'\$\{(?P<var>\w+)(?::-(?P<default>.*?))?\}')

    def replacer(match):
        var_name = match.group('var')
        default_value = match.group('default')
        # Get value from os.environ, or use default, or empty string
        return os.getenv(var_name, default_value if default_value is not None else '')

    return pattern.sub(replacer, value)
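
# Illustrative expansion (assumed input; PROXY_HOST and PROXY_PORT unset here):
#   expand_env_vars("socks5://${PROXY_HOST:-172.17.0.1}:${PROXY_PORT:-1080}")
#     -> "socks5://172.17.0.1:1080"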


def generate_configs():
    """
    Generates envoy.yaml, docker-compose.camoufox.yaml, and camoufox_endpoints.json
    from Jinja2 templates and environment variables.
    """
    try:
        # --- Load .env file ---
        script_dir = os.path.dirname(os.path.abspath(__file__))
        dotenv_path = os.path.join(script_dir, '.env')
        load_dotenv(dotenv_path)

        # --- Common Configuration ---
        ytdlp_workers_str = os.getenv('YTDLP_WORKERS', '3').strip()
        try:
            # Handle empty string case by defaulting to 3, otherwise convert to int.
            worker_count = int(ytdlp_workers_str) if ytdlp_workers_str else 3
        except (ValueError, TypeError):
            logging.warning(f"Invalid value for YTDLP_WORKERS: '{ytdlp_workers_str}'. Defaulting to 3.")
            worker_count = 3
        if worker_count == 0:
            worker_count = os.cpu_count() or 1
            logging.info(f"YTDLP_WORKERS is 0, auto-detected {worker_count} CPU cores for worker and camoufox config.")

        config_dir = os.path.join(script_dir, 'config')
        os.makedirs(config_dir, exist_ok=True)

        env = Environment(loader=FileSystemLoader(script_dir), trim_blocks=True, lstrip_blocks=True)

        # Get service role from environment to determine what to generate
        service_role = os.getenv('SERVICE_ROLE', 'all-in-one')
        logging.info(f"Service role for generation: '{service_role}'")

        # --- Camoufox Configuration (only for worker/all-in-one roles) ---
        camoufox_proxies = []
        expanded_camoufox_proxies_str = ""
        if service_role != 'management':
            logging.info("--- Generating Camoufox (Remote Browser) Configuration ---")
            camoufox_proxies_str = os.getenv('CAMOUFOX_PROXIES')
            if not camoufox_proxies_str:
                logging.warning("CAMOUFOX_PROXIES environment variable not set. No camoufox instances will be generated.")
            else:
                # Expand environment variables within the string before splitting
                expanded_camoufox_proxies_str = expand_env_vars(camoufox_proxies_str)
                logging.info(f"Expanded CAMOUFOX_PROXIES from '{camoufox_proxies_str}' to '{expanded_camoufox_proxies_str}'")
                camoufox_proxies = [{'url': p.strip()} for p in expanded_camoufox_proxies_str.split(',') if p.strip()]
            logging.info(f"Found {len(camoufox_proxies)} proxy/proxies for Camoufox.")
            logging.info(f"Each Camoufox instance will support {worker_count} concurrent browser sessions.")
            logging.info(f"Total browser sessions supported on this worker: {len(camoufox_proxies) * worker_count}")
            vnc_password = os.getenv('VNC_PASSWORD', 'supersecret')
            base_vnc_port = int(os.getenv('CAMOUFOX_BASE_VNC_PORT', 5901))
            camoufox_port = int(os.getenv('CAMOUFOX_PORT', 12345))
            camoufox_backend_prefix = os.getenv('CAMOUFOX_BACKEND_PREFIX', 'camoufox-')

            # --- Generate docker-compose.camoufox.yaml ---
            compose_output_file = os.path.join(script_dir, 'docker-compose.camoufox.yaml')
            # Generate the compose file directly without a template
            with open(compose_output_file, 'w') as f:
                f.write("# THIS FILE IS AUTO-GENERATED BY generate_envoy_config.py\n")
                f.write("# DO NOT EDIT MANUALLY.\n")
                f.write("#\n")
                f.write("# It contains the service definitions for the camoufox instances\n")
                f.write("# and adds the necessary dependencies to the main services.\n")
                f.write("services:\n\n")
                # Generate a service definition for each proxy
                for i, proxy in enumerate(camoufox_proxies):
                    service_name = f"camoufox-{i+1}"
                    # Each container gets its own unique range of ports to avoid conflicts
                    container_base_port = camoufox_port + i * worker_count
                    host_base_port = container_base_port
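                    # For example (assumed defaults): with CAMOUFOX_PORT=12345 and
                    # worker_count=3, camoufox-1 gets ports 12345-12347 and
                    # camoufox-2 gets ports 12348-12350, published 1:1 on the host.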
f.write(f" {service_name}:\n")
f.write(f" build:\n")
f.write(f" context: ./camoufox\n")
f.write(f" dockerfile: Dockerfile\n")
f.write(f" args:\n")
f.write(f" VNC_PASSWORD: {vnc_password}\n")
f.write(f" image: camoufox:latest\n")
f.write(f" container_name: ytdlp-ops-{service_name}-1\n")
f.write(f" restart: unless-stopped\n")
f.write(f" shm_size: '2gb' # Mitigates browser crashes due to shared memory limitations\n")
f.write(f" ports:\n")
f.write(f" - \"{host_base_port}-{host_base_port + worker_count - 1}:{container_base_port}-{container_base_port + worker_count - 1}\"\n")
f.write(f" environment:\n")
f.write(f" - DISPLAY=:99\n")
f.write(f" - MOZ_HEADLESS_STACKSIZE=2097152\n")
f.write(f" - CAMOUFOX_MAX_MEMORY_MB=2048\n")
f.write(f" - CAMOUFOX_MAX_CONCURRENT_CONTEXTS=8\n")
f.write(f" - CAMOUFOX_RESTART_THRESHOLD_MB=1500\n")
f.write(f" volumes:\n")
f.write(f" - /tmp/.X11-unix:/tmp/.X11-unix:rw\n")
f.write(f" - camoufox-data-{i+1}:/app/context-data\n")
f.write(f" - camoufox-browser-cache:/root/.cache/ms-playwright # Persist browser binaries\n")
f.write(f" command: [\n")
f.write(f" \"--ws-host\", \"0.0.0.0\",\n")
f.write(f" \"--port\", \"{container_base_port}\",\n")
f.write(f" \"--num-instances\", \"{worker_count}\",\n")
f.write(f" \"--ws-path\", \"mypath\",\n")
f.write(f" \"--proxy-url\", \"{proxy['url']}\",\n")
f.write(f" \"--headless\",\n")
f.write(f" \"--monitor-resources\",\n")
f.write(f" \"--memory-restart-threshold\", \"1800\",\n")
f.write(f" \"--preferences\", \"layers.acceleration.disabled=true,dom.ipc.processCount=2,media.memory_cache_max_size=102400,browser.cache.memory.capacity=102400\"\n")
f.write(f" ]\n")
f.write(f" deploy:\n")
f.write(f" resources:\n")
f.write(f" limits:\n")
f.write(f" memory: 2.5G\n")
f.write(f" logging:\n")
f.write(f" driver: \"json-file\"\n")
f.write(f" options:\n")
f.write(f" max-size: \"100m\"\n")
f.write(f" max-file: \"3\"\n")
f.write(f" networks:\n")
f.write(f" - proxynet\n\n")
# Add camoufox-group service that depends on all camoufox instances
if camoufox_proxies:
f.write(" camoufox-group:\n")
f.write(" image: alpine:latest\n")
f.write(" command: [\"echo\", \"Camoufox group ready.\"]\n")
f.write(" restart: \"no\"\n")
f.write(" depends_on:\n")
for i in range(len(camoufox_proxies)):
f.write(f" - camoufox-{i+1}\n")
f.write(" networks:\n")
f.write(" - proxynet\n\n")
# Write volumes section
f.write("volumes:\n")
for i in range(len(camoufox_proxies)):
f.write(f" camoufox-data-{i+1}:\n")
if camoufox_proxies:
f.write(" camoufox-browser-cache:\n")
f.write("\n")
# Write networks section
f.write("networks:\n")
f.write(" proxynet:\n")
f.write(" name: airflow_proxynet\n")
f.write(" external: true\n")
logging.info(f"Successfully generated {compose_output_file} with {len(camoufox_proxies)} camoufox service(s).")
logging.info("This docker-compose file defines the remote browser services, one for each proxy.")
logging.info("----------------------------------------------------------")

            # --- Generate camoufox_endpoints.json ---
            endpoints_map = {}
            for i, proxy in enumerate(camoufox_proxies):
                proxy_port = _get_port_from_proxy_url(proxy['url'])
                if proxy_port:
                    container_base_port = camoufox_port + i * worker_count
                    endpoints = []
                    for j in range(worker_count):
                        port = container_base_port + j
                        endpoints.append(f"ws://{camoufox_backend_prefix}{i+1}:{port}/mypath")
                    endpoints_map[proxy_port] = {
                        "ws_endpoints": endpoints
                    }
                else:
                    logging.warning(f"Could not extract port from proxy URL: {proxy['url']}. Skipping for endpoint map.")
            endpoints_data = {"endpoints": endpoints_map}
            camoufox_dir = os.path.join(script_dir, 'camoufox')
            endpoints_output_file = os.path.join(camoufox_dir, 'camoufox_endpoints.json')
            with open(endpoints_output_file, 'w') as f:
                json.dump(endpoints_data, f, indent=2)
            logging.info(f"Successfully generated {endpoints_output_file} with {len(endpoints_map)} port-keyed endpoint(s).")
            logging.info("This file maps each proxy to a list of WebSocket endpoints for Camoufox.")
            logging.info("The token_generator uses this map to connect to the correct remote browser.")
        else:
            logging.info("Skipping Camoufox configuration generation for 'management' role.")

        # --- Generate docker-compose-ytdlp-ops.yaml ---
        ytdlp_ops_template = env.get_template('docker-compose-ytdlp-ops.yaml.j2')
        ytdlp_ops_output_file = os.path.join(script_dir, 'docker-compose-ytdlp-ops.yaml')

        # Combine all proxies (camoufox and general) into a single string for the server.
        all_proxies = []
        if expanded_camoufox_proxies_str:
            all_proxies.extend([p.strip() for p in expanded_camoufox_proxies_str.split(',') if p.strip()])

        general_proxies_str = os.getenv('GENERAL_PROXIES')
        if general_proxies_str:
            expanded_general_proxies_str = expand_env_vars(general_proxies_str)
            logging.info(f"Expanded GENERAL_PROXIES from '{general_proxies_str}' to '{expanded_general_proxies_str}'")
            general_proxies = [p.strip() for p in expanded_general_proxies_str.split(',') if p.strip()]
            all_proxies.extend(general_proxies)
            logging.info(f"Adding {len(general_proxies)} general purpose proxy/proxies.")

        # Also check for the SOCKS5_SOCK_SERVER_IP for backward compatibility with docs
        socks_server_ip = os.getenv('SOCKS5_SOCK_SERVER_IP', '172.17.0.1')
        if socks_server_ip:
            socks_server_port = os.getenv('SOCKS5_SOCK_SERVER_PORT', '1087')
            general_proxy_url = f"socks5://{socks_server_ip}:{socks_server_port}"
            if general_proxy_url not in all_proxies:
                all_proxies.append(general_proxy_url)
                logging.info(f"Adding general purpose proxy from SOCKS5_SOCK_SERVER_IP: {general_proxy_url}")

        combined_proxies_str = ",".join(all_proxies)
        logging.info(f"Combined proxy string for ytdlp-ops-service: '{combined_proxies_str}'")
        ytdlp_ops_config_data = {
            'combined_proxies_str': combined_proxies_str,
            'service_role': service_role,
        }
        rendered_ytdlp_ops_config = ytdlp_ops_template.render(ytdlp_ops_config_data)
        with open(ytdlp_ops_output_file, 'w') as f:
            f.write(rendered_ytdlp_ops_config)
        logging.info(f"Successfully generated {ytdlp_ops_output_file}")

        # --- Envoy Configuration ---
        envoy_port = int(os.getenv('ENVOY_PORT', 9080))
        base_port = int(os.getenv('YTDLP_BASE_PORT', 9090))
        envoy_admin_port = int(os.getenv('ENVOY_ADMIN_PORT', 9901))
        # For local dev, ENVOY_BACKEND_ADDRESS is set to 127.0.0.1. For Docker, it's unset, so we default to the service name.
        backend_address = os.getenv('ENVOY_BACKEND_ADDRESS', 'ytdlp-ops-service')
        # Use STATIC for IP addresses, and STRICT_DNS for anything else (hostnames).
        envoy_cluster_type = 'STATIC' if is_ip_address(backend_address) else 'STRICT_DNS'

        # --- Generate envoy.yaml ---
        envoy_template = env.get_template('envoy.yaml.j2')
        envoy_output_file = os.path.join(script_dir, 'envoy.yaml')
        logging.info("--- Generating Envoy Configuration ---")
        logging.info(f"Envoy will listen on public port: {envoy_port}")
        logging.info(f"It will load balance requests across {worker_count} internal gRPC endpoints of the 'ytdlp-ops-service'.")
        logging.info(f"The backend service is located at: '{backend_address}' (type: {envoy_cluster_type})")
        envoy_config_data = {
            'envoy_port': envoy_port,
            'worker_count': worker_count,
            'base_port': base_port,
            'envoy_admin_port': envoy_admin_port,
            'backend_address': backend_address,
            'envoy_cluster_type': envoy_cluster_type,
        }
        rendered_envoy_config = envoy_template.render(envoy_config_data)
        with open(envoy_output_file, 'w') as f:
            f.write(rendered_envoy_config)
        logging.info(f"Successfully generated {envoy_output_file}")

        logging.info("--- Configuration Generation Complete ---")
    except Exception as e:
        logging.error(f"Failed to generate configurations: {e}", exc_info=True)
        sys.exit(1)


if __name__ == '__main__':
    generate_configs()
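
# Typical usage (an assumption about the surrounding deployment, not stated in
# this file): run the script before starting the stack so that envoy.yaml,
# docker-compose.camoufox.yaml and docker-compose-ytdlp-ops.yaml are up to date:
#
#   python3 generate_envoy_config.py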