yt-dlp-dags/airflow/camoufox/camoufox_server.py

453 lines
20 KiB
Python

#!/usr/bin/env python3
import re
import argparse
import atexit
import shutil
import logging
import sys
import os
import psutil
import time
import threading
import signal
import asyncio
import websockets
from collections import deque, defaultdict
from datetime import datetime, timedelta
from camoufox.server import launch_server
# ---------------------------------------------------------------------------
# Module-level state shared by the monitor thread, the signal handler and
# main().
# ---------------------------------------------------------------------------

# Lifecycle flags.
shutdown_requested = False   # set by graceful_shutdown(); polled by the loops
restart_in_progress = False  # when true, graceful_shutdown() re-execs the script

# Per-port connection bookkeeping, refreshed by monitor_resources().
active_connections = defaultdict(int)  # port -> current ESTABLISHED connections
max_connections = defaultdict(int)     # port -> highest count observed so far

# One lock per endpoint, created lazily on first access.
endpoint_locks = defaultdict(threading.Lock)

# Latest snapshot published by monitor_resources() (empty until its first pass).
resource_stats = {}

# Registry for server instances.
server_instances = {}

# Process RSS (in MB) above which monitor_resources() logs a critical warning.
# main() overwrites this from --memory-restart-threshold.
memory_restart_threshold = 1800

# Connection pool counters reported in the monitoring logs; all start at zero.
connection_pool_metrics = dict.fromkeys(
    (
        'total_acquired',
        'total_released',
        'total_reused',
        'pool_size',
        'active_contexts',
    ),
    0,
)
def parse_proxy_url(url):
    """Parse a proxy URL of the form ``proto://[user:pass@]host:port``.

    The whole string must match: trailing garbage after the port is rejected
    (a single trailing ``/`` is tolerated). The password may itself contain
    ``@``; the last ``@`` before the host separates credentials from host.

    Args:
        url: Proxy URL string, e.g. ``socks5://user:pass@10.0.0.1:1080``.

    Returns:
        A dict with a ``'server'`` key (``proto://host:port``) and, only when
        credentials were supplied, ``'username'`` and ``'password'`` keys.

    Raises:
        ValueError: If ``url`` does not match the expected format or the port
            is outside 1-65535.
    """
    # Host excludes '@' and '/' so stray credentials or a path can never be
    # absorbed into the host name. The password is greedy so that it extends
    # up to the last '@' in front of the host.
    pattern = r'([^:]+)://(?:([^:]+):(.+)@)?([^:@/]+):([0-9]+)/?'
    match = re.fullmatch(pattern, url)
    if not match:
        raise ValueError('Invalid proxy URL format. Expected proto://[user:pass@]host:port')

    proto, username, password, host, port = match.groups()
    if not 0 < int(port) <= 65535:
        raise ValueError(f'Invalid proxy port {port}. Expected a value between 1 and 65535')

    proxy_config = {'server': f'{proto}://{host}:{port}'}
    # Credentials are optional; omit the keys entirely when they are absent.
    if username:
        proxy_config['username'] = username
    if password:
        proxy_config['password'] = password
    return proxy_config
def monitor_resources(server_ports, proxy_url):
    """Poll resource usage and log stats/warnings until shutdown is requested.

    Loops until the module-level ``shutdown_requested`` flag becomes true.
    Each pass (a 1 s CPU sampling window followed by a 30 s sleep):

    * samples system CPU/memory and this process's RSS and CPU usage,
    * counts ESTABLISHED inet connections whose local port is in
      ``server_ports`` and stores them in ``active_connections`` /
      ``max_connections``,
    * rebinds the module-level ``resource_stats`` snapshot, and
    * emits stats, threshold warnings and a heartbeat via ``logging``.

    Thresholds only produce log output; nothing is restarted or throttled.

    NOTE(review): process memory is measured for this Python process only;
    if the browser runs in child processes their memory is presumably not
    included - confirm before relying on the memory threshold.

    Args:
        server_ports: Iterable of local port numbers to watch.
        proxy_url: Proxy URL (or None); used only in log messages.
    """
    # Only ``resource_stats`` is rebound here; the other module-level
    # containers are mutated in place and need no ``global`` declaration.
    global resource_stats

    logging.info(f"Resource monitor started for proxy '{proxy_url}' on ports {server_ports}")

    watched_ports = set(server_ports)  # O(1) membership test per connection

    # Reuse a single Process handle: Process.cpu_percent() measures the time
    # elapsed since the previous call on the *same* object, so creating a new
    # handle on every pass would always report a meaningless 0.0.
    current_process = psutil.Process()
    current_process.cpu_percent()  # priming call; its result is discarded

    log_counter = 0
    while not shutdown_requested:
        log_counter += 1
        try:
            # System-wide usage (cpu_percent blocks for the 1 s sample window).
            cpu_percent = psutil.cpu_percent(interval=1)
            memory_percent = psutil.virtual_memory().percent

            # Usage of this process.
            process_memory = current_process.memory_info()
            process_memory_mb = process_memory.rss / 1024 / 1024
            process_cpu = current_process.cpu_percent()

            # Recount established connections per watched local port.
            new_active_connections = defaultdict(int)
            for conn in psutil.net_connections(kind='inet'):
                if conn.status == psutil.CONN_ESTABLISHED and conn.laddr.port in watched_ports:
                    new_active_connections[conn.laddr.port] += 1
            active_connections.clear()
            active_connections.update(new_active_connections)
            for port, count in active_connections.items():
                max_connections[port] = max(max_connections.get(port, 0), count)

            total_active = sum(active_connections.values())
            connection_pool_metrics['active_contexts'] = total_active

            # Publish a fresh snapshot (copies, so later mutation of the live
            # containers does not alter an already-published snapshot).
            resource_stats = {
                'cpu_percent': cpu_percent,
                'memory_percent': memory_percent,
                'process_memory_mb': process_memory_mb,
                'process_cpu_percent': process_cpu,
                'total_active_connections': total_active,
                'active_connections_per_endpoint': dict(active_connections),
                'max_connections': dict(max_connections),
                'connection_pool_metrics': dict(connection_pool_metrics)
            }

            # Detailed stats are logged only while the system is under load.
            if cpu_percent > 80 or memory_percent > 80:
                logging.info(f"RESOURCE STATS - CPU: {cpu_percent}%, Memory: {memory_percent}%, "
                             f"Process Memory: {process_memory_mb:.1f}MB, "
                             f"Total Active Connections: {total_active}")
                pool_metrics = resource_stats['connection_pool_metrics']
                logging.info(f"POOL METRICS - Acquired: {pool_metrics['total_acquired']}, "
                             f"Released: {pool_metrics['total_released']}, "
                             f"Reused: {pool_metrics['total_reused']}, "
                             f"Pool Size: {pool_metrics['pool_size']}, "
                             f"Active Contexts: {pool_metrics['active_contexts']}")

            # Warning thresholds.
            if cpu_percent > 85:
                logging.warning(f"HIGH CPU USAGE: {cpu_percent}%")
            if memory_percent > 85:
                logging.warning(f"HIGH MEMORY USAGE: {memory_percent}%")
            if total_active > 100:
                logging.warning(f"HIGH TOTAL CONNECTION COUNT: {total_active} active connections")
            if process_memory.rss > 2 * 1024 * 1024 * 1024:  # 2GB
                logging.warning(f"HIGH PROCESS MEMORY: {process_memory_mb:.1f}MB")

            # Safety net: warn (no automatic restart) when the configured
            # memory threshold is exceeded.
            if process_memory_mb > memory_restart_threshold:
                logging.warning(f"MEMORY THRESHOLD EXCEEDED: {process_memory_mb}MB > {memory_restart_threshold}MB")
                logging.warning("Manual intervention required - memory usage critical but restart disabled")
                logging.warning("Consider adding new camoufox instances or reducing concurrent workers")
                # Machine-greppable metric line for external monitoring.
                logging.info(f"MEMORY_ALERT: {process_memory_mb}MB used on {total_active} active connections")

            # Heartbeat on every second pass (about once a minute).
            if log_counter % 2 == 0:
                logging.info(
                    f"HEARTBEAT - Proxy: {proxy_url} | Ports: {server_ports} | "
                    f"Memory: {resource_stats.get('process_memory_mb', 0):.1f}MB | "
                    f"CPU: {resource_stats.get('cpu_percent', 0)}% | "
                    f"Active Connections: {resource_stats.get('total_active_connections', 0)}"
                )
        except Exception as e:
            # Monitoring is best-effort: log and try again on the next pass.
            logging.error(f"Error in resource monitoring: {e}")
        time.sleep(30)  # pause between passes
def graceful_shutdown(signum, frame):
    """Signal handler: flag shutdown, log final statistics, then exit.

    Sets the module-level ``shutdown_requested`` flag, logs the last
    ``resource_stats`` snapshot (if any) and ``connection_pool_metrics``,
    and terminates the process with exit status 0. If ``restart_in_progress``
    is true, the script is re-executed in place with the same arguments
    instead.

    Args:
        signum: Signal number passed by the ``signal`` module (unused).
        frame: Current stack frame passed by the ``signal`` module (unused).
    """
    global shutdown_requested

    logging.info("Graceful shutdown requested")
    shutdown_requested = True

    # Final statistics for post-mortem analysis.
    if resource_stats:
        logging.info(f"Final resource stats: {resource_stats}")
    logging.info(f"Final connection pool metrics: {connection_pool_metrics}")

    # Server instances live in daemon threads, which end together with the
    # main process; there is no explicit per-instance shutdown call.
    logging.info("Shutting down all Camoufox server instances...")

    if not restart_in_progress:
        sys.exit(0)

    # Restart requested: replace the current process image with a fresh run.
    logging.info("Restarting Camoufox server...")
    os.execv(sys.executable, [sys.executable, *sys.argv])
    sys.exit(0)
def create_server_instance(port, base_config):
    """Run one Camoufox server on ``port`` using a copy of ``base_config``.

    ``launch_server`` blocks for as long as the server runs, so this function
    is meant to be the target of a dedicated thread. ``base_config`` itself is
    not modified; the port is set on a shallow copy.

    Args:
        port: Port number for this server instance.
        base_config: Dict of keyword arguments shared by all instances.
    """
    instance_config = base_config.copy()
    instance_config.update(port=port)
    try:
        # Blocks while the server is running.
        launch_server(**instance_config)
    except Exception as exc:
        # Log with traceback and let the calling thread finish.
        logging.error(f'Error launching server on port {port}: {exc}', exc_info=True)
def check_listening_ports(expected_ports, log_results=True):
    """Split ``expected_ports`` into listening and non-listening ports.

    Looks at all inet sockets on the system (not only those owned by this
    process, because the server may run in a child process) and checks which
    expected ports have a socket in LISTEN state.

    Args:
        expected_ports: Iterable of port numbers to verify.
        log_results: When true, log a summary (or the failure reason).

    Returns:
        A ``(successful_ports, failed_ports)`` tuple of lists, in the order
        the ports were given. If the lookup raises, the error is logged
        (when ``log_results`` is true) and whatever was collected so far is
        returned.
    """
    successful_ports = []
    failed_ports = []
    try:
        listening_ports = set()
        for conn in psutil.net_connections(kind='inet'):
            if conn.status == psutil.CONN_LISTEN:
                listening_ports.add(conn.laddr.port)

        for port in expected_ports:
            bucket = successful_ports if port in listening_ports else failed_ports
            bucket.append(port)

        if log_results:
            logging.info("--- Verifying Listening Ports ---")
            if successful_ports:
                logging.info(f"Successfully listening on ports: {sorted(successful_ports)}")
            if failed_ports:
                logging.error(f"FAILED to listen on ports: {sorted(failed_ports)}")
            logging.info("---------------------------------")
    except Exception as exc:
        if log_results:
            logging.error(f"Could not verify listening ports: {exc}")
    return successful_ports, failed_ports
def _build_arg_parser():
    """Build the command-line parser for the Camoufox launcher."""
    parser = argparse.ArgumentParser(description='Launch Camoufox server with optional proxy support')
    parser.add_argument('--proxy-url', help='Optional proxy URL in format proto://user:pass@host:port (supports http, https, socks5)')
    parser.add_argument('--ws-host', default='0.0.0.0', help='WebSocket server host address (e.g., localhost, 0.0.0.0)')
    parser.add_argument('--port', type=int, default=12345, help='Base WebSocket server port')
    parser.add_argument('--num-instances', type=int, default=4, help='Number of server instances to create')
    parser.add_argument('--port-range', type=str, help='Port range in format start-end (e.g., 12345-12349)')
    parser.add_argument('--base-proxy-port', type=int, default=1080, help='Base proxy port for mapping to camoufox instances')
    parser.add_argument('--ws-path', default='camoufox', help='Base WebSocket server path')
    parser.add_argument('--headless', action='store_true', help='Run browser in headless mode')
    parser.add_argument('--geoip', nargs='?', const=True, default=False,
                        help='Enable geo IP protection. Can specify IP address or use True for automatic detection')
    parser.add_argument('--locale', help='Locale(s) to use (e.g. "en-US" or "en-US,fr-FR")')
    parser.add_argument('--block-images', action='store_true', help='Block image requests to save bandwidth')
    parser.add_argument('--block-webrtc', action='store_true', help='Block WebRTC entirely')
    parser.add_argument('--humanize', nargs='?', const=True, type=float,
                        help='Humanize cursor movements. Can specify max duration in seconds')
    parser.add_argument('--extensions', type=str,
                        help='Comma-separated list of extension paths to enable (XPI files or extracted directories). Use quotes if paths contain spaces.')
    parser.add_argument('--persistent-context', action='store_true', help='Enable persistent browser context.')
    parser.add_argument('--user-data-dir', type=str, help='Directory to store persistent browser data.')
    parser.add_argument('--preferences', type=str, help='Comma-separated list of Firefox preferences (e.g. "key1=value1,key2=value2")')
    # Resource monitoring options.
    parser.add_argument('--monitor-resources', action='store_true', help='Enable resource monitoring')
    parser.add_argument('--max-connections-per-instance', type=int, default=50, help='Maximum concurrent connections per instance')
    parser.add_argument('--connection-timeout', type=int, default=300, help='Connection timeout in seconds')
    parser.add_argument('--memory-restart-threshold', type=int, default=1800, help='Memory threshold (MB) to trigger warning')
    return parser


def _parse_preferences(raw_preferences):
    """Turn ``"key1=value1,key2=value2"`` into a preferences dict.

    Keys and values are stripped of surrounding whitespace *before* type
    detection, so ``"k = true"`` yields ``True`` rather than the string
    ``"true"``. ``true``/``false`` (any case) become booleans, all-decimal
    values become ints, everything else stays a string.
    """
    preferences = {}
    for pref in raw_preferences.split(','):
        if '=' not in pref:
            if pref.strip():
                print(f"Warning: Ignoring malformed preference (expected key=value): {pref.strip()}")
            continue
        key, value = (part.strip() for part in pref.split('=', 1))
        if value.lower() in ('true', 'false'):
            preferences[key] = value.lower() == 'true'
        elif value.isdecimal():
            # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
            preferences[key] = int(value)
        else:
            preferences[key] = value
    return preferences


def _load_extensions(raw_extensions):
    """Resolve a comma-separated extension list into loadable directories.

    ``.xpi`` files are extracted into temporary directories that are removed
    at interpreter exit; directories are accepted when they contain a
    ``manifest.json``. Invalid entries are reported and skipped.

    Returns:
        List of directory paths (strings) for the valid extensions.
    """
    from pathlib import Path
    import tempfile
    import zipfile

    valid_extensions = []
    temp_dirs_to_cleanup = []

    def cleanup_temp_dirs():
        for temp_dir in temp_dirs_to_cleanup:
            try:
                shutil.rmtree(temp_dir)
                print(f"Cleaned up temporary extension directory: {temp_dir}")
            except Exception as e:
                print(f"Warning: Failed to clean up temp dir {temp_dir}: {e}")

    atexit.register(cleanup_temp_dirs)

    for raw_path in raw_extensions.split(','):
        raw_path = raw_path.strip()
        if not raw_path:
            # An empty entry (e.g. "a,,b") would otherwise resolve to the CWD.
            continue
        ext_path = Path(raw_path).absolute()
        if not ext_path.exists():
            print(f"Warning: Extension path does not exist: {ext_path}")
            continue

        if ext_path.is_file() and ext_path.suffix == '.xpi':
            temp_dir = None  # stays None if mkdtemp itself fails
            try:
                temp_dir = tempfile.mkdtemp(prefix=f"camoufox_ext_{ext_path.stem}_")
                with zipfile.ZipFile(ext_path, 'r') as zip_ref:
                    zip_ref.extractall(temp_dir)
            except Exception as e:
                print(f"Error loading extension {ext_path}: {str(e)}")
                # Remove a partially extracted directory right away; it is
                # not registered for exit-time cleanup.
                if temp_dir is not None:
                    shutil.rmtree(temp_dir, ignore_errors=True)
                continue
            temp_dirs_to_cleanup.append(temp_dir)
            valid_extensions.append(temp_dir)
            print(f"Successfully loaded extension: {ext_path.name} (extracted to {temp_dir})")
        elif ext_path.is_dir():
            if (ext_path / 'manifest.json').exists():
                valid_extensions.append(str(ext_path))
                print(f"Successfully loaded extension: {ext_path.name}")
            else:
                print(f"Warning: Directory is not a valid Firefox extension: {ext_path}")
        else:
            print(f"Warning: Invalid extension path: {ext_path}")

    return valid_extensions


def _resolve_ports(args):
    """Return the list of ports to launch, from --port-range or --port/--num-instances.

    Raises:
        ValueError: If the range is malformed or reversed, the instance count
            is not positive, or any port falls outside 1-65535.
    """
    if args.port_range:
        try:
            start_text, end_text = args.port_range.split('-')
            start_port, end_port = int(start_text), int(end_text)
        except ValueError:
            raise ValueError(
                f"Invalid --port-range '{args.port_range}'. Expected start-end (e.g., 12345-12349)"
            ) from None
        if start_port > end_port:
            raise ValueError(f"Invalid --port-range '{args.port_range}': start is greater than end")
        ports = list(range(start_port, end_port + 1))
    else:
        if args.num_instances < 1:
            raise ValueError(f"--num-instances must be at least 1 (got {args.num_instances})")
        ports = [args.port + i for i in range(args.num_instances)]

    out_of_range = [port for port in ports if not 0 < port <= 65535]
    if out_of_range:
        raise ValueError(f"Ports out of range (1-65535): {out_of_range}")
    return ports


def main():
    """Parse CLI options, launch the Camoufox server instances and wait.

    Steps: parse arguments, install SIGTERM/SIGINT handlers, parse the
    optional proxy URL, configure root logging to stdout at DEBUG level,
    build the shared server configuration, optionally start the resource
    monitor thread, start one daemon thread per port, then idle until
    ``shutdown_requested`` is set.

    The function returns (without raising) when the proxy URL is invalid or
    when setup fails; the error is printed and logged.

    NOTE(review): ``--headless``, ``--geoip``, ``--base-proxy-port``,
    ``--max-connections-per-instance`` and ``--connection-timeout`` are
    accepted but not used by this function.
    """
    global memory_restart_threshold

    args = _build_arg_parser().parse_args()

    # Threshold read by monitor_resources().
    memory_restart_threshold = args.memory_restart_threshold

    # Signal handlers for graceful shutdown.
    signal.signal(signal.SIGTERM, graceful_shutdown)
    signal.signal(signal.SIGINT, graceful_shutdown)

    proxy_config = None
    if args.proxy_url:
        try:
            proxy_config = parse_proxy_url(args.proxy_url)
        except ValueError as e:
            print(f'Error parsing proxy URL: {e}')
            return
        # NOTE(review): this echoes the raw URL, including any credentials.
        print(f"Using proxy configuration: {args.proxy_url}")
    else:
        print("No proxy URL provided. Running without proxy.")

    # --- Basic Logging Configuration ---
    # Replace any existing root handlers with a single stdout handler.
    log_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    log_handler = logging.StreamHandler(sys.stdout)
    log_handler.setFormatter(log_formatter)
    root_logger = logging.getLogger()
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)
    root_logger.addHandler(log_handler)
    root_logger.setLevel(logging.DEBUG)
    logging.debug("DEBUG logging enabled. Starting Camoufox server setup...")
    # --- End Logging Configuration ---

    try:
        display_var = os.environ.get('DISPLAY')
        logging.info(f"Value of DISPLAY environment variable: {display_var}")

        # Configuration shared by every server instance.
        base_config = {
            'headless': False,  # Force non-headless mode for VNC (ignores --headless)
            'geoip': True,      # Always enabled, with or without a proxy (ignores --geoip)
            'host': args.ws_host,
            'ws_path': args.ws_path,
            # NOTE(review): DISPLAY may be None when the variable is unset;
            # confirm the launcher tolerates a None value here.
            'env': {'DISPLAY': display_var}
        }

        # Add proxy to config only if it was successfully parsed.
        if proxy_config:
            base_config['proxy'] = proxy_config

        # Optional parameters.
        if args.locale:
            base_config['locale'] = args.locale
        if args.block_images:
            base_config['block_images'] = True
        if args.block_webrtc:
            base_config['block_webrtc'] = True
        if args.humanize:
            # A bare --humanize yields True; --humanize N yields the float N.
            base_config['humanize'] = args.humanize if isinstance(args.humanize, float) else True

        # Persistent context options.
        if args.persistent_context:
            base_config['persistent_context'] = True
        if args.user_data_dir:
            base_config['user_data_dir'] = args.user_data_dir

        # Firefox preferences.
        if args.preferences:
            base_config['preferences'] = _parse_preferences(args.preferences)
            print(f"Applied Firefox preferences: {base_config['preferences']}")

        # Exclude default addons including uBlock Origin.
        # NOTE(review): verify that the installed camoufox version accepts
        # these string identifiers for exclude_addons.
        base_config['exclude_addons'] = ['ublock_origin', 'default_addons']
        print('Excluded default addons including uBlock Origin')

        # Custom extensions.
        if args.extensions:
            valid_extensions = _load_extensions(args.extensions)
            if valid_extensions:
                base_config['addons'] = valid_extensions
                print(f"Loaded {len(valid_extensions)} extensions")
            else:
                print("Warning: No valid extensions were loaded")

        ports_to_create = _resolve_ports(args)

        # Start the resource monitor thread if enabled, passing it the ports
        # to watch and the proxy URL for more descriptive logging.
        if args.monitor_resources:
            monitor_thread = threading.Thread(
                target=monitor_resources, args=(ports_to_create, args.proxy_url), daemon=True)
            monitor_thread.start()

        print(f"Attempting to launch {len(ports_to_create)} Camoufox server instances on ports: {ports_to_create}")
        for port in ports_to_create:
            # launch_server is blocking, so each instance runs in its own thread.
            thread = threading.Thread(target=create_server_instance, args=(port, base_config), daemon=True)
            thread.start()
            # Small delay between launches to avoid race conditions in the
            # underlying Playwright/Camoufox library.
            time.sleep(1)

        # The main thread only hosts the daemon threads from here on; actual
        # readiness is determined by the start_camoufox.sh script.
        print("Server threads launched. Main process will now wait for shutdown signal.")

        # Log startup resource usage.
        memory_info = psutil.Process().memory_info()
        logging.info(f"Server started. Initial memory usage: {memory_info.rss / 1024 / 1024:.1f}MB")

        # Keep the main thread alive to handle shutdown signals.
        try:
            while not shutdown_requested:
                time.sleep(1)
        except KeyboardInterrupt:
            logging.info("Received KeyboardInterrupt, shutting down...")
    except Exception as e:
        print(f'Error launching server: {str(e)}')
        logging.error(f'Error launching server: {str(e)}', exc_info=True)
        if 'Browser.setBrowserProxy' in str(e):
            print('Note: The browser may not support SOCKS5 proxy authentication')
        return
# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()