Migrate to new DAGs; add proxy banning and sensor/worker DAGs

This commit is contained in:
aperez 2025-07-25 17:59:24 +03:00
parent fc2d740b65
commit 61906a57ef
15 changed files with 1141 additions and 2667 deletions

View File

@@ -1,24 +1,64 @@
-# Use a base Python image
-FROM python:3.11-slim
+# Use ubuntu:22.04 as the base image
+FROM ubuntu:22.04
 # Set working directory
 WORKDIR /app
-# Install necessary system packages for Playwright, GeoIP, and Xvfb
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libgeoip1 \
-    # Xvfb for headless browser display
-    xvfb \
-    # Playwright browser dependencies
-    libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
-    && rm -rf /var/lib/apt/lists/*
-# Install Python dependencies: camoufox with geoip support and playwright==1.49
-# Using --no-cache-dir to reduce image size
-RUN pip install --no-cache-dir "camoufox[geoip]" playwright==1.49
+# Set timezone and non-interactive frontend for apt
+ARG DEBIAN_FRONTEND=noninteractive
+ARG TZ=Europe/Minsk
+ENV TZ=${TZ} LANG=C.UTF-8 LC_ALL=C.UTF-8
+# Install necessary system packages for Playwright, GeoIP, Xvfb, and VNC
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    # From user example
+    vim lsof unzip wget ca-certificates \
+    # From existing Dockerfile, kept for completeness
+    libgeoip1 \
+    dbus-x11 \
+    xvfb \
+    xserver-common \
+    xauth \
+    x11-xkb-utils \
+    xfonts-base \
+    procps \
+    libgl1-mesa-dri \
+    x11vnc \
+    fluxbox \
+    libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
+    libgtk-3-0 libx11-xcb1 fonts-liberation tzdata \
+    xauth util-linux x11-xserver-utils \
+    && \
+    # Configure timezone
+    ln -fs /usr/share/zoneinfo/${TZ} /etc/localtime && \
+    dpkg-reconfigure -f noninteractive tzdata && \
+    rm -rf /var/lib/apt/lists/*
+# Add build-time argument for VNC password
+ARG VNC_PASSWORD="vncpassword"
+# Set up VNC password from build argument
+RUN mkdir -p /root/.vnc && \
+    x11vnc -storepasswd "${VNC_PASSWORD}" /root/.vnc/passwd
+# Install Miniconda
+RUN wget --no-check-certificate https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \
+    bash /tmp/miniconda.sh -b -p /opt/conda && \
+    rm /tmp/miniconda.sh
+ENV PATH="/opt/conda/bin:$PATH"
+# Create conda environment and configure it
+RUN conda init bash && \
+    conda config --set always_yes yes && \
+    conda tos accept --override-channels --channel defaults && \
+    conda create -n camo python=3.11 -y
+# Install Python dependencies in conda environment
+RUN conda run -n camo pip install --no-cache-dir "camoufox[geoip]" playwright==1.49
 # Install Playwright browsers for version 1.49
-RUN playwright install --with-deps
+RUN conda run -n camo playwright install --with-deps
 # Copy the server script into the image
 COPY camoufox_server.py .
@@ -32,11 +72,38 @@
 COPY youtube_ad_auto_skipper-0.6.0.xpi /app/extensions/
 # Expose the default port Camoufox might use (adjust if needed)
 # This is informational; the actual port mapping is in docker-compose.
 EXPOSE 12345
+# Expose VNC port
+EXPOSE 5900
 # Copy the wrapper script and make it executable
 COPY start_camoufox.sh /app/
-RUN chmod +x /app/start_camoufox.sh
+RUN chmod +x /app/start_camoufox.sh && \
+    sed -i 's/\r$//' /app/start_camoufox.sh
-# Default command executes the wrapper script.
-# Arguments for camoufox_server.py will be passed via docker-compose command section.
-ENTRYPOINT ["/app/start_camoufox.sh"]
+# Configure Xvfb resolution via build arguments
+ARG RESOLUTION="1920x1080x24"
+ENV XVFB_RES="${RESOLUTION}" \
+    DISPLAY=":99" \
+    XAUTHORITY="/tmp/.Xauth"
+# Create Xauth setup (mcookie installed in previous apt-get)
+RUN touch /tmp/.Xauth && \
+    chmod 644 /tmp/.Xauth && \
+    echo "#!/bin/bash" > /init_x11.sh && \
+    echo "xauth add \$DISPLAY . \$(mcookie)" >> /init_x11.sh && \
+    echo "xhost +local:" >> /init_x11.sh && \
+    chmod +x /init_x11.sh
+# Proper ENTRYPOINT using shell form
+#ENTRYPOINT ["/bin/bash", "-c", "source /init_x11.sh && exec xvfb-run --auto-servernum --server-args \"-screen 0 ${XVFB_RES} ${XVFB_ARGS}\" /app/start_camoufox.sh"]
+ENTRYPOINT ["/bin/bash", "-c", "\
+    rm -f /tmp/.X99-lock && \
+    Xvfb :99 -screen 0 ${XVFB_RES} -ac & \
+    export DISPLAY=:99 && \
+    sleep 1 && \
+    touch /tmp/.Xauth && \
+    xauth add :99 . $(mcookie) && \
+    xhost +local: && \
+    source /init_x11.sh && \
+    exec /app/start_camoufox.sh \"$@\"", "camoufox-entrypoint"]

View File

@@ -1,58 +1,39 @@
 #!/bin/bash
-# Set error handling
 set -e
-# Function to cleanup resources on exit
+# Global PIDs for cleanup
+VNC_PID=""
+FLUXBOX_PID=""
+# Cleanup function to terminate background processes on script exit
 cleanup() {
-    echo "Cleaning up resources..."
-    # Kill Xvfb if it's running
-    if [ -n "$XVFB_PID" ] && ps -p $XVFB_PID > /dev/null; then
-        echo "Stopping Xvfb (PID: $XVFB_PID)"
-        kill $XVFB_PID || true
-    fi
-    # Remove X lock files if they exist
-    if [ -e "/tmp/.X99-lock" ]; then
-        echo "Removing X lock file"
-        rm -f /tmp/.X99-lock
-    fi
-    echo "Cleanup complete"
+    echo "Cleaning up background processes..."
+    # Kill processes in reverse order of startup. The '|| true' prevents errors if a process is already dead.
+    if [ -n "$FLUXBOX_PID" ]; then kill -TERM $FLUXBOX_PID 2>/dev/null || true; fi
+    if [ -n "$VNC_PID" ]; then kill -TERM $VNC_PID 2>/dev/null || true; fi
+    echo "Cleanup complete."
 }
-# Register the cleanup function to run on script exit
 trap cleanup EXIT
-# Check if X lock file exists and remove it (in case of previous unclean shutdown)
-if [ -e "/tmp/.X99-lock" ]; then
-    echo "Removing existing X lock file"
-    rm -f /tmp/.X99-lock
-fi
-# Start Xvfb with display :99
-echo "Starting Xvfb on display :99"
-Xvfb :99 -screen 0 1280x1024x24 -ac &
-XVFB_PID=$!
-# Wait a moment for Xvfb to initialize
-sleep 2
-# Check if Xvfb started successfully
-if ! ps -p $XVFB_PID > /dev/null; then
-    echo "Failed to start Xvfb"
-    exit 1
-fi
-# Export the DISPLAY variable for the browser
-export DISPLAY=:99
-echo "Xvfb started successfully with PID: $XVFB_PID"
-echo "DISPLAY set to: $DISPLAY"
-# Start the Camoufox server with all arguments passed to this script
-echo "Starting Camoufox server with arguments:"
-printf " Arg: '%s'\n" "$@" # Print each argument quoted on a new line
-echo "Executing: python3 camoufox_server.py $@"
-python3 camoufox_server.py "$@"
+# Xvfb is now started by xvfb-run in the Dockerfile ENTRYPOINT.
+# The DISPLAY variable will be set automatically by xvfb-run.
+# It's safer to source conda.sh directly
+source /opt/conda/etc/profile.d/conda.sh
+conda activate camo
+# Start supporting services (VNC, window manager)
+echo "Starting VNC server on port 5900..."
+# The -noxdamage flag is added to improve compatibility with VNC clients like the one on macOS.
+# The '-localhost no' part was likely a typo and has been removed as the default is to allow non-localhost connections.
+x11vnc -forever -usepw -display $DISPLAY -rfbport 5900 -o /var/log/x11vnc.log -shared -noxdamage &
+VNC_PID=$!
+echo "Starting Fluxbox window manager..."
+fluxbox > /var/log/fluxbox.log 2>&1 &
+FLUXBOX_PID=$!
+# Start main application
+echo "Starting Camoufox server with arguments: $@"
+exec python3 camoufox_server.py "$@"

dags/README.ru.md Normal file (+46 lines)
View File

@@ -0,0 +1,46 @@
# Architecture and Description of the YTDLP Airflow DAGs

This document describes the architecture and purpose of the DAGs used to download videos from YouTube. The system follows a "Sensor/Worker" pattern to keep processing continuous and parallel.

## Main processing loop

### `ytdlp_sensor_redis_queue` (Sensor)
- **Purpose:** Pulls URLs to download from the Redis queue and launches workers to process them.
- **How it works (triggered runs):**
  - **On trigger:** When a `ytdlp_worker_per_url` worker completes successfully, it immediately triggers the sensor. This keeps processing continuous with no idle gaps. Scheduled runs are disabled to avoid re-launching tasks for blocked accounts.
  - **Logic:** Fetches a batch of URLs from Redis (the `_inbox` list), as sketched below. If the queue is empty, the DAG finishes successfully until the next triggered run.
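
A minimal sketch (not part of this commit) of what the sensor's batch pull from the `_inbox` list could look like; the `video_queue` base name and the batch size are illustrative assumptions:

```python
# Hypothetical sketch of the sensor's queue check, not the actual DAG code.
import redis

def pull_url_batch(redis_client: redis.Redis,
                   queue_name: str = "video_queue",
                   batch_size: int = 5) -> list[str]:
    """Pop up to `batch_size` URLs from the <queue_name>_inbox list (FIFO)."""
    inbox = f"{queue_name}_inbox"
    urls = []
    for _ in range(batch_size):
        url = redis_client.lpop(inbox)  # returns None when the queue is empty
        if url is None:
            break
        urls.append(url if isinstance(url, str) else url.decode())
    return urls
```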
### `ytdlp_worker_per_url` (Worker)
- **Purpose:** Processes a single URL, downloads the video, and keeps the loop going.
- **How it works:**
  - Receives one URL from the sensor.
  - Calls the `ytdlp-ops-auth` service to obtain `info.json` and a `socks5` proxy.
  - Downloads the video using the obtained data. (TODO: replace invoking `yt-dlp` as a shell command with a library call.)
  - Depending on the outcome (success/failure), writes the result to the corresponding Redis hash (`_result` or `_fail`), as sketched below.
  - On success, re-triggers the `ytdlp_sensor_redis_queue` sensor to continue the processing loop. On failure, the loop stops for manual diagnosis.
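
A minimal sketch (not part of this commit) of the worker's result handling described above; the hash layout and field names are illustrative assumptions:

```python
# Hypothetical sketch of the worker's result handling, not the actual DAG code.
import json
import time
import redis

def record_result(redis_client: redis.Redis, url: str, ok: bool, payload: dict,
                  queue_name: str = "video_queue") -> None:
    """Write the outcome to <queue_name>_result on success or <queue_name>_fail on failure."""
    target = f"{queue_name}_result" if ok else f"{queue_name}_fail"
    redis_client.hset(target, url, json.dumps({"timestamp": int(time.time()), **payload}))
    # On success the real worker re-triggers ytdlp_sensor_redis_queue
    # (e.g. via TriggerDagRunOperator) to keep the loop going.
```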
## Management DAGs

These DAGs are intended for manual queue management and do not take part in the automatic loop.

- **`ytdlp_mgmt_queue_add_and_verify`**: Adds URLs to the task queue (`_inbox`) and then checks the status of that queue.
- **`ytdlp_mgmt_queues_check_status`**: Shows the state and contents of all key queues (`_inbox`, `_progress`, `_result`, `_fail`). Useful for tracking processing progress.
- **`ytdlp_mgmt_queue_clear`**: Clears (completely deletes) the specified Redis queue. **Use with caution**: the operation is irreversible.

## External services

### `ytdlp-ops-auth` (Thrift Service)
- **Purpose:** An external service that provides authentication data (tokens, cookies, proxy) for downloading videos.
- **Interaction:** The worker DAG (`ytdlp_worker_per_url`) calls this service before starting a download to obtain the data `yt-dlp` needs (see the sketch below).
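
For illustration, a hedged sketch of the worker's call to `ytdlp-ops-auth`, modeled on the Thrift client code elsewhere in this commit; the host, port, and account values are placeholders:

```python
# Sketch of the token request made before a download; not the actual worker code.
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.common.ttypes import TokenUpdateMode

def fetch_token(host: str, port: int, account_id: str, url: str):
    socket_conn = TSocket.TSocket(host, port)
    transport = TTransport.TFramedTransport(socket_conn)
    client = YTTokenOpService.Client(TBinaryProtocol.TBinaryProtocol(transport))
    transport.open()
    try:
        # Returns token data carrying ytdlpCommand, infoJson and a SOCKS5 proxy.
        return client.getOrRefreshToken(accountId=account_id,
                                        updateType=TokenUpdateMode.AUTO,
                                        url=url)
    finally:
        transport.close()
```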
## TODO (planned improvements)

- **Implement a "Circuit Breaker" mechanism:**
  - **Problem:** If a worker fails (for example, because an account is banned), a sensor launched on a schedule keeps creating new tasks for the same account, making the problem worse.
  - **Solution** (see the sketch after this list):
    1. **Worker (`ytdlp_worker_per_url`):** On task failure, the worker should set a temporary block flag in Redis for its `account_id` (for example, for 5-10 minutes).
    2. **Sensor (`ytdlp_sensor_redis_queue`):** Before checking the queue, the sensor should check for a block flag for its `account_id`. If the account is blocked, the sensor should skip the run, preventing new workers from being launched for the problematic account.
  - **Result:** This prevents repeated requests to a blocked account and gives the system time to recover.
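
A minimal sketch of the proposed circuit breaker (not implemented in this commit); the key name and TTL are illustrative assumptions:

```python
# Hypothetical circuit-breaker helpers shared by worker and sensor.
import redis

BLOCK_KEY = "ytdlp:blocked:{account_id}"  # assumed key layout

def block_account(r: redis.Redis, account_id: str, ttl_seconds: int = 600) -> None:
    """Worker side: set a temporary block flag after a failure."""
    r.set(BLOCK_KEY.format(account_id=account_id), "1", ex=ttl_seconds)

def is_blocked(r: redis.Redis, account_id: str) -> bool:
    """Sensor side: skip the run if the account is temporarily blocked."""
    return r.exists(BLOCK_KEY.format(account_id=account_id)) > 0
```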

View File

@@ -1,941 +0,0 @@
from airflow import DAG
from airflow.models import BaseOperator, Variable
from airflow.utils.decorators import apply_defaults
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from airflow.utils.dates import days_ago
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from thrift.transport.TTransport import TTransportException
from datetime import datetime, timedelta
from pangramia.yt.exceptions.ttypes import PBServiceException
import redis
import logging
import time
import socket
import json
import os
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.common.ttypes import TokenUpdateMode
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.operators.python import PythonOperator
from airflow.models.param import Param
# Assuming ytdlp_utils exists in the same directory or PYTHONPATH
# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id
# Configure logging
logger = logging.getLogger(__name__)
# Default settings (similar to ytdlp_client_dag.py)
MAX_RETRIES = 1
RETRY_DELAY = timedelta(seconds=10)
DEFAULT_TIMEOUT = 30
class YtdlpOpsOperator(BaseOperator):
"""
Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections
and Redis-based discovery, retrieves tokens, saves info.json, and manages errors.
"""
template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')
@apply_defaults
def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10,
service_ip=None, service_port=None, redis_enabled=False, account_id=None,
save_info_json=True, info_json_dir=None, get_socks_proxy=True,
store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
super().__init__(*args, **kwargs)
logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, "
f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, "
f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, "
f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}")
# Validate required parameters
if not url:
raise ValueError("url is required")
# Validate parameters based on connection mode
if redis_enabled:
if not account_id:
raise ValueError("account_id is required when redis_enabled=True")
# Use default Redis connection if not specified
if not redis_conn_id:
redis_conn_id = 'redis_default'
logger.info(f"Using default Redis connection ID: {redis_conn_id}")
else:
if not service_ip or not service_port:
raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False")
if not account_id:
logger.warning("No account_id provided for direct connection mode. Using 'default'")
account_id = 'default' # Assign default if missing in direct mode
self.url = url
self.redis_conn_id = redis_conn_id
self.max_retries = max_retries
self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay)
self.service_ip = service_ip
self.service_port = service_port
self.redis_enabled = redis_enabled
self.account_id = account_id
self.save_info_json = save_info_json
self.info_json_dir = info_json_dir
self.get_socks_proxy = get_socks_proxy
self.store_socks_proxy = store_socks_proxy
self.timeout = timeout
def execute(self, context):
logger.info("Executing YtdlpOpsOperator")
transport = None
try:
logger.info("Getting task parameters")
params = context.get('params', {})
redis_enabled = params.get('redis_enabled', self.redis_enabled)
logger.info(f"Using redis_enabled={redis_enabled} (from {'task params' if 'redis_enabled' in params else 'operator init'})")
# Determine account_id to use (from params or operator default)
account_id = context['params'].get('account_id', self.account_id)
logger.info(f"Using account_id='{account_id}' (from {'task params' if 'account_id' in params else 'operator init'})")
if redis_enabled:
# Get Redis connection with proper authentication and error handling
redis_conn = BaseHook.get_connection(self.redis_conn_id)
redis_client = redis.Redis(
host=redis_conn.host,
port=redis_conn.port,
password=redis_conn.password,
db=0,
decode_responses=True # Important for consistent key handling
)
# Test Redis connection
try:
if not redis_client.ping():
raise redis.exceptions.ConnectionError("Redis ping failed")
logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}")
except redis.exceptions.AuthenticationError:
logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. Check password.")
raise AirflowException("Redis authentication failed.")
except redis.exceptions.ConnectionError as e:
logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}")
raise AirflowException(f"Redis connection failed: {e}")
except Exception as e:
logger.error(f"Unexpected Redis error: {str(e)}")
raise AirflowException(f"Unexpected Redis error: {e}")
# Get service details from Redis with retries and proper key handling
service_key = f"ytdlp:{account_id}"
legacy_key = account_id # For backward compatibility
host = None
port = None
for attempt in range(self.max_retries):
try:
logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
service_details = redis_client.hgetall(service_key)
if not service_details:
logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
service_details = redis_client.hgetall(legacy_key)
if not service_details:
raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")
# Find IP and port, handling potential case differences and byte/string types
ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
port_key = next((k for k in service_details if k.lower() == 'port'), None)
if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")
host = service_details[ip_key] # Already decoded due to decode_responses=True
port_str = service_details[port_key]
try:
port = int(port_str)
except ValueError:
raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")
logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
break # Success
except Exception as e:
logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
if attempt == self.max_retries - 1:
logger.error("Max retries reached for fetching Redis details.")
raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}")
logger.info(f"Retrying in {self.retry_delay} seconds...")
time.sleep(self.retry_delay)
else:
# Direct connection: Log parameter sources
params = context.get('params', {})
host = params.get('service_ip', self.service_ip)
host_source = 'task params' if 'service_ip' in params else 'operator init'
port_str = params.get('service_port', self.service_port)
port_source = 'task params' if 'service_port' in params else 'operator init'
url = params.get('url', self.url)
url_source = 'task params' if 'url' in params else 'operator init'
logger.info(f"Using service_ip={host} (from {host_source})")
logger.info(f"Using service_port={port_str} (from {port_source})")
logger.info(f"Using url={url} (from {url_source})")
if not host or not port_str:
raise ValueError("Direct connection requires service_ip and service_port")
try:
port = int(port_str)
except ValueError:
raise ValueError(f"Invalid service_port value: {port_str}")
logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")
# Render and validate timeout
timeout_param = context.get('params', {}).get('timeout', self.timeout)
if isinstance(self.timeout, str) and '{{' in self.timeout:
timeout_rendered = self.render_template(self.timeout, context)
logger.info(f"Rendered timeout template: '{self.timeout}' -> '{timeout_rendered}'")
timeout_param = timeout_rendered
try:
timeout = int(timeout_param)
if timeout <= 0: raise ValueError("Timeout must be positive")
logger.info(f"Using timeout: {timeout} seconds")
except (ValueError, TypeError):
logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
timeout = DEFAULT_TIMEOUT
# Create Thrift connection objects
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4)
socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds
transport = TTransport.TFramedTransport(socket_conn)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
try:
transport.open()
logger.info("Successfully connected to Thrift server.")
# Test connection with ping
try:
client.ping()
logger.info("Server ping successful.")
except Exception as e:
logger.error(f"Server ping failed: {e}")
raise AirflowException(f"Server connection test (ping) failed: {e}")
# Get token from service with specific error handling
try:
url_param = context.get('params', {}).get('url', self.url)
logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
token_data = client.getOrRefreshToken(
accountId=account_id,
updateType=TokenUpdateMode.AUTO,
url=url_param
)
logger.info("Successfully retrieved token data from service.")
except PBServiceException as e:
logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}")
error_code = getattr(e, 'errorCode', None)
error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
# Handle specific known error codes
if error_code in [
"SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT",
"SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT",
"SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
]:
error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
elif error_code == "BOT_DETECTION":
error_msg = f"Bot detection triggered ({error_code}): {e.message}."
suggestions = getattr(e, 'context', {}).get('suggestions', [])
if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
elif error_code == "NODEJS_SCRIPT_ERROR":
error_msg = f"Node.js script error ({error_code}): {e.message}."
elif error_code == "NODEJS_TIMEOUT":
error_msg = f"Node.js timeout ({error_code}): {e.message}."
# Add more specific error handling as needed
raise AirflowException(error_msg)
except TTransportException as e:
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
raise AirflowException(f"Transport error during API call: {e}")
except Exception as e:
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
raise AirflowException(f"Unexpected error during API call: {e}")
except TTransportException as e:
# Handle connection-specific transport errors
if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
logger.error(f"Connection failed to {host}:{port}. Details: {e}")
logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
else:
logger.error(f"Thrift transport error during connection: {str(e)}")
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
except Exception as e:
logger.error(f"Unexpected error during connection or ping: {str(e)}")
raise # Re-raise other unexpected errors
# Log received token data attributes for debugging
logger.debug(f"Token data received. Attributes: {dir(token_data)}")
for attr in dir(token_data):
if not attr.startswith('__') and not callable(getattr(token_data, attr)): # Log non-callable attributes
value = getattr(token_data, attr)
if attr == 'infoJson' and value:
logger.debug(f"infoJson: {value[:50]}...")
else:
logger.debug(f"{attr}: {value}")
info_json_path = None # Initialize info_json_path
save_info_json_param = context['params'].get('save_info_json', self.save_info_json)
# Render if it's a string template
if isinstance(save_info_json_param, str):
save_info_json_rendered = self.render_template(save_info_json_param, context)
# Convert common string representations to boolean
save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
else:
save_info_json = bool(save_info_json_param)
# Save info.json if requested and valid
if self.save_info_json:
info_json = self._get_info_json(token_data)
if info_json and self._is_valid_json(info_json):
try:
# Use internal _save_info_json method which handles rendering, dir creation, logging
info_json_path = self._save_info_json(context, info_json)
if info_json_path: # Check if saving was successful
context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
else:
# _save_info_json should log errors, push None to indicate failure
context['task_instance'].xcom_push(key='info_json_path', value=None)
logger.warning("info.json saving failed (check logs from _save_info_json), pushing None to XCom for info_json_path.")
except Exception as e:
logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
context['task_instance'].xcom_push(key='info_json_path', value=None) # Push None on error
elif info_json:
logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
else:
logger.info("No infoJson found in token data. Skipping save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
else:
logger.info("save_info_json is False. Skipping info.json save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
# Extract and potentially store SOCKS proxy
socks_proxy = None
if self.get_socks_proxy: # Use instance attribute
# Check for common attribute names for proxy
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
if proxy_attr:
socks_proxy = getattr(token_data, proxy_attr)
if socks_proxy: # Ensure proxy value is not empty
logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
logger.info(f"Pushed key 'socks_proxy' to XCom with value: {socks_proxy}")
else:
logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")
else:
logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty. No proxy extracted.")
# Push None even if found but empty, if storing is enabled
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
else:
logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
# Push None if storing is enabled but attribute not found
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
else:
logger.info("get_socks_proxy is False. Skipping proxy extraction.")
# Push None if storing is enabled but extraction was skipped
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as get_socks_proxy=False.")
# Get the original command from the server
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
if not ytdlp_cmd:
logger.error("No 'ytdlpCommand' attribute found in token data.")
raise AirflowException("Required 'ytdlpCommand' not received from service.")
logger.info(f"Original command received from server: {ytdlp_cmd}")
# Log example usage command (DO NOT MODIFY the original command here)
if info_json_path:
# Use double quotes for paths/proxy in example for robustness
example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
if socks_proxy:
example_cmd += f" --proxy \"{socks_proxy}\""
example_cmd += " --verbose --simulate" # Add useful flags for testing
logger.info(f"\n--- Example usage with saved info.json ---")
logger.info(example_cmd)
logger.info(f"(Note: The actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
logger.info(f"(You can also use 'latest.json': {latest_json_path})")
logger.info(f"-------------------------------------------\n")
else:
logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
if socks_proxy:
logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.")
logger.info("Add --verbose and --simulate flags for testing the command.")
logger.info(f"-------------------------------------------------------\n")
# Push the *original* command to XCom
context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
logger.info(f"Pushed original command to XCom key 'ytdlp_command'.")
# Note: Returning ytdlp_cmd below implicitly pushes the same value
# to XCom under the key 'return_value'. Downstream tasks should
# preferably use the explicitly pushed 'ytdlp_command' key for clarity.
return ytdlp_cmd # Return the original command
except AirflowException as e: # Catch AirflowExceptions raised explicitly in the code above
logger.error(f"Operation failed due to AirflowException: {e}")
raise # Re-raise AirflowExceptions to ensure task failure
except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already wrapped
logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True) # Add traceback for context
raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException
except Exception as e: # General catch-all for truly unexpected errors
# Log with traceback for unexpected errors
logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True)
# Ensure any unexpected error explicitly fails the task with AirflowException
raise AirflowException(f"Unexpected error caused task failure: {e}")
finally:
if transport and transport.isOpen(): # Check if transport exists and is open before closing
logger.info("Closing Thrift transport.")
transport.close()
# --- Helper Methods ---
def _get_info_json(self, token_data):
"""Safely extracts infoJson from token data."""
info_json = getattr(token_data, 'infoJson', None)
if info_json:
logger.debug("Extracted infoJson from token data.")
else:
logger.debug("No infoJson attribute found in token data.")
return info_json
def _is_valid_json(self, json_str):
"""Checks if a string is valid JSON."""
if not json_str or not isinstance(json_str, str):
logger.debug("Input is not a non-empty string, considered invalid JSON.")
return False
try:
json.loads(json_str)
logger.debug("JSON string validation successful.")
return True
except json.JSONDecodeError as e:
logger.warning(f"JSON validation failed: {e}")
return False
def _save_info_json(self, context, info_json):
"""Saves info_json to a file, handling directory creation and logging. Returns the path on success, None on failure."""
try:
# Get URL from params/context for video ID extraction
url_param = context.get('params', {}).get('url', self.url)
video_id = self._extract_video_id(url_param) # Use internal helper
# Render the info_json_dir template
save_dir_template = self.info_json_dir or "." # Default to current dir if template is None or empty string
save_dir = self.render_template(save_dir_template, context)
if not save_dir: # Handle case where template renders to empty string
logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in an empty path. Defaulting to '.'")
save_dir = "."
logger.info(f"Target directory for info.json (rendered): {save_dir}")
# Ensure directory exists
try:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"Ensured directory exists: {save_dir}")
except OSError as e:
logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
return None # Indicate failure
# Construct filename (using potentially overridden account_id)
account_id_param = context.get('params', {}).get('account_id', self.account_id)
timestamp = int(time.time())
base_filename = f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id else f"info_{account_id_param}_{timestamp}.json"
info_json_path = os.path.join(save_dir, base_filename)
latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy
# Write to timestamped file
try:
logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
except IOError as e:
logger.error(f"Failed to write info.json to {info_json_path}: {e}")
return None # Indicate failure
# Write to latest.json (overwrite) - best effort
try:
with open(latest_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Updated latest.json file: {latest_json_path}")
except IOError as e:
# Log warning but don't fail the whole save if only latest.json fails
logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")
return info_json_path # Return path on success (even if latest.json failed)
except Exception as e:
logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
return None # Indicate failure
def _extract_video_id(self, url):
"""Extracts YouTube video ID from URL (internal helper)."""
if not url or not isinstance(url, str):
logger.debug("URL is empty or not a string, cannot extract video ID.")
return None
try:
# Basic extraction logic (can be enhanced for more URL types)
video_id = None
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0]
# Ensure it looks like a video ID (typically 11 chars, but can vary)
if video_id and len(video_id) >= 11:
video_id = video_id[:11] # Take first 11 chars as standard ID length
logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
return video_id
else:
logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
return None
except Exception as e:
logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
return None
# =============================================================================
# Python Callables for Tasks
# =============================================================================
def display_token_info(**context):
"""Displays token info from XCom, parses info.json, and logs example commands."""
ti = context['task_instance']
logger.info("Starting display_token_info task.")
# Pull data from XCom (provide default values)
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
logger.info("\n=== Pulled Token Information from XCom ===")
logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}")
logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}")
logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}")
result = {
'info_path': info_json_path,
'proxy': socks_proxy,
'ytdlp_command': ytdlp_command,
'video_info': None,
'commands': {},
'error': None
}
if info_json_path and os.path.exists(info_json_path):
logger.info(f"\n=== Processing Video Information from: {info_json_path} ===")
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
info = json.load(f)
# Extract and log basic video info safely
title = info.get('title', 'Unknown Title')
uploader = info.get('uploader', 'Unknown Author')
duration = info.get('duration_string', 'Unknown Length')
upload_date_str = info.get('upload_date') # Format: YYYYMMDD
upload_date_formatted = 'Unknown Date'
if upload_date_str:
try:
# Validate format before parsing
if len(upload_date_str) == 8 and upload_date_str.isdigit():
upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d')
else:
logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.")
except ValueError:
logger.warning(f"Could not parse upload_date '{upload_date_str}'")
result['video_info'] = {
'title': title,
'uploader': uploader,
'upload_date': upload_date_formatted, # Store formatted date
'duration': duration
}
logger.info(f"Title: {title}")
logger.info(f"Author: {uploader}")
logger.info(f"Date: {upload_date_formatted}")
logger.info(f"Length: {duration}")
logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===")
base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
if socks_proxy:
base_cmd += f" --proxy \"{socks_proxy}\""
# Command to list formats
format_cmd = f"{base_cmd} -F"
result['commands']['format'] = format_cmd
logger.info(f"List formats command: {format_cmd}")
# Execute and log the format listing command
logger.info("\n--- Executing Format List Command ---")
try:
# Use os.popen for simplicity, capture output
logger.info(f"Running: {format_cmd}")
format_output = os.popen(format_cmd).read()
logger.info("--- Format List Output ---")
logger.info(format_output)
logger.info("--------------------------")
except Exception as e:
logger.error(f"Error executing format command: {e}")
# Command to simulate download
simulate_cmd = f"{base_cmd} --simulate --verbose" # Add verbose for more info
result['commands']['simulate'] = simulate_cmd
logger.info(f"Simulate download command: {simulate_cmd}")
# Execute and log the simulation command
logger.info("\n--- Executing Simulation Command ---")
try:
logger.info(f"Running: {simulate_cmd}")
simulate_output = os.popen(simulate_cmd).read()
logger.info("--- Simulation Output ---")
logger.info(simulate_output)
logger.info("-------------------------")
except Exception as e:
logger.error(f"Error executing simulation command: {e}")
# Basic download command
download_cmd = base_cmd
result['commands']['download_base'] = download_cmd
logger.info(f"Base download command (add format selection, output path): {download_cmd}")
# Push generated example commands to XCom for potential downstream use
# ti.xcom_push(key='format_cmd', value=format_cmd) # Removed as requested
# ti.xcom_push(key='simulate_cmd', value=simulate_cmd) # Removed as requested
ti.xcom_push(key='download_cmd', value=download_cmd)
logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}")
except json.JSONDecodeError as e:
error_msg = f"Failed to parse info.json file '{info_json_path}': {e}"
logger.error(error_msg)
result['error'] = error_msg
except FileNotFoundError:
error_msg = f"Info.json file not found at path: {info_json_path}"
logger.error(error_msg)
result['error'] = error_msg
except Exception as e:
error_msg = f"Error processing info.json file '{info_json_path}': {str(e)}"
logger.error(error_msg, exc_info=True)
result['error'] = error_msg
elif info_json_path:
error_msg = f"Info.json path provided ('{info_json_path}') but file does not exist."
logger.warning(error_msg)
result['error'] = error_msg
else:
logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.")
result['error'] = "Info.json path not available."
logger.info("Finished display_token_info task.")
# Return the collected information (useful if used as a PythonOperator return value)
return json.dumps(result) # Return as JSON string for XCom compatibility if needed
def store_token_info(**context):
"""Stores retrieved token information (command, proxy, info.json) in Redis."""
ti = context['task_instance']
# Use the redis_conn_id defined in the operator/DAG params if possible, else default
redis_conn_id = context['params'].get('redis_conn_id', 'redis_default')
redis_hook = RedisHook(redis_conn_id=redis_conn_id)
logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.")
try:
# Pull necessary data from XCom and context
url = context['params'].get('url')
if not url:
# Attempt to get URL from DAG run conf as fallback
url = context.get('dag_run', {}).conf.get('url')
if not url:
raise ValueError("URL parameter is missing in context['params'] and dag_run.conf")
logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.")
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or '' # Default to empty string if None
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
if not ytdlp_command:
logger.warning("ytdlp_command not found in XCom. Storing empty value.")
ytdlp_command = '' # Store empty if not found
# Construct the base command using info.json
ytdlp_command_base = ''
if info_json_path and os.path.exists(info_json_path):
ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\""
logger.info(f"Constructed base command: {ytdlp_command_base}")
else:
logger.warning("Cannot construct base command: info_json_path not valid.")
# Construct the command with tokens and proxy
ytdlp_command_tokens = ytdlp_command # Start with original command from server
if socks_proxy:
ytdlp_command_tokens += f" --proxy \"{socks_proxy}\""
logger.info("Appended proxy to token command.")
data_to_store = {
'url': url,
'ytdlp_command': ytdlp_command_base, # Store the base command
'proxy': socks_proxy,
'info_json_path': info_json_path or '' # Store path even if None/empty
# 'info_json' will be added below
}
# Read info.json content if path exists
info_json_content = None
if info_json_path and os.path.exists(info_json_path):
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
# Read and immediately validate JSON structure before storing
info_json_content = json.load(f)
# Store the validated JSON as a string
data_to_store['info_json'] = json.dumps(info_json_content)
logger.info(f"Read and validated info.json content from: {info_json_path}")
except json.JSONDecodeError as e:
logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.")
data_to_store['info_json'] = '' # Store empty string on parse error
except Exception as e:
logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.")
data_to_store['info_json'] = '' # Store empty string on other read errors
else:
logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.")
data_to_store['info_json'] = '' # Store empty string if no path
# Determine Redis key using video ID
# Use the same helper method as the operator for consistency
# Need an instance or static method call. Let's make _extract_video_id static temporarily
# Or instantiate the operator just for this - less ideal.
# Simplest: Re-implement or assume utils.
# Re-implementing basic logic here for simplicity:
video_id = None
try:
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0][:11]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0][:11]
except Exception:
pass # Ignore errors in ID extraction for key generation
redis_key = f"token_info:{video_id or 'unknown'}"
logger.info(f"Determined Redis key: {redis_key}")
# Store data in Redis hash
# Log presence/absence rather than full content for potentially large fields
logger.info(f"Data to store in Redis key '{redis_key}': "
f"URL='{data_to_store['url']}', "
f"Command={'<present>' if data_to_store['ytdlp_command'] else '<empty>'}, "
f"Proxy='{data_to_store['proxy'] or '<empty>'}', "
f"Path='{data_to_store['info_json_path'] or '<empty>'}', "
f"JSON Content={'<present>' if data_to_store.get('info_json') else '<empty>'}")
with redis_hook.get_conn() as redis_client:
# Extract video ID from URL
video_id = None
try:
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0][:11]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0][:11]
except Exception:
pass # Ignore errors in ID extraction for key generation
# Use video ID as part of the Redis key
redis_key = f"token_info:{video_id or 'unknown'}"
logger.info(f"Determined Redis key: {redis_key}")
# Store data in Redis hash
# Add video_id, timestamp, and the constructed ytdlp_command_tokens
data_to_store['video_id'] = video_id or 'unknown'
data_to_store['timestamp'] = int(time.time())
data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens # Store the original token command
# Log fields being stored
log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
logger.info(f"Storing in Redis key '{redis_key}': {log_data}")
redis_client.hset(redis_key, mapping=data_to_store)
# Set expiration (e.g., 24 hours = 86400 seconds)
redis_client.expire(redis_key, 86400)
logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.")
# Log the final stored data again for clarity
final_log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
logger.info(f"--- Final Data Stored in Redis Key '{redis_key}' ---")
logger.info(final_log_data)
logger.info("----------------------------------------------------")
except Exception as e:
logger.error(f"Failed to store token info in Redis: {e}", exc_info=True)
# Re-raise as AirflowException to fail the task
raise AirflowException(f"Failed to store token info in Redis: {e}")
logger.info("Finished store_token_info task.")
# =============================================================================
# DAG Definition
# =============================================================================
# Update default_args to match ytdlp_client_dag.py structure
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False, # Match reference DAG
'email_on_retry': False, # Match reference DAG
'retries': 1, # Default task retries
'retry_delay': timedelta(minutes=5), # Standard task retry delay
'start_date': days_ago(1) # Best practice start date
}
# Update DAG definition
with DAG(
dag_id='ytdlp_client_dag_v2.1',
default_args=default_args,
schedule_interval=None, # Manually triggered DAG
catchup=False, # Don't run for past missed schedules
description='DAG for YTDLP operations using Thrift client (V2 - Refactored)', # Updated description
tags=['ytdlp', 'thrift', 'client', 'v2'], # Updated tags for better filtering
params={
# Define DAG parameters with defaults and types for UI clarity
'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"], description="Required: The video URL to process."), # Default URL
'redis_enabled': Param(False, type="boolean", description="Use Redis for service discovery? If False, uses service_ip/port."), # Default to direct connection
'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."), # Default service IP
'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."), # Default service port
'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string", description="Account ID for Redis lookup or direct call."), # Updated default account_id
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
# Use Airflow Variable for downloads directory, matching reference DAG structure
'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.")
}
) as dag:
# Define Tasks
get_token = YtdlpOpsOperator(
task_id='get_token',
# Pass templated parameters from DAG run config
url="{{ params.url }}",
redis_enabled="{{ params.redis_enabled }}",
service_ip="{{ params.service_ip }}",
service_port="{{ params.service_port }}",
account_id="{{ params.account_id }}",
save_info_json=True,
info_json_dir="{{ params.info_json_dir }}",
get_socks_proxy=True,
store_socks_proxy=True,
timeout="{{ params.timeout }}",
retries=MAX_RETRIES, # Operator-specific retries if needed, else use DAG default
retry_delay=RETRY_DELAY, # Operator-specific delay if needed
# Add callbacks for logging success/failure, similar to reference DAG
on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."),
on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.")
)
# Add task documentation (visible in Airflow UI)
get_token.doc_md = """
### Get Token Task
Connects to the YTDLP Thrift service (either directly or via Redis discovery)
to retrieve an authentication token and video metadata (info.json).
**Pushes to XCom:**
- `info_json_path`: Path to the saved info.json file (or None if not saved/failed).
- `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found).
- `ytdlp_command`: The original command string received from the server (contains tokens/cookies).
- Uses parameters defined in the DAG run configuration.
"""
# Optional: Add a task to explicitly check XComs for debugging (like in reference DAG)
def _check_xcom_callable(**context):
"""Logs XCom values pushed by the get_token task."""
ti = context['task_instance']
logger.info("--- Checking XCom values pushed by get_token ---")
keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command']
xcom_values = {}
for key in keys_to_check:
value = ti.xcom_pull(task_ids='get_token', key=key)
xcom_values[key] = value
# Avoid logging potentially sensitive command details fully in production
if key == 'ytdlp_command' and value:
log_value = f"{value[:50]}..." # Log truncated command
else:
log_value = value
logger.info(f"XCom key='{key}': {log_value}")
logger.info("----------------------------------------------")
return xcom_values # Return values for potential future use
check_xcom_task = PythonOperator(
task_id='check_xcom_after_get_token',
python_callable=_check_xcom_callable,
)
check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes."
display_info = PythonOperator(
task_id='display_token_info',
python_callable=display_token_info,
trigger_rule='all_success'
)
display_info.doc_md = """
### Display Token Info Task
Pulls information from XCom, parses the `info.json` file (if available),
logs video details, and generates example `yt-dlp` commands.
**Pulls from XCom (task_id='get_token'):**
- `info_json_path`
- `socks_proxy`
- `ytdlp_command`
**Pushes to XCom:**
- `download_cmd`: Base command using `--load-info-json` (user needs to add format/output).
"""
store_info = PythonOperator(
task_id='store_token_info', # Use consistent task ID naming
python_callable=store_token_info,
)
store_info.doc_md = """
### Store Token Info Task
Pulls information from XCom and DAG parameters, reads the `info.json` content,
and stores relevant data in a Redis hash.
**Pulls from XCom (task_id='get_token'):**
- `ytdlp_command`
- `socks_proxy`
- `info_json_path`
**Pulls from DAG context:**
- `params['url']` (or `dag_run.conf['url']`)
**Stores in Redis Hash (key: `token_info:<video_id>`):**
- `url`: The video URL.
- `ytdlp_command`: Base command using `--load-info-json`.
- `proxy`: The SOCKS proxy string.
- `info_json_path`: Path to the saved info.json file.
- `info_json`: The full content of the info.json file (as a JSON string).
- `video_id`: Extracted video ID.
- `timestamp`: Unix timestamp of storage.
- `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies).
Sets a 24-hour expiration on the Redis key.
"""
# Define task dependencies matching the reference DAG structure
get_token >> check_xcom_task >> display_info >> store_info

dags/ytdlp_mgmt_proxy.py Normal file (+197 lines)
View File

@@ -0,0 +1,197 @@
"""
DAG to manage the state of proxies used by the ytdlp-ops-server.
"""
from __future__ import annotations
import logging
from datetime import datetime
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
# Configure logging
logger = logging.getLogger(__name__)
# Import and apply Thrift exceptions patch for Airflow compatibility
try:
from thrift_exceptions_patch import patch_thrift_exceptions
patch_thrift_exceptions()
logger.info("Applied Thrift exceptions patch for Airflow compatibility.")
except ImportError:
logger.warning("Could not import thrift_exceptions_patch. Compatibility may be affected.")
except Exception as e:
logger.error(f"Error applying Thrift exceptions patch: {e}")
# Thrift imports
try:
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
except ImportError as e:
logger.critical(f"Could not import Thrift modules: {e}. Ensure ytdlp-ops-auth package is installed.")
# Fail DAG parsing if thrift modules are not available
raise
def format_timestamp(ts_str: str) -> str:
"""Formats a string timestamp into a human-readable date string."""
if not ts_str:
return ""
try:
ts_float = float(ts_str)
if ts_float <= 0:
return ""
# Use datetime from the imported 'from datetime import datetime'
dt_obj = datetime.fromtimestamp(ts_float)
return dt_obj.strftime('%Y-%m-%d %H:%M:%S')
except (ValueError, TypeError):
return ts_str # Return original string if conversion fails
def get_thrift_client(host: str, port: int):
"""Helper function to create and connect a Thrift client."""
transport = TSocket.TSocket(host, port)
transport = TTransport.TFramedTransport(transport)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
transport.open()
logger.info(f"Connected to Thrift server at {host}:{port}")
return client, transport
def manage_proxies_callable(**context):
"""Main callable to interact with the proxy management endpoints."""
params = context["params"]
action = params["action"]
host = params["host"]
port = params["port"]
server_identity = params.get("server_identity")
proxy_url = params.get("proxy_url")
if not server_identity and action in ["ban", "unban", "reset_all"]:
raise ValueError(f"A 'server_identity' is required for the '{action}' action.")
client, transport = None, None
try:
client, transport = get_thrift_client(host, port)
if action == "list":
logger.info(f"Listing proxy statuses for server: {server_identity or 'ALL'}")
statuses = client.getProxyStatus(server_identity)
if not statuses:
logger.info("No proxy statuses found.")
print("No proxy statuses found.")
else:
from tabulate import tabulate
status_list = [
{
"Server": s.serverIdentity,
"Proxy URL": s.proxyUrl,
"Status": s.status,
"Success": s.successCount,
"Failures": s.failureCount,
"Last Success": format_timestamp(s.lastSuccessTimestamp),
"Last Failure": format_timestamp(s.lastFailureTimestamp),
}
for s in statuses
]
print("\n--- Proxy Statuses ---")
print(tabulate(status_list, headers="keys", tablefmt="grid"))
print("----------------------\n")
elif action == "ban":
if not proxy_url:
raise ValueError("A 'proxy_url' is required to ban a proxy.")
logger.info(f"Banning proxy '{proxy_url}' for server '{server_identity}'...")
success = client.banProxy(proxy_url, server_identity)
if success:
logger.info("Successfully banned proxy.")
print(f"Successfully banned proxy '{proxy_url}' for server '{server_identity}'.")
else:
logger.error("Failed to ban proxy.")
raise Exception("Server returned failure for banProxy operation.")
elif action == "unban":
if not proxy_url:
raise ValueError("A 'proxy_url' is required to unban a proxy.")
logger.info(f"Unbanning proxy '{proxy_url}' for server '{server_identity}'...")
success = client.unbanProxy(proxy_url, server_identity)
if success:
logger.info("Successfully unbanned proxy.")
print(f"Successfully unbanned proxy '{proxy_url}' for server '{server_identity}'.")
else:
logger.error("Failed to unban proxy.")
raise Exception("Server returned failure for unbanProxy operation.")
elif action == "reset_all":
logger.info(f"Resetting all proxy statuses for server '{server_identity}'...")
success = client.resetAllProxyStatuses(server_identity)
if success:
logger.info("Successfully reset all proxy statuses.")
print(f"Successfully reset all proxy statuses for server '{server_identity}'.")
else:
logger.error("Failed to reset all proxy statuses.")
raise Exception("Server returned failure for resetAllProxyStatuses operation.")
else:
raise ValueError(f"Invalid action: {action}")
except (PBServiceException, PBUserException) as e:
logger.error(f"Thrift error performing action '{action}': {e.message}", exc_info=True)
raise
except Exception as e:
logger.error(f"Error performing action '{action}': {e}", exc_info=True)
raise
finally:
if transport and transport.isOpen():
transport.close()
logger.info("Thrift connection closed.")
with DAG(
dag_id="ytdlp_mgmt_proxy",
start_date=days_ago(1),
schedule=None,
catchup=False,
tags=["ytdlp", "utility", "proxy"],
doc_md="""
### YT-DLP Proxy Manager DAG
This DAG provides tools to manage the state of proxies used by the `ytdlp-ops-server`.
You can view statuses and manually ban, unban, or reset proxies for a specific server instance.
**Parameters:**
- `host`: The hostname or IP of the `ytdlp-ops-server` Thrift service.
- `port`: The port of the Thrift service.
- `action`: The operation to perform.
- `list`: List proxy statuses. Provide a `server_identity` to query a specific server, or leave it blank to query the server instance you are connected to.
- `ban`: Ban a specific proxy. Requires `server_identity` and `proxy_url`.
- `unban`: Un-ban a specific proxy. Requires `server_identity` and `proxy_url`.
- `reset_all`: Reset all proxies for a server to `ACTIVE`. Requires `server_identity`.
- `server_identity`: The unique identifier for the server instance (e.g., `ytdlp-ops-airflow-service`).
- `proxy_url`: The full URL of the proxy to act upon (e.g., `socks5://host:port`).
""",
params={
"host": Param("89.253.221.173", type="string", description="The hostname of the ytdlp-ops-server service."),
"port": Param(9090, type="integer", description="The port of the ytdlp-ops-server service."),
"action": Param(
"list",
type="string",
enum=["list", "ban", "unban", "reset_all"],
description="The management action to perform.",
),
"server_identity": Param(
"ytdlp-ops-airflow-service",
type=["null", "string"],
description="The identity of the server to manage. Leave blank to query the connected server instance.",
),
"proxy_url": Param(
None,
type=["null", "string"],
description="The proxy URL to ban/unban (e.g., 'socks5://host:port').",
),
},
) as dag:
proxy_management_task = PythonOperator(
task_id="proxy_management_task",
python_callable=manage_proxies_callable,
)
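# --- Usage sketch (illustrative only, not part of the DAG) ---
# A minimal example of calling the same Thrift operations outside Airflow,
# reusing the get_thrift_client helper defined above. The host, port, proxy
# URL and server identity below are placeholder values.
if __name__ == "__main__":
    client, transport = get_thrift_client("127.0.0.1", 9090)
    try:
        banned = client.banProxy("socks5://127.0.0.1:1080", "ytdlp-ops-airflow-service")
        print(f"banProxy returned: {banned}")
    finally:
        if transport and transport.isOpen():
            transport.close()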

View File

@ -1,174 +0,0 @@
from airflow import DAG
from airflow.models.param import Param
from airflow.operators.python import PythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from airflow.exceptions import AirflowException
from datetime import timedelta
import logging
import redis # Import redis exceptions if needed
# Import utility functions
from utils.redis_utils import _get_redis_client
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_QUEUE_NAME = 'video_queue' # Default base name for the queue
DEFAULT_REDIS_CONN_ID = 'redis_default'
# --- Python Callables for Tasks ---
def add_urls_callable(**context):
"""Adds URLs from comma/newline separated input to the specified Redis inbox list."""
params = context['params']
redis_conn_id = params['redis_conn_id']
queue_name = params['queue_name']
inbox_queue = f"{queue_name}_inbox"
urls_input = params['urls']
if not urls_input or not isinstance(urls_input, str):
logger.warning("No URLs provided or 'urls' parameter is not a string. Nothing to add.")
return
# Process input: split by newline, then by comma, flatten, strip, and filter empty
urls_to_add = []
for line in urls_input.splitlines():
urls_to_add.extend(url.strip() for url in line.split(',') if url.strip())
# Remove duplicates while preserving order (optional, but good practice)
seen = set()
urls_to_add = [x for x in urls_to_add if not (x in seen or seen.add(x))]
if not urls_to_add:
logger.info("No valid URLs found after processing input. Nothing added.")
return
logger.info(f"Attempting to add {len(urls_to_add)} unique URLs to Redis list '{inbox_queue}' using connection '{redis_conn_id}'.")
try:
redis_client = _get_redis_client(redis_conn_id)
# Use rpush to add to the end of the list (FIFO behavior with lpop)
added_count = redis_client.rpush(inbox_queue, *urls_to_add)
logger.info(f"Successfully added {len(urls_to_add)} URLs to list '{inbox_queue}'. New list length: {added_count}.")
except Exception as e:
logger.error(f"Failed to add URLs to Redis list '{inbox_queue}': {e}", exc_info=True)
raise AirflowException(f"Failed to add URLs to Redis: {e}")
# Removed clear_queue_callable as this DAG focuses on adding and verifying
def check_status_callable(**context):
"""Checks the type and length/size of the specified Redis inbox key."""
# Access DAG run parameters directly from context['params']
dag_params = context['params']
redis_conn_id = dag_params['redis_conn_id']
# This DAG verifies the inbox queue, so we construct the name from the base name
queue_name = dag_params['queue_name']
queue_to_check = f"{queue_name}_inbox"
if not queue_name:
raise ValueError("DAG parameter 'queue_name' (base name) cannot be empty.")
logger.info(f"Attempting to check status of Redis key '{queue_to_check}' using connection '{redis_conn_id}'.")
try:
# Use the resolved redis_conn_id to get the client
redis_client = _get_redis_client(redis_conn_id)
# redis_client.type returns bytes (e.g., b'list', b'hash', b'none')
key_type_bytes = redis_client.type(queue_to_check)
key_type_str = key_type_bytes.decode('utf-8') # Decode to string
length = 0
if key_type_str == 'list':
length = redis_client.llen(queue_to_check)
logger.info(f"Redis list '{queue_to_check}' has {length} items.")
elif key_type_str == 'hash':
length = redis_client.hlen(queue_to_check)
logger.info(f"Redis hash '{queue_to_check}' has {length} fields.")
elif key_type_str == 'none': # Check against the decoded string 'none'
logger.info(f"Redis key '{queue_to_check}' does not exist.")
else:
# Attempt to get size for other types if possible, e.g., set size
try:
if key_type_str == 'set':
length = redis_client.scard(queue_to_check)
logger.info(f"Redis set '{queue_to_check}' has {length} members.")
# Add checks for other types like zset if needed
else:
logger.info(f"Redis key '{queue_to_check}' exists but is of unhandled type '{key_type_str}'. Cannot determine size.")
except Exception as size_error:
logger.warning(f"Could not determine size for Redis key '{queue_to_check}' (type: {key_type_str}): {size_error}")
logger.info(f"Redis key '{queue_to_check}' exists but is of unhandled/unsizeable type '{key_type_str}'.")
# Push results to XCom
context['task_instance'].xcom_push(key='queue_key_type', value=key_type_str)
context['task_instance'].xcom_push(key='queue_size', value=length)
# Return status info using the resolved queue_to_check
return {'key': queue_to_check, 'type': key_type_str, 'size': length}
except Exception as e:
# Log error using the resolved queue_to_check
logger.error(f"Failed to check status of Redis key '{queue_to_check}': {e}", exc_info=True)
raise AirflowException(f"Failed to check Redis key status: {e}")
# --- DAG Definition ---
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=1), # Slightly longer retry delay for management tasks
'start_date': days_ago(1)
}
# This DAG allows adding URLs and then checking the status of the target queue.
with DAG(
dag_id='ytdlp_mgmt_queue_add_and_verify', # Updated DAG ID
default_args=default_args,
schedule_interval=None, # Manually triggered
catchup=False,
description='Manually add URLs to a YTDLP inbox queue and verify the queue status.', # Updated description
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'add', 'verify'], # Updated tags
params={
# Common params
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
# Params for adding URLs (and checking the same queue)
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", title="Base Queue Name", description="Base name for the Redis queues (e.g., 'video_queue'). The DAG will add URLs to '{base_name}_inbox'."),
'urls': Param("", type="string", title="URLs to Add", description="Comma and/or newline separated list of video URLs.", multiline=True), # Updated description, keep multiline for UI
# Removed clear_queue_name param
# Removed check_queue_name param (will use queue_name)
}
) as dag:
add_urls_task = PythonOperator(
task_id='add_urls_to_queue',
python_callable=add_urls_callable,
# Pass only relevant params to the callable via context['params']
# Note: context['params'] automatically contains all DAG params
)
add_urls_task.doc_md = """
### Add URLs to Queue
Adds URLs from the `urls` parameter (comma/newline separated) to the Redis list specified by `queue_name`.
*Trigger this task manually via the UI and provide the URLs.*
"""
# Removed clear_queue_task
check_status_task = PythonOperator(
task_id='check_queue_status_after_add',
python_callable=check_status_callable,
# No task-specific params needed; callable uses context['params'] directly.
)
check_status_task.doc_md = """
### Check Queue Status After Add
Checks the type and length/size of the Redis key specified by `queue_name` (the same queue URLs were added to).
Logs the result and pushes `queue_key_type` and `queue_size` to XCom.
*This task runs automatically after `add_urls_to_queue`.*
"""
# Define dependency: Add URLs first, then check status
add_urls_task >> check_status_task

View File

@ -1,179 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
Airflow DAG for manually checking the status (type and size) of a specific Redis key used by YTDLP queues.
"""
from airflow import DAG
from airflow.exceptions import AirflowException
from airflow.models.param import Param
from airflow.operators.python import PythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from datetime import datetime, timedelta, timezone
import logging
import json
import redis # Import redis exceptions if needed
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_QUEUE_BASE_NAME = 'video_queue'
DEFAULT_MAX_ITEMS_TO_LIST = 25
# Import utility functions
from utils.redis_utils import _get_redis_client
# --- Python Callable for Check and List Task ---
def check_and_list_queue_callable(**context):
"""Checks the type and size of a Redis key and lists its recent contents."""
params = context['params']
redis_conn_id = params['redis_conn_id']
# queue_suffix is passed from the PythonOperator's op_kwargs, which are available in the context
queue_suffix = context['queue_suffix']
queue_name = params.get('queue_name', DEFAULT_QUEUE_BASE_NAME)
queue_to_check = f"{queue_name}{queue_suffix}"
max_items = int(params.get('max_items_to_list', DEFAULT_MAX_ITEMS_TO_LIST))
logger.info(f"--- Checking Status and Contents of Redis Key: '{queue_to_check}' ---")
logger.info(f"Using connection '{redis_conn_id}', listing up to {max_items} items.")
try:
redis_client = _get_redis_client(redis_conn_id)
key_type_bytes = redis_client.type(queue_to_check)
key_type = key_type_bytes.decode('utf-8')
if key_type == 'list':
list_length = redis_client.llen(queue_to_check)
logger.info(f"Redis key '{queue_to_check}' is a LIST with {list_length} items.")
if list_length > 0:
items_to_fetch = min(max_items, list_length)
# lrange with negative indices gets items from the end (most recent for rpush)
contents_bytes = redis_client.lrange(queue_to_check, -items_to_fetch, -1)
contents = [item.decode('utf-8') for item in contents_bytes]
contents.reverse() # Show most recent first
logger.info(f"--- Showing most recent {len(contents)} of {list_length} items ---")
for i, item in enumerate(contents):
logger.info(f" [recent_{i}]: {item}")
if list_length > len(contents):
logger.info(f" ... ({list_length - len(contents)} older items not shown)")
logger.info(f"--- End of List Contents ---")
elif key_type == 'hash':
hash_size = redis_client.hlen(queue_to_check)
logger.info(f"Redis key '{queue_to_check}' is a HASH with {hash_size} fields.")
if hash_size > 0:
logger.info(f"--- Showing a sample of up to {max_items} fields ---")
item_count = 0
# Using hscan_iter to safely iterate over hash fields, count is a hint
for field_bytes, value_bytes in redis_client.hscan_iter(queue_to_check, count=max_items):
if item_count >= max_items:
logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
break
field = field_bytes.decode('utf-8')
value = value_bytes.decode('utf-8')
# Try to pretty-print if value is JSON
try:
parsed_value = json.loads(value)
# Check for timestamp to show age
timestamp = parsed_value.get('end_time') or parsed_value.get('start_time')
age_str = ""
if timestamp:
age_seconds = (datetime.now(timezone.utc) - datetime.fromtimestamp(timestamp, timezone.utc)).total_seconds()
age_str = f" (age: {timedelta(seconds=age_seconds)})"
pretty_value = json.dumps(parsed_value, indent=2)
logger.info(f" Field '{field}'{age_str}:\n{pretty_value}")
except (json.JSONDecodeError, TypeError):
logger.info(f" Field '{field}': {value}")
item_count += 1
logger.info(f"--- End of Hash Contents ---")
elif key_type == 'none':
logger.info(f"Redis key '{queue_to_check}' does not exist.")
else:
logger.info(f"Redis key '{queue_to_check}' is of type '{key_type}'. Listing contents for this type is not implemented.")
except Exception as e:
logger.error(f"Failed to check/list contents of Redis key '{queue_to_check}': {e}", exc_info=True)
raise AirflowException(f"Failed to process Redis key: {e}")
# --- DAG Definition ---
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0, # No retries for a manual check/list operation
'start_date': days_ago(1)
}
with DAG(
dag_id='ytdlp_mgmt_queues_check_status',
default_args=default_args,
schedule_interval=None, # Manually triggered
catchup=False,
description='Manually check the status and recent items of all YTDLP Redis queues for a given base name.',
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'status', 'list'],
params={
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
'queue_name': Param(
DEFAULT_QUEUE_BASE_NAME,
type="string",
description="Base name for the Redis queues (e.g., 'video_queue')."
),
'max_items_to_list': Param(DEFAULT_MAX_ITEMS_TO_LIST, type="integer", description="Maximum number of recent items/fields to list from each queue."),
}
) as dag:
check_inbox_queue = PythonOperator(
task_id='check_inbox_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_inbox'},
)
check_inbox_queue.doc_md = """
### Check Inbox Queue (`_inbox`)
Checks the status and lists the most recent URLs waiting to be processed.
The full queue name is `{{ params.queue_name }}_inbox`.
"""
check_progress_queue = PythonOperator(
task_id='check_progress_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_progress'},
)
check_progress_queue.doc_md = """
### Check Progress Queue (`_progress`)
Checks the status and lists a sample of URLs currently being processed.
The full queue name is `{{ params.queue_name }}_progress`.
"""
check_result_queue = PythonOperator(
task_id='check_result_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_result'},
)
check_result_queue.doc_md = """
### Check Result Queue (`_result`)
Checks the status and lists a sample of successfully processed URLs.
The full queue name is `{{ params.queue_name }}_result`.
"""
check_fail_queue = PythonOperator(
task_id='check_fail_queue',
python_callable=check_and_list_queue_callable,
op_kwargs={'queue_suffix': '_fail'},
)
check_fail_queue.doc_md = """
### Check Fail Queue (`_fail`)
Checks the status and lists a sample of failed URLs.
The full queue name is `{{ params.queue_name }}_fail`.
"""

View File

@ -1,99 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
Airflow DAG for manually clearing (deleting) a specific Redis key used by YTDLP queues.
"""
from airflow import DAG
from airflow.exceptions import AirflowException
from airflow.models.param import Param
from airflow.operators.python import PythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from datetime import timedelta
import logging
import redis # Import redis exceptions if needed
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_REDIS_CONN_ID = 'redis_default'
# Provide a placeholder default, user MUST specify the queue to clear
DEFAULT_QUEUE_TO_CLEAR = 'PLEASE_SPECIFY_QUEUE_TO_CLEAR'
# Import utility functions
from utils.redis_utils import _get_redis_client
# --- Python Callable for Clear Task ---
def clear_queue_callable(**context):
"""Clears (deletes) the specified Redis key (queue/hash)."""
params = context['params']
redis_conn_id = params['redis_conn_id']
queue_to_clear = params['queue_to_clear'] # Specific queue/hash name
if not queue_to_clear or queue_to_clear == DEFAULT_QUEUE_TO_CLEAR:
raise ValueError("Parameter 'queue_to_clear' must be specified and cannot be the default placeholder.")
logger.info(f"Attempting to clear Redis key '{queue_to_clear}' using connection '{redis_conn_id}'.")
try:
redis_client = _get_redis_client(redis_conn_id)
deleted_count = redis_client.delete(queue_to_clear)
if deleted_count > 0:
logger.info(f"Successfully cleared Redis key '{queue_to_clear}'.")
else:
logger.info(f"Redis key '{queue_to_clear}' did not exist or was already empty.")
except Exception as e:
logger.error(f"Failed to clear Redis key '{queue_to_clear}': {e}", exc_info=True)
raise AirflowException(f"Failed to clear Redis key: {e}")
# --- DAG Definition ---
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0, # No retries for manual clear operation
'start_date': days_ago(1)
}
with DAG(
dag_id='ytdlp_mgmt_queue_clear',
default_args=default_args,
schedule_interval=None, # Manually triggered
catchup=False,
description='Manually clear/delete a specific YTDLP Redis queue/key (inbox, progress, result, fail). Use with caution!',
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'clear'],
params={
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
'queue_to_clear': Param(
DEFAULT_QUEUE_TO_CLEAR,
type="string",
description="Exact name of the Redis key to clear (e.g., 'video_queue_inbox_account_xyz', 'video_queue_progress', 'video_queue_result', 'video_queue_fail')."
),
}
) as dag:
clear_queue_task = PythonOperator(
task_id='clear_specified_queue',
python_callable=clear_queue_callable,
# Params are implicitly passed via context['params']
)
clear_queue_task.doc_md = """
### Clear Specified Queue/Key Task
Deletes the Redis key specified by the `queue_to_clear` parameter.
This can target any key, including:
- `_inbox` (Redis List): Contains URLs waiting to be processed.
- `_progress` (Redis Hash): Contains URLs currently being processed.
- `_result` (Redis Hash): Contains details of successfully processed URLs.
- `_fail` (Redis Hash): Contains details of failed URLs.
**Warning:** This operation is destructive and cannot be undone. Ensure you specify the correct key name.
*Trigger this task manually via the UI.*
"""

View File

@ -1,151 +0,0 @@
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
Airflow DAG for manually listing the contents of a specific Redis key used by YTDLP queues.
"""
from airflow import DAG
from airflow.exceptions import AirflowException
from airflow.models.param import Param
from airflow.operators.python import PythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from datetime import timedelta
import logging
import json
import redis # Import redis exceptions if needed
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_REDIS_CONN_ID = 'redis_default'
# Default to a common inbox pattern, user should override with the specific key
DEFAULT_QUEUE_TO_LIST = 'video_queue_inbox'
DEFAULT_MAX_ITEMS = 10 # Limit number of items listed by default
# Import utility functions
from utils.redis_utils import _get_redis_client
# --- Python Callable for List Contents Task ---
def list_contents_callable(**context):
"""Lists the contents of the specified Redis key (list or hash)."""
params = context['params']
redis_conn_id = params['redis_conn_id']
queue_to_list = params['queue_to_list']
max_items = params.get('max_items', DEFAULT_MAX_ITEMS)
if not queue_to_list:
raise ValueError("Parameter 'queue_to_list' cannot be empty.")
logger.info(f"Attempting to list contents of Redis key '{queue_to_list}' (max: {max_items}) using connection '{redis_conn_id}'.")
try:
redis_client = _get_redis_client(redis_conn_id)
key_type_bytes = redis_client.type(queue_to_list)
key_type = key_type_bytes.decode('utf-8') # Decode type
if key_type == 'list':
list_length = redis_client.llen(queue_to_list)
# Get the last N items, which are the most recently added with rpush
items_to_fetch = min(max_items, list_length)
# lrange with negative indices gets items from the end of the list.
# -N to -1 gets the last N items.
contents_bytes = redis_client.lrange(queue_to_list, -items_to_fetch, -1)
contents = [item.decode('utf-8') for item in contents_bytes]
# Reverse the list so the absolute most recent item is printed first
contents.reverse()
logger.info(f"--- Contents of Redis List '{queue_to_list}' (showing most recent {len(contents)} of {list_length}) ---")
for i, item in enumerate(contents):
# The index here is just for display, 0 is the most recent
logger.info(f" [recent_{i}]: {item}")
if list_length > len(contents):
logger.info(f" ... ({list_length - len(contents)} older items not shown)")
logger.info(f"--- End of List Contents ---")
# Optionally push contents to XCom if small enough
# context['task_instance'].xcom_push(key='list_contents', value=contents)
elif key_type == 'hash':
hash_size = redis_client.hlen(queue_to_list)
# HGETALL can be risky for large hashes. Consider HSCAN for production.
# For manual inspection, HGETALL is often acceptable.
if hash_size > max_items * 2: # Heuristic: avoid huge HGETALL
logger.warning(f"Hash '{queue_to_list}' has {hash_size} fields, which is large. Listing might be slow or incomplete. Consider using redis-cli HSCAN.")
# Optionally implement HSCAN here for large hashes
# hgetall returns dict of bytes keys and bytes values, decode them
contents_bytes = redis_client.hgetall(queue_to_list)
contents = {k.decode('utf-8'): v.decode('utf-8') for k, v in contents_bytes.items()}
logger.info(f"--- Contents of Redis Hash '{queue_to_list}' ({len(contents)} fields) ---")
item_count = 0
for key, value in contents.items(): # key and value are now strings
if item_count >= max_items:
logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
break
# Attempt to pretty-print if value is JSON
try:
parsed_value = json.loads(value)
pretty_value = json.dumps(parsed_value, indent=2)
logger.info(f" '{key}':\n{pretty_value}")
except json.JSONDecodeError:
logger.info(f" '{key}': {value}") # Print as string if not JSON
item_count += 1
logger.info(f"--- End of Hash Contents ---")
# Optionally push contents to XCom if small enough
# context['task_instance'].xcom_push(key='hash_contents', value=contents)
elif key_type == 'none':
logger.info(f"Redis key '{queue_to_list}' does not exist.")
else:
logger.info(f"Redis key '{queue_to_list}' is of type '{key_type}'. Listing contents for this type is not implemented.")
except Exception as e:
logger.error(f"Failed to list contents of Redis key '{queue_to_list}': {e}", exc_info=True)
raise AirflowException(f"Failed to list Redis key contents: {e}")
# --- DAG Definition ---
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0, # No retries for manual list operation
'start_date': days_ago(1)
}
with DAG(
dag_id='ytdlp_mgmt_queue_list_contents',
default_args=default_args,
schedule_interval=None, # Manually triggered
catchup=False,
description='Manually list the contents of a specific YTDLP Redis queue/key (list or hash).',
tags=['ytdlp', 'queue', 'management', 'redis', 'manual', 'list'],
params={
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
'queue_to_list': Param(
DEFAULT_QUEUE_TO_LIST,
type="string",
description="Exact name of the Redis key (list/hash) to list contents for (e.g., 'video_queue_inbox_account_xyz', 'video_queue_progress', etc.)."
),
'max_items': Param(DEFAULT_MAX_ITEMS, type="integer", description="Maximum number of items/fields to list. For lists, shows the most recent items."),
}
) as dag:
list_contents_task = PythonOperator(
task_id='list_specified_queue_contents',
python_callable=list_contents_callable,
# Params are implicitly passed via context['params']
)
list_contents_task.doc_md = """
### List Specified Queue/Key Contents Task
Lists the contents of the Redis key specified by `queue_to_list`.
- For **Lists** (e.g., `_inbox`), shows the most recent `max_items` entries.
- For **Hashes** (e.g., `_progress`, `_result`, `_fail`), shows up to `max_items` key-value pairs. Attempts to pretty-print JSON values.
- Logs a warning for very large hashes.
*Trigger this task manually via the UI.*
"""

493
dags/ytdlp_mgmt_queues.py Normal file
View File

@ -0,0 +1,493 @@
# -*- coding: utf-8 -*-
"""
Airflow DAG for manually adding YouTube URLs or Video IDs to a Redis queue.
"""
from __future__ import annotations
import json
import logging
import re
from typing import List, Optional
import csv
import os
from datetime import datetime
from airflow.exceptions import AirflowException
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_REDIS_CONN_ID = "redis_default"
DEFAULT_QUEUE_NAME = "video_queue"
DEFAULT_QUEUE_TO_CLEAR = 'PLEASE_SPECIFY_QUEUE_TO_CLEAR'
# --- Helper Functions ---
def _get_redis_client(redis_conn_id: str):
"""Gets a Redis client from an Airflow connection."""
try:
redis_hook = RedisHook(redis_conn_id=redis_conn_id)
return redis_hook.get_conn()
except Exception as e:
logger.error(f"Failed to connect to Redis using connection '{redis_conn_id}': {e}")
raise AirflowException(f"Redis connection failed: {e}")
def parse_video_inputs(input_str: str) -> List[str]:
"""Parses a flexible string of video inputs into a list of individual items."""
if not input_str or not isinstance(input_str, str):
return []
input_str = input_str.strip()
# 1. Try to parse as a JSON array
if input_str.startswith("[") and input_str.endswith("]"):
try:
items = json.loads(input_str)
if isinstance(items, list):
logger.info("Successfully parsed input as a JSON array.")
return [str(item).strip() for item in items]
except json.JSONDecodeError:
logger.warning("Input looked like a JSON array but failed to parse. Treating as a comma-separated string.")
# 2. Treat as a comma-separated string
items = [item.strip() for item in input_str.split(",")]
# 3. Clean up quotes and extra whitespace from each item
cleaned_items = []
for item in items:
if item.startswith(('"', "'")) and item.endswith(('"', "'")):
item = item[1:-1]
if item: # Only add non-empty items
cleaned_items.append(item.strip())
return cleaned_items
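# Illustrative behaviour of parse_video_inputs (assumed from the rules above):
#   parse_video_inputs('["dQw4w9WgXcQ", "9bZkp7q19f0"]') -> ['dQw4w9WgXcQ', '9bZkp7q19f0']
#   parse_video_inputs("'dQw4w9WgXcQ', https://youtu.be/9bZkp7q19f0")
#       -> ['dQw4w9WgXcQ', 'https://youtu.be/9bZkp7q19f0']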
def normalize_to_url(item: str) -> Optional[str]:
"""
Validates if an item is a recognizable YouTube URL or video ID,
and normalizes it to a standard watch URL format.
"""
if not item:
return None
# Regex for a standard 11-character YouTube video ID
video_id_pattern = r"^[a-zA-Z0-9_-]{11}$"
# Check if the item itself is a video ID
if re.match(video_id_pattern, item):
video_id = item
return f"https://www.youtube.com/watch?v={video_id}"
# Comprehensive regex to extract video ID from various URL formats
# Covers: watch, youtu.be, shorts, embed, /v/
url_patterns = [
r"(?:v=|\/v\/|youtu\.be\/|embed\/|shorts\/)([a-zA-Z0-9_-]{11})"
]
for pattern in url_patterns:
match = re.search(pattern, item)
if match:
video_id = match.group(1)
return f"https://www.youtube.com/watch?v={video_id}"
logger.warning(f"Could not recognize '{item}' as a valid YouTube URL or video ID.")
return None
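# A few hedged examples of the normalization above:
#   normalize_to_url("dQw4w9WgXcQ")                        -> "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
#   normalize_to_url("https://youtu.be/dQw4w9WgXcQ?t=42")  -> "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
#   normalize_to_url("https://www.youtube.com/shorts/dQw4w9WgXcQ") -> "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
#   normalize_to_url("not a video url")                    -> None (a warning is logged)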
def dump_redis_data_to_csv(redis_client, dump_dir, patterns):
"""Dumps data from Redis keys matching patterns to separate CSV files in a timestamped directory."""
timestamp_dir = datetime.now().strftime('%Y%m%d_%H%M%S')
full_dump_path = os.path.join(dump_dir, timestamp_dir)
os.makedirs(full_dump_path, exist_ok=True)
logger.info(f"Created dump directory: {full_dump_path}")
for pattern in patterns:
if not pattern: continue
# Sanitize pattern for filename
sanitized_pattern = re.sub(r'[^a-zA-Z0-9_-]', '_', pattern)
timestamp_file = datetime.now().strftime('%Y%m%d')
dump_file_name = f'redis_dump_{sanitized_pattern}_{timestamp_file}.csv'
dump_file_path = os.path.join(full_dump_path, dump_file_name)
logger.info(f"Dumping keys matching '{pattern}' to {dump_file_path}")
try:
with open(dump_file_path, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['key', 'type', 'field_or_index', 'value'])
keys_found = 0
for key_bytes in redis_client.scan_iter(pattern):
key = key_bytes.decode('utf-8')
keys_found += 1
key_type = redis_client.type(key).decode('utf-8')
if key_type == 'hash':
for field, value in redis_client.hgetall(key).items():
writer.writerow([key, key_type, field.decode('utf-8'), value.decode('utf-8')])
elif key_type == 'list':
for index, value in enumerate(redis_client.lrange(key, 0, -1)):
writer.writerow([key, key_type, index, value.decode('utf-8')])
elif key_type == 'set':
for member in redis_client.smembers(key):
writer.writerow([key, key_type, None, member.decode('utf-8')])
elif key_type == 'string':
value = redis_client.get(key)
if value:
writer.writerow([key, key_type, None, value.decode('utf-8')])
if keys_found > 0:
logger.info(f"Successfully dumped {keys_found} keys for pattern '{pattern}' to {dump_file_path}")
else:
logger.info(f"No keys found for pattern '{pattern}'. Empty CSV file created at {dump_file_path}")
except Exception as e:
logger.error(f"Failed to dump Redis data for pattern '{pattern}': {e}", exc_info=True)
raise AirflowException(f"Failed to dump Redis data for pattern '{pattern}': {e}")
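# Example of the resulting CSV layout (rows are illustrative):
#   key,type,field_or_index,value
#   video_queue_inbox,list,0,https://www.youtube.com/watch?v=dQw4w9WgXcQ
#   video_queue_result,hash,https://www.youtube.com/watch?v=dQw4w9WgXcQ,"{""status"": ""success""}"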
def clear_queue_callable(**context):
"""Dumps Redis data to CSV and/or clears a specified Redis key."""
params = context['params']
redis_conn_id = params['redis_conn_id']
queue_to_clear = params['queue_to_clear']
dump_queues = params['dump_queues']
# Get the rendered dump_dir from the templates_dict passed to the operator
dump_dir = context['templates_dict']['dump_dir']
dump_patterns = params['dump_patterns'].split(',') if params.get('dump_patterns') else []
redis_client = _get_redis_client(redis_conn_id)
if dump_queues and dump_patterns:
dump_redis_data_to_csv(redis_client, dump_dir, dump_patterns)
if not queue_to_clear or queue_to_clear == DEFAULT_QUEUE_TO_CLEAR:
logger.info("Parameter 'queue_to_clear' is not specified or is the default placeholder. Skipping key deletion.")
# If we only wanted to dump, this is a success.
return
logger.info(f"Attempting to clear Redis key '{queue_to_clear}' using connection '{redis_conn_id}'.")
try:
deleted_count = redis_client.delete(queue_to_clear)
if deleted_count > 0:
logger.info(f"Successfully cleared Redis key '{queue_to_clear}'.")
else:
logger.info(f"Redis key '{queue_to_clear}' did not exist or was already empty.")
except Exception as e:
logger.error(f"Failed to clear Redis key '{queue_to_clear}': {e}", exc_info=True)
raise AirflowException(f"Failed to clear Redis key: {e}")
def list_contents_callable(**context):
"""Lists the contents of the specified Redis key (list or hash)."""
params = context['params']
redis_conn_id = params['redis_conn_id']
queue_to_list = params['queue_to_list']
max_items = params.get('max_items', 10)
if not queue_to_list:
raise ValueError("Parameter 'queue_to_list' cannot be empty.")
logger.info(f"Attempting to list contents of Redis key '{queue_to_list}' (max: {max_items}) using connection '{redis_conn_id}'.")
try:
redis_client = _get_redis_client(redis_conn_id)
key_type_bytes = redis_client.type(queue_to_list)
key_type = key_type_bytes.decode('utf-8') # Decode type
if key_type == 'list':
list_length = redis_client.llen(queue_to_list)
# Get the last N items, which are the most recently added with rpush
items_to_fetch = min(max_items, list_length)
# lrange with negative indices gets items from the end of the list.
# -N to -1 gets the last N items.
contents_bytes = redis_client.lrange(queue_to_list, -items_to_fetch, -1)
contents = [item.decode('utf-8') for item in contents_bytes]
# Reverse the list so the absolute most recent item is printed first
contents.reverse()
logger.info(f"--- Contents of Redis List '{queue_to_list}' (showing most recent {len(contents)} of {list_length}) ---")
for i, item in enumerate(contents):
# The index here is just for display, 0 is the most recent
logger.info(f" [recent_{i}]: {item}")
if list_length > len(contents):
logger.info(f" ... ({list_length - len(contents)} older items not shown)")
logger.info(f"--- End of List Contents ---")
elif key_type == 'hash':
hash_size = redis_client.hlen(queue_to_list)
# HGETALL can be risky for large hashes. Consider HSCAN for production.
# For manual inspection, HGETALL is often acceptable.
if hash_size > max_items * 2: # Heuristic: avoid huge HGETALL
logger.warning(f"Hash '{queue_to_list}' has {hash_size} fields, which is large. Listing might be slow or incomplete. Consider using redis-cli HSCAN.")
# hgetall returns dict of bytes keys and bytes values, decode them
contents_bytes = redis_client.hgetall(queue_to_list)
contents = {k.decode('utf-8'): v.decode('utf-8') for k, v in contents_bytes.items()}
logger.info(f"--- Contents of Redis Hash '{queue_to_list}' ({len(contents)} fields) ---")
item_count = 0
for key, value in contents.items(): # key and value are now strings
if item_count >= max_items:
logger.info(f" ... (stopped listing after {max_items} items of {hash_size})")
break
# Attempt to pretty-print if value is JSON
try:
parsed_value = json.loads(value)
pretty_value = json.dumps(parsed_value, indent=2)
logger.info(f" '{key}':\n{pretty_value}")
except json.JSONDecodeError:
logger.info(f" '{key}': {value}") # Print as string if not JSON
item_count += 1
logger.info(f"--- End of Hash Contents ---")
elif key_type == 'none':
logger.info(f"Redis key '{queue_to_list}' does not exist.")
else:
logger.info(f"Redis key '{queue_to_list}' is of type '{key_type}'. Listing contents for this type is not implemented.")
except Exception as e:
logger.error(f"Failed to list contents of Redis key '{queue_to_list}': {e}", exc_info=True)
raise AirflowException(f"Failed to list Redis key contents: {e}")
def check_status_callable(**context):
"""Checks the status (type and size) of all standard Redis queues for a given base name."""
params = context['params']
redis_conn_id = params['redis_conn_id']
queue_name = params.get('queue_name_for_status', DEFAULT_QUEUE_NAME)
queue_suffixes = ['_inbox', '_progress', '_result', '_fail']
logger.info(f"--- Checking Status for Queues with Base Name: '{queue_name}' ---")
try:
redis_client = _get_redis_client(redis_conn_id)
for suffix in queue_suffixes:
queue_to_check = f"{queue_name}{suffix}"
key_type = redis_client.type(queue_to_check).decode('utf-8')
size = 0
if key_type == 'list':
size = redis_client.llen(queue_to_check)
elif key_type == 'hash':
size = redis_client.hlen(queue_to_check)
if key_type != 'none':
logger.info(f" - Queue '{queue_to_check}': Type='{key_type.upper()}', Size={size}")
else:
logger.info(f" - Queue '{queue_to_check}': Does not exist.")
logger.info(f"--- End of Status Check ---")
except Exception as e:
logger.error(f"Failed to check queue status for base name '{queue_name}': {e}", exc_info=True)
raise AirflowException(f"Failed to check queue status: {e}")
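# Example of the log output produced above (sizes are illustrative):
#   - Queue 'video_queue_inbox': Type='LIST', Size=3
#   - Queue 'video_queue_progress': Type='HASH', Size=1
#   - Queue 'video_queue_result': Type='HASH', Size=12
#   - Queue 'video_queue_fail': Does not exist.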
def add_videos_to_queue_callable(**context):
"""
Parses video inputs, normalizes them to URLs, and adds them to a Redis queue.
"""
params = context["params"]
video_inputs = params["video_inputs"]
queue_name = params["queue_name"]
redis_conn_id = params["redis_conn_id"]
dry_run = params["dry_run"]
if not video_inputs:
logger.info("No video inputs provided. Nothing to do.")
print("No video inputs provided. Nothing to do.")
return
raw_items = parse_video_inputs(video_inputs)
if not raw_items:
logger.info("Input string was empty or contained no items after parsing.")
print("Input string was empty or contained no items after parsing.")
return
valid_urls = []
for item in raw_items:
url = normalize_to_url(item)
if url and url not in valid_urls:
valid_urls.append(url)
elif not url:
logger.warning(f"Skipping invalid input item: '{item}'")
if not valid_urls:
raise AirflowException("No valid YouTube URLs or IDs were found in the provided input.")
logger.info(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:")
print(f"Found {len(valid_urls)} valid and unique URLs to add to the queue:")
for url in valid_urls:
logger.info(f" - {url}")
print(f" - {url}")
if dry_run:
logger.info("Dry run is enabled. Skipping Redis operation.")
print(f"\n[DRY RUN] Would have added {len(valid_urls)} URLs to the Redis list '{queue_name}_inbox'.")
return
# --- Add to Redis ---
inbox_queue = f"{queue_name}_inbox"  # defined before the try so the except handler can reference it
try:
redis_client = _get_redis_client(redis_conn_id)
# Use a pipeline for atomic and efficient addition
with redis_client.pipeline() as pipe:
for url in valid_urls:
pipe.rpush(inbox_queue, url)
pipe.execute()
final_list_length = redis_client.llen(inbox_queue)
success_message = (
f"Successfully added {len(valid_urls)} URLs to Redis list '{inbox_queue}'. "
f"The list now contains {final_list_length} items."
)
logger.info(success_message)
print(f"\n{success_message}")
except Exception as e:
logger.error(f"Failed to add URLs to Redis queue '{inbox_queue}': {e}", exc_info=True)
raise AirflowException(f"Failed to add URLs to Redis: {e}")
# --- DAG Definition ---
with DAG(
dag_id="ytdlp_mgmt_queues",
default_args={
"owner": "airflow",
"start_date": days_ago(1),
"retries": 0,
},
schedule=None,
catchup=False,
tags=["ytdlp", "queue", "management", "redis", "manual"],
doc_md="""
### YT-DLP Queue Management
This DAG provides a set of tools to manage Redis queues used by the YTDLP processing pipeline.
Select an `action` to perform when triggering the DAG.
**Actions:**
- `add_videos`: Add one or more YouTube videos to a queue.
- `clear_queue`: Dump and/or delete a specific Redis key.
- `list_contents`: View the contents of a Redis key (list or hash).
- `check_status`: Check the type and size of each standard queue (`_inbox`, `_progress`, `_result`, `_fail`) for a base name.
""",
params={
"action": Param(
"add_videos",
type="string",
enum=["add_videos", "clear_queue", "list_contents", "check_status"],
title="Action",
description="The management action to perform.",
),
# --- Params for 'add_videos' ---
"video_inputs": Param(
None,
type=["null", "string"],
title="[add_videos] Video URLs or IDs",
description="A single item, comma-separated list, or JSON array of YouTube URLs or Video IDs.",
),
"queue_name": Param(
DEFAULT_QUEUE_NAME,
type="string",
title="[add_videos] Queue Name",
description="The base name of the Redis queue to add videos to (e.g., 'video_queue').",
),
"dry_run": Param(
False,
type="boolean",
title="[add_videos] Dry Run",
description="If True, validate inputs without adding them to the queue.",
),
# --- Params for 'clear_queue' ---
"queue_to_clear": Param(
DEFAULT_QUEUE_TO_CLEAR,
type="string",
title="[clear_queue] Queue to Clear",
description="Exact name of the Redis key to delete.",
),
"dump_queues": Param(
True,
type="boolean",
title="[clear_queue] Dump Data",
description="If True, dump data before clearing.",
),
"dump_dir": Param(
"{{ var.value.get('YTDLP_REDIS_DUMP_DIR', '/opt/airflow/dumps') }}",
type="string",
title="[clear_queue] Dump Directory",
description="Base directory to save CSV dump files.",
),
"dump_patterns": Param(
'ytdlp:*,video_queue_*',
type="string",
title="[clear_queue] Dump Patterns",
description="Comma-separated list of key patterns to dump.",
),
# --- Params for 'list_contents' ---
"queue_to_list": Param(
'video_queue_inbox',
type="string",
title="[list_contents] Queue to List",
description="Exact name of the Redis key to list.",
),
"max_items": Param(
10,
type="integer",
title="[list_contents] Max Items to List",
description="Maximum number of items to show.",
),
# --- Params for 'check_status' ---
"queue_name_for_status": Param(
DEFAULT_QUEUE_NAME,
type="string",
title="[check_status] Base Queue Name",
description="Base name of the queues to check (e.g., 'video_queue').",
),
# --- Common Params ---
"redis_conn_id": Param(
DEFAULT_REDIS_CONN_ID,
type="string",
title="Redis Connection ID",
),
},
) as dag:
branch_on_action = BranchPythonOperator(
task_id="branch_on_action",
python_callable=lambda **context: f"action_{context['params']['action']}",
)
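# The lambda above maps the selected 'action' param onto a task_id,
# e.g. action='clear_queue' routes the run to the 'action_clear_queue' task below.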
action_add_videos = PythonOperator(
task_id="action_add_videos",
python_callable=add_videos_to_queue_callable,
)
action_clear_queue = PythonOperator(
task_id="action_clear_queue",
python_callable=clear_queue_callable,
templates_dict={'dump_dir': "{{ params.dump_dir }}"},
)
action_list_contents = PythonOperator(
task_id="action_list_contents",
python_callable=list_contents_callable,
)
action_check_status = PythonOperator(
task_id="action_check_status",
python_callable=check_status_callable,
)
# --- Task Dependencies ---
branch_on_action >> [action_add_videos, action_clear_queue, action_list_contents, action_check_status]
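# --- Trigger sketch (illustrative only) ---
# One possible DAG-run conf for the 'add_videos' action; the keys mirror the
# Params declared above, the values are placeholders.
if __name__ == "__main__":
    example_conf = {
        "action": "add_videos",
        "video_inputs": "dQw4w9WgXcQ, https://youtu.be/9bZkp7q19f0",
        "queue_name": "video_queue",
        "dry_run": True,
        "redis_conn_id": "redis_default",
    }
    print(json.dumps(example_conf, indent=2))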

View File

@ -35,6 +35,19 @@ DEFAULT_MAX_URLS = '1' # Default number of URLs to process per run
# --- Task Callables ---
def select_account_callable(**context):
"""
Placeholder task for future logic to dynamically select an account.
For now, it just passes through the account_id from the DAG params.
"""
params = context['params']
account_id = params.get('account_id', 'default_account')
logger.info(f"Selected account for this run: {account_id}")
# This task could push the selected account_id to XComs in the future.
# For now, the next task will just read it from params.
return account_id
def log_trigger_info_callable(**context):
"""Logs information about how the DAG run was triggered."""
dag_run = context['dag_run']
@ -113,7 +126,8 @@ def check_queue_for_urls_batch(**context):
'queue_name': queue_name,
'redis_conn_id': redis_conn_id,
'max_urls_per_run': int(max_urls),
'stop_on_failure': params.get('stop_on_failure', True) 'stop_on_failure': params.get('stop_on_failure', True),
'account_id': params.get('account_id', 'default_account')
}
trigger_configs.append(worker_conf)
return trigger_configs
@ -140,18 +154,19 @@ default_args = {
}
with DAG(
dag_id='ytdlp_sensor_redis_queue', dag_id='ytdlp_ops_sensor_queue',
default_args=default_args,
schedule_interval='*/1 * * * *', # Runs every minute and can also be triggered. schedule_interval=None, # Runs only on trigger, not on a schedule.
max_active_runs=1, # Prevent multiple sensors from running at once
catchup=False,
description='Polls Redis queue every minute (and on trigger) for URLs and starts worker DAGs.', description='Polls Redis queue on trigger for URLs and starts worker DAGs.',
tags=['ytdlp', 'sensor', 'queue', 'redis', 'batch'],
params={
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="Base name for Redis queues."),
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
'max_urls_per_run': Param(DEFAULT_MAX_URLS, type="string", description="Maximum number of URLs to process in one batch."),
'stop_on_failure': Param(True, type="boolean", description="If True, a worker failure will stop the entire processing loop."),
'account_id': Param('default_account', type="string", description="The account ID to use for processing the batch."),
}
) as dag:
@ -179,7 +194,7 @@ with DAG(
# This operator will be dynamically expanded based on the output of poll_redis_task
trigger_worker_dags = TriggerDagRunOperator.partial(
task_id='trigger_worker_dags',
trigger_dag_id='ytdlp_worker_per_url', trigger_dag_id='ytdlp_ops_worker_per_url',
wait_for_completion=False, # Fire and forget
doc_md="""
### Trigger Worker DAGs (Dynamically Mapped)
@ -191,4 +206,10 @@ This task is skipped if the polling task finds no URLs.
conf=poll_redis_task.output
)
log_trigger_info_task >> poll_redis_task >> trigger_worker_dags select_account_task = PythonOperator(
task_id='select_account',
python_callable=select_account_callable,
)
select_account_task.doc_md = "### Select Account\n(Placeholder for future dynamic account selection logic)"
log_trigger_info_task >> select_account_task >> poll_redis_task >> trigger_worker_dags
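# Note: each mapped trigger_worker_dags run receives the conf built in
# check_queue_for_urls_batch; per the hunk above it carries at least queue_name,
# redis_conn_id, max_urls_per_run, stop_on_failure and account_id (any per-URL
# fields are added outside the lines shown here).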

View File

@ -74,13 +74,51 @@ def _extract_video_id(url):
# --- Queue Management Callables (for success/failure reporting) ---
def handle_success(**context): def mark_proxy_banned_callable(**context):
"""Makes a Thrift call to ban a proxy if the get_token task failed with a bannable error."""
ti = context['task_instance']
proxy_to_ban = ti.xcom_pull(task_ids='get_token', key='proxy_to_ban')
if not proxy_to_ban:
logger.info("No proxy to ban was pushed to XCom. Skipping task.")
raise AirflowSkipException("No proxy to ban was identified in the upstream failure.")
server_identity = ti.xcom_pull(task_ids='get_token', key='server_identity_for_ban')
host = ti.xcom_pull(task_ids='get_token', key='service_host_for_ban')
port = ti.xcom_pull(task_ids='get_token', key='service_port_for_ban')
if not all([server_identity, host, port]):
logger.error("Missing connection details (identity, host, or port) from XCom. Cannot ban proxy.")
raise AirflowException("Missing connection details to ban proxy.")
logger.warning(f"Attempting to ban proxy '{proxy_to_ban}' for server '{server_identity}' at {host}:{port}.")
transport = None
try:
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)
socket_conn.setTimeout(15 * 1000) # 15s timeout for ban call
transport = TTransport.TFramedTransport(socket_conn)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
transport.open()
client.banProxy(proxyUrl=proxy_to_ban, serverIdentity=server_identity)
logger.info(f"Successfully sent request to ban proxy '{proxy_to_ban}'.")
except Exception as ban_exc:
logger.error(f"Failed to send ban request for proxy '{proxy_to_ban}': {ban_exc}", exc_info=True)
# We should fail the task if the ban call fails, as it's an important side-effect.
raise AirflowException(f"Failed to ban proxy: {ban_exc}")
finally:
if transport and transport.isOpen():
transport.close()
def mark_url_as_success(**context):
"""Moves URL from progress to result hash on success."""
ti = context['task_instance']
params = context['params']
url = params.get('url') # Get URL from params, not XCom
if not url:
logger.warning("handle_success called but no URL found in DAG run parameters.") logger.warning("mark_url_as_success called but no URL found in DAG run parameters.")
return
queue_name = params['queue_name']
@ -91,7 +129,7 @@ def handle_success(**context):
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
downloaded_file_path = ti.xcom_pull(task_ids='download_video') downloaded_file_path = ti.xcom_pull(task_ids='download_and_probe')
logger.info(f"Handling success for URL: {url}")
logger.info(f" Downloaded File Path: {downloaded_file_path}")
@ -117,7 +155,8 @@ def handle_success(**context):
logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True)
# Log error but don't fail the task, as the main work succeeded.
def handle_failure(**context):
def mark_url_as_failed(**context):
"""
Handles failed processing. Records detailed error information to the fail hash
and, if stop_on_failure is True, fails the task to make the DAG run failure visible.
@ -126,42 +165,72 @@ def handle_failure(**context):
params = context['params']
url = params.get('url') # Get URL from params
if not url:
logger.error("handle_failure called but no URL found in DAG run parameters.") logger.error("mark_url_as_failed called but no URL found in DAG run parameters.")
return
queue_name = params['queue_name']
fail_queue = f"{queue_name}_fail"
inbox_queue = f"{queue_name}_inbox"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
requeue_on_failure = params.get('requeue_on_failure', False)
stop_on_failure = params.get('stop_on_failure', True)
# Determine if we should requeue based on various parameters
should_requeue = params.get('requeue_on_failure', False)
requeue_on_bannable_error = params.get('requeue_on_bannable_error', False)
requeue_on_ffprobe_failure = params.get('requeue_on_ffprobe_failure', False)
# --- Extract Detailed Error Information ---
exception = context.get('exception')
error_message = str(exception) if exception else "Unknown error"
error_type = type(exception).__name__ if exception else "Unknown"
tb_str = "".join(traceback.format_exception(etype=type(exception), value=exception, tb=exception.__traceback__)) if exception else "No traceback available."
# Find the specific task that failed # Find the specific task that failed to pull its XComs
dag_run = context['dag_run']
failed_task_id = "unknown"
# Look at direct upstream tasks of the current task ('handle_failure') upstream_tasks = ti.task.get_direct_relatives(upstream=True)
upstream_tasks = ti.get_direct_relatives(upstream=True)
for task in upstream_tasks:
upstream_ti = dag_run.get_task_instance(task_id=task.task_id)
if upstream_ti and upstream_ti.state == 'failed':
failed_task_id = task.task_id
break
error_details = None
if failed_task_id != "unknown":
error_details = ti.xcom_pull(task_ids=failed_task_id, key='error_details')
if error_details:
error_message = error_details.get('error_message', 'Unknown error from XCom')
error_type = error_details.get('error_type', 'Unknown type from XCom')
tb_str = error_details.get('traceback', 'No traceback in XCom.')
else:
error_message = str(exception) if exception else "Unknown error"
error_type = type(exception).__name__ if exception else "Unknown"
# Positional arguments keep compatibility with Python 3.10+, where the 'etype' keyword was removed.
tb_str = "".join(traceback.format_exception(type(exception), exception, exception.__traceback__)) if exception else "No traceback available."
logger.info(f"Handling failure for URL: {url}")
logger.error(f" Failed Task: {failed_task_id}")
logger.error(f" Failure Type: {error_type}")
logger.error(f" Failure Reason: {error_message}")
logger.debug(f" Traceback:\n{tb_str}")
# --- Check for specific requeue conditions ---
if not should_requeue: # Only check specific conditions if the general one is false
if requeue_on_bannable_error and isinstance(exception, PBServiceException):
bannable_error_codes = [
"BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED",
"SOCKS5_CONNECTION_FAILED", "CLIENT_TIMEOUT", "GLOBAL_TIMEOUT"
]
if hasattr(exception, 'errorCode') and exception.errorCode in bannable_error_codes:
should_requeue = True
logger.info(f"Bannable error '{exception.errorCode}' detected. Re-queuing URL as per 'requeue_on_bannable_error' param.")
if requeue_on_ffprobe_failure and isinstance(exception, AirflowException) and "Bash command failed" in str(exception):
# Check for the specific exit code for probe failure
if "exit code 2" in str(exception):
should_requeue = True
logger.info("Probe failure detected (exit code 2). Re-queuing URL as per 'requeue_on_ffprobe_failure' param.")
try:
client = _get_redis_client(redis_conn_id)
if requeue_on_failure: if should_requeue:
client.rpush(inbox_queue, url)
logger.info(f"Re-queued failed URL '{url}' to inbox '{inbox_queue}' for retry.")
else:
@ -190,8 +259,9 @@ def handle_failure(**context):
if exception:
raise exception
else:
# If for some reason there's no exception, fail explicitly. # If we got details from XCom, we don't have the original exception object.
raise AirflowException("Failing task as per stop_on_failure=True, but original exception was not found.") # So, we raise a new AirflowException with the details we have.
raise AirflowException(f"Failing task as per stop_on_failure=True. Upstream error: [{error_type}] {error_message}")
# --- YtdlpOpsOperator ---
@ -232,12 +302,11 @@ class YtdlpOpsOperator(BaseOperator):
transport = None
ti = context['task_instance']
try: # Define connection parameters outside the try block to be available in except blocks
params = context['params']
url = params.get('url')
if not url:
raise AirflowException("DAG was triggered without a 'url' in its configuration.")
logger.info(f"Processing URL from DAG run config: {url}")
service_ip = self.render_template(self.service_ip, context)
service_port_rendered = self.render_template(self.service_port, context)
@ -248,6 +317,7 @@ class YtdlpOpsOperator(BaseOperator):
host = params.get('service_ip', service_ip)
port_str = params.get('service_port', service_port_rendered)
account_id = params.get('account_id', account_id)
clients = params.get('clients')
logger.info(f"Using direct connection settings: service_ip={host}, service_port={port_str}")
@ -264,6 +334,8 @@ class YtdlpOpsOperator(BaseOperator):
except (ValueError, TypeError): except (ValueError, TypeError):
timeout = DEFAULT_TIMEOUT timeout = DEFAULT_TIMEOUT
try:
logger.info(f"Processing URL from DAG run config: {url}")
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)
socket_conn.setTimeout(timeout * 1000) socket_conn.setTimeout(timeout * 1000)
transport = TTransport.TFramedTransport(socket_conn) transport = TTransport.TFramedTransport(socket_conn)
@ -278,7 +350,8 @@ class YtdlpOpsOperator(BaseOperator):
token_data = client.getOrRefreshToken( token_data = client.getOrRefreshToken(
accountId=account_id, accountId=account_id,
updateType=TokenUpdateMode.AUTO, updateType=TokenUpdateMode.AUTO,
url=url url=url,
clients=clients
) )
logger.info("Successfully retrieved token data from service.") logger.info("Successfully retrieved token data from service.")
@ -302,9 +375,54 @@ class YtdlpOpsOperator(BaseOperator):
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None) ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
ti.xcom_push(key='ytdlp_command', value=ytdlp_cmd) ti.xcom_push(key='ytdlp_command', value=ytdlp_cmd)
except (PBServiceException, TTransportException) as e:
# Enhanced logging to make failures clear in Airflow logs.
logger.error(f"Thrift call failed for URL '{url}' with account '{account_id}'.")
logger.error(f"Exception Type: {type(e).__name__}")
logger.error(f"Exception Message: {getattr(e, 'message', str(e))}")
if isinstance(e, PBServiceException):
logger.error(f"Service Error Code: {getattr(e, 'errorCode', 'N/A')}")
if hasattr(e, 'context') and e.context:
logger.error(f"Service Context: {e.context}")
# Use exc_info=True to get the full traceback in the logs
logger.error("Full exception traceback:", exc_info=True)
# Push exception details to XCom for the failure handler
error_details = {
'error_message': getattr(e, 'message', str(e)),
'error_type': type(e).__name__,
'traceback': traceback.format_exc()
}
ti.xcom_push(key='error_details', value=error_details)
proxy_to_ban = None
if isinstance(e, PBServiceException) and hasattr(e, 'context') and e.context:
# Assuming server adds 'proxy_url' to context on failure
proxy_to_ban = e.context.get('proxy_url')
bannable_error_codes = [
"BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED",
"SOCKS5_CONNECTION_FAILED", "CLIENT_TIMEOUT", "GLOBAL_TIMEOUT"
]
# Use getattr so transport errors without an errorCode attribute cannot raise here.
if getattr(e, 'errorCode', None) not in bannable_error_codes:
proxy_to_ban = None
if proxy_to_ban:
logger.info(f"Found proxy to ban: {proxy_to_ban}. Pushing to XCom for 'mark_proxy_banned' task.")
ti.xcom_push(key='proxy_to_ban', value=proxy_to_ban)
ti.xcom_push(key='server_identity_for_ban', value=account_id)
ti.xcom_push(key='service_host_for_ban', value=host)
ti.xcom_push(key='service_port_for_ban', value=port)
else:
logger.info("No specific proxy to ban based on the error context.")
# Push None explicitly so the downstream task knows not to run
ti.xcom_push(key='proxy_to_ban', value=None)
# Re-raise the original exception to fail the Airflow task
raise e
except Exception as e: except Exception as e:
logger.error(f"YtdlpOpsOperator (Worker) failed: {e}", exc_info=True) logger.error(f"YtdlpOpsOperator (Worker) failed with an unexpected exception: {e}", exc_info=True)
raise AirflowException(f"Task failed: {e}") raise AirflowException(f"Task failed with unexpected error: {e}")
finally: finally:
if transport and transport.isOpen(): if transport and transport.isOpen():
transport.close() transport.close()
@ -344,13 +462,13 @@ default_args = {
'depends_on_past': False, 'depends_on_past': False,
'email_on_failure': False, 'email_on_failure': False,
'email_on_retry': False, 'email_on_retry': False,
'retries': 1, 'retries': 0,
'retry_delay': timedelta(minutes=1), 'retry_delay': timedelta(minutes=1),
'start_date': days_ago(1), 'start_date': days_ago(1),
} }
with DAG( with DAG(
dag_id='ytdlp_worker_per_url', dag_id='ytdlp_ops_worker_per_url',
default_args=default_args, default_args=default_args,
schedule_interval=None, schedule_interval=None,
catchup=False, catchup=False,
@ -366,12 +484,16 @@ with DAG(
'service_ip': Param('89.253.221.173', type="string", description="Service IP."), 'service_ip': Param('89.253.221.173', type="string", description="Service IP."),
'service_port': Param(9090, type="integer", description="Service port."), 'service_port': Param(9090, type="integer", description="Service port."),
'account_id': Param('default_account', type="string", description="Account ID for the API call."), 'account_id': Param('default_account', type="string", description="Account ID for the API call."),
'clients': Param('ios', type="string", description="Comma-separated list of clients to use for token generation (e.g., 'ios,android,mweb')."),
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."), 'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string", description="yt-dlp format selection string."), 'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string", description="yt-dlp format selection string."),
'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string", description="yt-dlp output filename template."), 'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string", description="yt-dlp output filename template."),
'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json."), 'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json."),
'requeue_on_failure': Param(False, type="boolean", description="If True, re-adds the URL to the inbox on failure instead of moving to the fail hash."), 'requeue_on_failure': Param(False, type="boolean", description="If True, re-adds the URL to the inbox on failure instead of moving to the fail hash."),
'stop_on_failure': Param(True, type="boolean", description="If True, a worker failure will stop the entire processing loop."), 'stop_on_failure': Param(True, type="boolean", description="If True, a worker failure will stop the entire processing loop."),
'retry_on_probe_failure': Param(False, type="boolean", description="If True, attempts to re-download and probe a file if the initial probe fails."),
'requeue_on_bannable_error': Param(False, type="boolean", description="If True, re-queues the URL if a bannable error (proxy, bot detection) occurs."),
'requeue_on_ffprobe_failure': Param(False, type="boolean", description="If True, re-queues the URL if the ffmpeg/ffprobe check fails."),
} }
) as dag: ) as dag:
@ -382,12 +504,13 @@ with DAG(
account_id="{{ params.account_id }}", account_id="{{ params.account_id }}",
timeout="{{ params.timeout }}", timeout="{{ params.timeout }}",
info_json_dir="{{ params.info_json_dir }}", info_json_dir="{{ params.info_json_dir }}",
retries=0,
) )
download_video = BashOperator( download_and_probe = BashOperator(
task_id='download_video', task_id='download_and_probe',
bash_command=""" bash_command="""
set -e
INFO_JSON_PATH="{{ ti.xcom_pull(task_ids='get_token', key='info_json_path') }}" INFO_JSON_PATH="{{ ti.xcom_pull(task_ids='get_token', key='info_json_path') }}"
PROXY="{{ ti.xcom_pull(task_ids='get_token', key='socks_proxy') }}" PROXY="{{ ti.xcom_pull(task_ids='get_token', key='socks_proxy') }}"
FORMAT="{{ params.download_format }}" FORMAT="{{ params.download_format }}"
@ -395,7 +518,7 @@ with DAG(
FILENAME_TEMPLATE="{{ params.output_path_template }}" FILENAME_TEMPLATE="{{ params.output_path_template }}"
FULL_OUTPUT_PATH="$DOWNLOAD_DIR/$FILENAME_TEMPLATE" FULL_OUTPUT_PATH="$DOWNLOAD_DIR/$FILENAME_TEMPLATE"
echo "Starting download..." echo "--- Starting Download Step ---"
echo "Info JSON Path: $INFO_JSON_PATH" echo "Info JSON Path: $INFO_JSON_PATH"
echo "Proxy: $PROXY" echo "Proxy: $PROXY"
echo "Format: $FORMAT" echo "Format: $FORMAT"
@ -412,10 +535,9 @@ with DAG(
CMD_ARRAY+=(--proxy "$PROXY") CMD_ARRAY+=(--proxy "$PROXY")
fi fi
CMD_ARRAY+=(-f "$FORMAT" -o "$FULL_OUTPUT_PATH" --print filename) CMD_ARRAY+=(-f "$FORMAT" -o "$FULL_OUTPUT_PATH" --print filename)
CMD_ARRAY+=(--no-progress --no-simulate --no-write-info-json --ignore-errors --no-playlist) CMD_ARRAY+=(--continue --no-progress --no-simulate --no-write-info-json --ignore-errors --no-playlist)
printf "Executing: %q " "${CMD_ARRAY[@]}" echo "Executing: $(printf "%q " "${CMD_ARRAY[@]}")"
echo ""
EXIT_CODE=0
FINAL_FILENAME=$("${CMD_ARRAY[@]}") || EXIT_CODE=$?  # capture the exit code explicitly; otherwise 'set -e' aborts before the checks below run
@ -430,17 +552,64 @@ with DAG(
echo "Error: Download failed or did not produce a file." echo "Error: Download failed or did not produce a file."
exit 1 exit 1
fi fi
echo "SUCCESS: Final file confirmed at: $FINAL_FILENAME" echo "SUCCESS: Download complete. Final file at: $FINAL_FILENAME"
echo "--- Starting Probe Step ---"
echo "Probing downloaded file: $FINAL_FILENAME"
if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
echo "Error: ffmpeg probe check failed for '$FINAL_FILENAME'. The file might be corrupt."
if [ "{{ params.retry_on_probe_failure }}" == "True" ]; then
echo "Attempting one retry on probe failure..."
echo "Renaming to .part to attempt resuming download."
mv -f "$FINAL_FILENAME" "$FINAL_FILENAME.part"
# Re-run download command
echo "Re-executing: $(printf "%q " "${CMD_ARRAY[@]}")"
EXIT_CODE=0
FINAL_FILENAME=$("${CMD_ARRAY[@]}") || EXIT_CODE=$?  # as above, keep 'set -e' from aborting before the retry checks run
echo "yt-dlp retry exited with code: $EXIT_CODE"
if [ $EXIT_CODE -ne 0 ]; then
echo "Error: yt-dlp retry command failed."
exit $EXIT_CODE
fi
if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then
echo "Error: Retry download failed or did not produce a file."
exit 1
fi
echo "SUCCESS: Retry download complete. Final file at: $FINAL_FILENAME"
# Re-probe
echo "Probing redownloaded file: $FINAL_FILENAME"
if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
echo "Error: ffmpeg probe check failed again for '$FINAL_FILENAME'. Failing with exit code 2."
exit 2
fi
else
echo "Failing with exit code 2 due to probe failure (retries disabled)."
exit 2
fi
fi
echo "SUCCESS: Probe confirmed valid media file."
# Push the final filename for the success_task
echo "$FINAL_FILENAME" echo "$FINAL_FILENAME"
""", """,
retries=3, retries=0, # Retries are now handled inside the script based on a DAG param
retry_delay=timedelta(minutes=2), retry_delay=timedelta(minutes=1),
)
mark_proxy_banned = PythonOperator(
task_id='mark_proxy_banned',
python_callable=mark_proxy_banned_callable,
trigger_rule='one_failed', # Run only if get_token fails
) )
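# Illustrative sketch (assumption): the real mark_proxy_banned_callable is defined
# elsewhere in this module and is not shown in this hunk. A minimal version that
# consumes the XComs pushed by get_token above could look like the following; the
# Redis key 'ytdlp:banned_proxies' and the 'redis_default' connection id are
# placeholder assumptions, not confirmed by this file.
def _mark_proxy_banned_sketch(**context):
    ti = context['ti']
    proxy = ti.xcom_pull(task_ids='get_token', key='proxy_to_ban')
    if not proxy:
        # get_token pushes proxy_to_ban=None when the failure was not proxy-related.
        logger.info("No proxy_to_ban found in XCom; skipping ban step.")
        return
    identity = ti.xcom_pull(task_ids='get_token', key='server_identity_for_ban')
    client = _get_redis_client('redis_default')
    # Record the banned proxy so other components can avoid selecting it.
    client.sadd("ytdlp:banned_proxies", proxy)
    logger.info(f"Marked proxy '{proxy}' (server identity: {identity}) as banned.")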
# This task triggers the sensor DAG to check for more work as soon as this worker is done. # This task triggers the sensor DAG to check for more work as soon as this worker is done.
trigger_sensor_for_next_batch = TriggerDagRunOperator( trigger_sensor_for_next_batch = TriggerDagRunOperator(
task_id='trigger_sensor_for_next_batch', task_id='trigger_sensor_for_next_batch',
trigger_dag_id='ytdlp_sensor_redis_queue', trigger_dag_id='ytdlp_ops_sensor_queue',
# Pass only the sensor's needed parameters back to it. # Pass only the sensor's needed parameters back to it.
# These values were originally passed from the sensor to this worker. # These values were originally passed from the sensor to this worker.
# The values are templated and will be passed as strings to the triggered DAG. # The values are templated and will be passed as strings to the triggered DAG.
@ -462,25 +631,25 @@ with DAG(
# Define success and failure handling tasks # Define success and failure handling tasks
success_task = PythonOperator( success_task = PythonOperator(
task_id='handle_success', task_id='mark_url_as_success',
python_callable=handle_success, python_callable=mark_url_as_success,
trigger_rule='all_success', # Run only if upstream tasks succeeded trigger_rule='all_success', # Run only if upstream tasks succeeded
) )
failure_task = PythonOperator( failure_task = PythonOperator(
task_id='handle_failure', task_id='mark_url_as_failed',
python_callable=handle_failure, python_callable=mark_url_as_failed,
trigger_rule='one_failed', # Run if any upstream task failed trigger_rule='one_failed', # Run if any upstream task failed
) )
# --- Define Task Dependencies --- # --- Define Task Dependencies ---
# The main processing flow # The main success flow
get_token >> download_video get_token >> download_and_probe >> success_task >> trigger_sensor_for_next_batch
# The success path: if download_video succeeds, run success_task, then trigger the next sensor run. # The failure path for get_token, which includes the explicit ban task
download_video >> success_task >> trigger_sensor_for_next_batch get_token >> mark_proxy_banned
# The failure path: if get_token OR download_video fails, run the failure_task. # The main failure handler, which listens to the primary tasks.
# This is a "fan-in" dependency. # If get_token or download_and_probe fails, it will trigger failure_task.
[get_token, download_video] >> failure_task [get_token, download_and_probe] >> failure_task
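# Example (illustrative): manually triggering this worker DAG from the Airflow CLI.
# The video URL is a placeholder; the remaining keys are DAG params declared above.
#
#   airflow dags trigger ytdlp_ops_worker_per_url \
#     --conf '{"url": "https://www.youtube.com/watch?v=<VIDEO_ID>",
#              "clients": "ios,android",
#              "requeue_on_bannable_error": true,
#              "requeue_on_ffprobe_failure": true}'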

View File

@ -1,966 +0,0 @@
"""
DAG to deploy and manage YTDLP token service.
This DAG handles the deployment, monitoring, and cleanup of a YTDLP token service
for a given account. It supports both Redis-based service discovery and direct
connection via manually specified host and port.
Configuration Options:
- account_id: (Required) The account ID for which the service is being deployed.
- proxy: (Optional) The proxy to use for the service.
- redis_enabled: (Optional, default=False) Whether to use Redis for service discovery.
  If False, `host` and `port` may be provided manually; otherwise they are auto-assigned.
- host: (Optional) The host IP of the service. Defaults to '0.0.0.0' if `redis_enabled=False` and not provided.
- port: (Optional) The port of the service. A free port is assigned automatically if not provided.
Usage:
1. Redis-based service discovery:
- Set `redis_enabled=True`.
- Ensure Redis is configured in Airflow connections.
- The DAG will automatically discover the service IP and port from Redis.
2. Manual host and port:
- Set `redis_enabled=False`.
- Provide `host` and `port` manually in the DAG configuration.
- Example: {"host": "192.168.1.100", "port": 9090}.
Example Trigger Configuration:
{
"account_id": "test_account",
"proxy": "socks5://proxy.example.com:1080",
"redis_enabled": False,
"host": "192.168.1.100",
"port": 9090
}
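Example CLI Trigger (illustrative; values are placeholders):
    airflow dags trigger ytdlp_service \
        --conf '{"account_id": "test_account", "redis_enabled": false, "host": "192.168.1.100", "port": 9090}'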
"""
from airflow import DAG
from airflow.models.param import Param
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import PythonOperator
# HttpSensor is no longer used
# from airflow.providers.http.sensors.http import HttpSensor
from airflow.utils.trigger_rule import TriggerRule
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from typing import Sequence # Add Sequence for type hinting
from datetime import datetime, timedelta
from airflow.utils.dates import days_ago # Add this import
import uuid
import os
import logging
import shutil
import docker
import redis
import requests
import socket
import time
import sys # Import sys for maxsize
from airflow.configuration import conf # Import conf
# Import and apply Thrift exceptions patch
try:
# Always apply the patch, regardless of environment
from thrift_exceptions_patch import patch_thrift_exceptions
patch_thrift_exceptions()
logging.info("Applied Thrift exceptions patch for Airflow compatibility")
# Verify the patch was applied correctly
try:
from pangramia.yt.exceptions.ttypes import PBServiceException
test_exception = PBServiceException(message="Test")
# Try to modify attributes to verify patch works
test_exception.args = ("Test",)
test_exception.message = "Modified test"
logging.info("Verified Thrift exception patch is working correctly")
except Exception as verify_error:
logging.error(f"Thrift exception patch verification failed: {verify_error}")
logging.error("This may cause 'immutable instance' errors during error handling")
except ImportError as e:
logging.warning(f"Could not import thrift_exceptions_patch: {e}")
logging.warning("Airflow compatibility will be affected - expect 'immutable instance' errors")
except Exception as e:
logging.error(f"Error applying Thrift exceptions patch: {e}")
# Default arguments for the DAG
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0, # Disable retries for all tasks in this DAG
'retry_delay': timedelta(minutes=5),
# Removed 'queue': 'auth_queue' to use the default queue
# Optional: Further filter workers by tags if using CeleryExecutor
'executor_config': {"CeleryExecutor": {"tags": ["auth_node"]}},
}
def get_redis_connection(redis_host=None, redis_port=None):
"""Get a Redis connection using Airflow's Redis connection or manually specified host/port."""
if redis_host and redis_port:
# Use manually specified host and port
return redis.Redis(
host=redis_host,
port=redis_port,
db=0,
decode_responses=True
)
else:
# Use Airflow's Redis connection
redis_conn = BaseHook.get_connection("redis_default")
# Use the password from the connection if available, otherwise use 'airflow' as default
password = redis_conn.password or 'airflow'
return redis.Redis(
host=redis_conn.host, # 'redis' (service name in docker-compose)
port=redis_conn.port, # 6379
password=password,
db=0,
decode_responses=True
)
def get_free_port():
"""Find and return a free port."""
import socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('0.0.0.0', 0))
return s.getsockname()[1]
def is_port_free(p):
"""Check if a port is free to use."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(('0.0.0.0', p))
return True
except OSError:
return False
def store_account_metadata(account_id, ip, port, proxy=None, health_port=None, container_id=None):
"""Store account metadata in Redis."""
redis_client = get_redis_connection()
try:
# Verify Redis connection
if not redis_client.ping():
raise ConnectionError("Failed to connect to Redis")
# Store main account metadata
mapping = {
"ip": ip,
"port": str(port),
"status": "running",
"start_time": str(time.time())
}
if proxy:
mapping["proxy"] = proxy
if health_port:
mapping["health_port"] = str(health_port)
if container_id:
mapping["container_id"] = container_id
# Use pipeline for atomic operations
with redis_client.pipeline() as pipe:
# Store main metadata
pipe.hset(f"ytdlp:{account_id}", mapping=mapping)
# Set expiration (1 week)
pipe.expire(f"ytdlp:{account_id}", 604800)
# Add to account list
pipe.sadd("ytdlp:accounts", account_id)
# Execute all commands
results = pipe.execute()
# Verify all commands succeeded
if not all(results):
raise RuntimeError(f"Failed to store metadata for {account_id}. Pipeline results: {results}")
# Verify the data was actually stored
stored_data = redis_client.hgetall(f"ytdlp:{account_id}")
if not stored_data:
raise RuntimeError(f"Failed to verify stored data for {account_id}")
logging.info(f"Successfully stored account metadata for {account_id} in Redis: {stored_data}")
return True
except Exception as e:
logging.error(f"Failed to store account metadata for {account_id}: {e}", exc_info=True)
# Attempt cleanup if storage failed
try:
redis_client = get_redis_connection() # Ensure client is available
redis_client.delete(f"ytdlp:{account_id}")
redis_client.srem("ytdlp:accounts", account_id)
except Exception as cleanup_error:
logging.error(f"Failed to cleanup failed storage for {account_id}: {cleanup_error}")
raise
# Removed get_account_metadata function as the service now handles Redis registration checks.
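# Illustrative sketch (assumption, not part of the original file): a client doing
# Redis-based discovery would read back the schema written by store_account_metadata()
# above, for example:
def _lookup_account_service(account_id):
    """Return the (ip, port) registered for an account, or None if not running."""
    redis_client = get_redis_connection()
    data = redis_client.hgetall(f"ytdlp:{account_id}")
    if not data or data.get("status") != "running":
        return None
    return data.get("ip"), int(data["port"])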
def prepare_and_deploy_service(**context):
"""Prepare deployment and deploy the Docker service."""
# Retrieve account_id, proxy, clients, and other parameters from DAG run configuration (conf)
# Set default values for account_id, proxy, and redis_enabled
account_id = context['dag_run'].conf.get('account_id') or context['params'].get('account_id', 'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09')
proxy = context['dag_run'].conf.get('proxy') or context['params'].get('proxy', 'socks5://sslocal-rust-1084:1084')
clients = context['dag_run'].conf.get('clients') or context['params'].get('clients', 'ios,android,mweb')
redis_enabled = context['dag_run'].conf.get('redis_enabled', False) # Default to False
host_param = context['dag_run'].conf.get('host') # Host parameter from config
port_param = context['dag_run'].conf.get('port') # Port parameter from config
docker_network = context['dag_run'].conf.get('docker_network') or context['params'].get('docker_network', 'airflow_prod_proxynet')
host_external_ip_env = os.getenv('HOST_EXTERNAL_IP') # Explicit external IP from environment
if not account_id:
raise ValueError("Account ID is missing.")
# --- Port Determination ---
# Assign a free port if not provided, or validate the provided one
if not port_param:
port = get_free_port()
if not is_port_free(port):
raise ValueError(f"Assigned port {port} is already in use")
logging.info(f"No port provided, assigned free port: {port}")
else:
port = int(port_param)
if not is_port_free(port):
raise ValueError(f"Provided port {port} is already in use")
logging.info(f"Using provided port: {port}")
# Determine health port
health_port = port + 1
if not is_port_free(health_port):
raise ValueError(f"Health port {health_port} (derived from port {port}) is already in use")
logging.info(f"Using health port: {health_port}")
# --- Host Determination ---
# host_for_registration: IP/Host for client discovery (Redis/Logs)
# host_for_sensor: Hostname/IP for Airflow HttpSensor health check
host_for_registration = host_param # Start with the parameter value
if redis_enabled:
# If Redis is enabled, registration host should ideally be externally reachable
if not host_for_registration:
host_for_registration = host_external_ip_env # Use external IP from env var if available
if not host_for_registration:
# If no env var, try fetching external IP using requests
try:
logging.info("HOST_EXTERNAL_IP not set. Attempting to fetch external IP from api.ipify.org...")
response = requests.get('https://api.ipify.org', timeout=10) # 10 second timeout
response.raise_for_status() # Raise exception for bad status codes
host_for_registration = response.text.strip()
if not host_for_registration: # Check if response was empty
raise ValueError("Received empty response from api.ipify.org")
logging.info(f"Successfully fetched external IP: {host_for_registration}")
except requests.exceptions.RequestException as e:
logging.warning(f"Failed to fetch external IP: {e}. Falling back to Docker bridge IP.")
# Fallback to default Docker bridge IP if fetching fails
host_for_registration = "172.17.0.1"
logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
except Exception as e:
logging.error(f"Unexpected error fetching external IP: {e}. Falling back to Docker bridge IP.")
host_for_registration = "172.17.0.1"
logging.warning(f"Defaulting registration host to Docker bridge IP: {host_for_registration}. Ensure clients can reach this IP.")
else:
logging.info(f"Redis enabled. Using HOST_EXTERNAL_IP environment variable for registration: {host_for_registration}")
else:
logging.info(f"Redis enabled. Using provided host parameter for registration: {host_for_registration}")
else: # Redis disabled
# If Redis is disabled, registration host defaults to 0.0.0.0 if not provided
if not host_for_registration:
host_for_registration = "0.0.0.0"
logging.warning(f"Redis disabled and no host param provided. Defaulting registration host to {host_for_registration}.")
else:
logging.info(f"Redis disabled. Using provided host parameter for registration: {host_for_registration}")
# host_for_sensor determination will happen *after* container creation, using container name.
logging.info(f"Preparing deployment for account {account_id}. Registration Host: {host_for_registration}, Port: {port}, Health Port: {health_port}")
# Generate unique work ID and context directory
work_id = str(uuid.uuid4())
context['task_instance'].xcom_push(key='work_id', value=work_id)
context_dir = os.path.join(os.getenv('AIRFLOW_HOME', '/tmp'), 'service-data', work_id, 'context-data')
os.makedirs(context_dir, exist_ok=True, mode=0o777)
os.chmod(context_dir, 0o777)
# Push context directory and account details to XCom
context['task_instance'].xcom_push(key='context_dir', value=context_dir)
context['task_instance'].xcom_push(key='account_id', value=account_id)
# Deploy the Docker service
# The 'host_for_registration' variable here represents the externally accessible IP for registration/XCom.
# The service inside the container will listen on 0.0.0.0.
logging.info(f"Deploying service for account {account_id}. Registration Host: {host_for_registration}, Port: {port}")
# Get Redis connection details ONLY if redis_enabled (for the container to register itself)
redis_host_for_container = ''
redis_port_for_container = ''
if redis_enabled:
try:
# Get connection details to pass to the container environment
redis_conn_details = get_redis_connection().connection_pool.connection_kwargs
redis_host_for_container = os.getenv('REDIS_HOST', redis_conn_details.get('host', 'redis'))
redis_port_for_container = str(os.getenv('REDIS_PORT', redis_conn_details.get('port', 6379)))
logging.info(f"Redis enabled. Passing REDIS_HOST={redis_host_for_container}, REDIS_PORT={redis_port_for_container} to container.")
except Exception as e:
logging.error(f"Failed to get Redis connection details for container environment: {e}")
logging.warning("Proceeding without Redis details in container environment due to error.")
# Depending on container requirements, you might want to raise an error here instead
else:
logging.info("Redis disabled. Not passing REDIS_HOST/REDIS_PORT to container environment.")
# Get Docker connection details from Airflow
try:
secrets_backend = conf.get('secrets', 'backend', fallback='None')
logging.info(f"Attempting to get 'docker_hub' connection. Configured secrets backend: {secrets_backend}")
docker_conn = BaseHook.get_connection("docker_hub")
docker_username = docker_conn.login
docker_password = docker_conn.password
logging.info("Successfully retrieved 'docker_hub' connection.")
except Exception as e:
logging.error(f"Failed to retrieve 'docker_hub' connection: {e}")
# Log details about potential secrets backend issues
secrets_backend_kwargs = conf.get('secrets', 'backend_kwargs', fallback='{}')
logging.error(f"Secrets backend details: backend={secrets_backend}, kwargs={secrets_backend_kwargs}")
# Re-raise the exception to fail the task
raise
try:
# Initialize Docker client to connect to docker-socket-proxy
client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
# Authenticate with Docker Hub
client.login(
username=docker_username,
password=docker_password,
registry=docker_conn.host # Typically "https://index.docker.io/v1/"
)
# Generate a unique container name
container_name = f"ytdlp_service_{account_id}_{uuid.uuid4().hex[:8]}"
# Pull the Docker image (if not already present)
client.images.pull('pangramia/ytdlp-ops-server:latest')
# Use the configured network name (from params or default)
network_name = docker_network # Use the retrieved parameter
logging.info(f"Attempting to run container on network: {network_name}")
# Determine if --probe flag should be added based on DAG param
exit_on_proxy_fail = context['dag_run'].conf.get('exit_on_proxy_fail', True) # Default to True if not set
command_args = [
'--script-dir', '/app/scripts',
'--context-dir', '/app/context-data', # Use the bind mount target inside container
'--port', str(port),
'--health-port', str(health_port),
'--clients', clients,
'--timeout', '120',
'--proxy', proxy if proxy else ''
]
if exit_on_proxy_fail:
command_args.append('--probe')
logging.info("Adding --probe flag to container command as exit_on_proxy_fail=True")
else:
logging.info("Not adding --probe flag to container command as exit_on_proxy_fail=False")
# Run the Docker container with health port
container = client.containers.run(
image='pangramia/ytdlp-ops-server:latest',
command=command_args, # Use the constructed command list
environment={
'PYTHONUNBUFFERED': '1', # Ensure logs are not buffered
'SERVER_PORT': str(port), # Port the service listens on *inside* the container
'SERVER_HOST': '0.0.0.0', # Service should listen on all interfaces *inside* the container
'ACCOUNT_ID': account_id,
# Pass Redis details *if enabled* for the service to register itself
'REDIS_HOST': redis_host_for_container,
'REDIS_PORT': redis_port_for_container,
# Pass PROXY_URL for health check access
'PROXY_URL': proxy if proxy else '',
},
ports={
f"{port}/tcp": port,
f"{health_port}/tcp": health_port
},
volumes={
context_dir: {'bind': '/app/context-data', 'mode': 'rw'}
},
network_mode=network_name, # Use the specified network variable
auto_remove=False, # Do not auto-remove the container
name=container_name, # Use a unique name
detach=True,
tty=True,
shm_size='256m',
# Updated healthcheck to test external connectivity via proxy
healthcheck={
# Use CMD-SHELL to allow conditional logic based on PROXY_URL env var
'test': [
'CMD-SHELL',
# Script checks if PROXY_URL is set, uses it with curl if yes, otherwise curls directly.
# -f: Fail silently (exit non-zero on error)
# --connect-timeout 10: Timeout for connection phase
# > /dev/null: Discard output, we only care about exit code
'if [ -n "$PROXY_URL" ]; then '
'curl -f --connect-timeout 10 -x "$PROXY_URL" https://ifconfig.co > /dev/null; '
'else '
'curl -f --connect-timeout 10 https://ifconfig.co > /dev/null; '
'fi'
],
'interval': 30 * 1000000000, # Check every 30 seconds (30 * 1e9 nanoseconds)
'timeout': 15 * 1000000000, # Timeout after 15 seconds (15 * 1e9 nanoseconds)
'retries': 5, # Retry 5 times on failure
'start_period': 15 * 1000000000 # Grace period of 15 seconds after start
},
# Add labels for better identification
labels={
'service': 'ytdlp',
'account_id': account_id
}
)
# Wait for container to be running (skip health check verification)
start_time = time.time()
while True:
container.reload()
if container.status == 'running':
break
if time.time() - start_time > 10: # 10 second timeout
raise TimeoutError("Container failed to start within 10 seconds")
time.sleep(1)
logging.info(f"Container started: {container.id} (health check verification skipped)")
# Push container details immediately after creation using simplified keys
context['task_instance'].xcom_push(key='container_id', value=container.id)
context['task_instance'].xcom_push(key='container_name', value=container_name)
logging.info(f"Pushed container_id={container.id} and container_name={container_name} to XCom.")
# --- Determine Host for Sensor ---
# Get the container's IP address on the specified network for the HttpSensor
try:
container.reload() # Refresh container attributes
network_settings = container.attrs.get('NetworkSettings', {}).get('Networks', {})
if network_name in network_settings:
host_for_sensor = network_settings[network_name].get('IPAddress')
if not host_for_sensor:
raise ValueError(f"Container {container.id} has no IPAddress on network '{network_name}'")
logging.info(f"Using container IP '{host_for_sensor}' on network '{network_name}' for HttpSensor.")
else:
# Fallback or error if container not on expected network
logging.error(f"Container {container.id} is not attached to the expected network '{network_name}'. Network settings: {network_settings}")
# Option 1: Fallback to container name (might fail as observed)
# host_for_sensor = container_name
# logging.warning(f"Falling back to container name '{host_for_sensor}' for sensor.")
# Option 2: Raise error
raise ValueError(f"Container {container.id} not found on network '{network_name}'. Cannot determine IP for sensor.")
except Exception as e:
logging.error(f"Failed to get container IP address: {e}", exc_info=True)
raise AirflowException(f"Failed to determine IP address for HttpSensor: {e}")
# Ensure we don't use 0.0.0.0 or empty string for the sensor
if not host_for_sensor or host_for_sensor == "0.0.0.0":
raise ValueError(f"Determined host_for_sensor is invalid ('{host_for_sensor}'). Check container network attachment and IP assignment.")
# --- Add extra logging before pushing ---
logging.info(f"FINAL CHECK before XCom push:")
logging.info(f" Account ID: {account_id}")
logging.info(f" Host for Sensor (IP Address): {host_for_sensor}")
logging.info(f" Host for Registration: {host_for_registration}")
logging.info(f" Service Port: {port}")
logging.info(f" Health Port: {health_port}")
logging.info(f" Pushing to XCom key: service_host with value: {host_for_sensor}")
# --- End extra logging ---
# Push distinct service connection details using simplified keys
context['task_instance'].xcom_push(key='service_host_registration', value=host_for_registration) # For client discovery (e.g., Redis)
context['task_instance'].xcom_push(key='service_host', value=host_for_sensor) # IP Address for HttpSensor
context['task_instance'].xcom_push(key='service_port', value=port) # Port is the same
context['task_instance'].xcom_push(key='service_health_port', value=health_port) # Health port is the same
logging.info(f"Pushed host_for_sensor (IP Address)={host_for_sensor} to XCom key 'service_host'")
logging.info(f"Pushed host_for_registration={host_for_registration} to XCom key 'service_host_registration'")
# Store account metadata in Redis only if redis_enabled is True
# This uses the 'host_for_registration' for client discovery
if redis_enabled:
store_account_metadata(account_id, host_for_registration, port, proxy, health_port, container.id)
# If we reach here, deployment is considered successful for now
logging.info("Deployment preparation successful.")
# Return values are implicitly pushed to XCom (but we pushed explicitly above)
return context_dir, host_for_registration, port
except Exception as e:
logging.error(f"Error during service deployment: {e}", exc_info=True)
# Attempt to cleanup the container if it was created before the error
try:
if 'container' in locals() and container and container.id:
logging.warning(f"Attempting to stop and remove container {container.id} due to deployment error.")
container.stop(timeout=5)
container.remove(force=True)
logging.info(f"Successfully stopped and removed container {container.id} after error.")
elif 'container_name' in locals() and container_name:
# Try finding by name if ID wasn't captured
containers = client.containers.list(filters={'name': container_name})
if containers:
logging.warning(f"Attempting to stop and remove container {containers[0].name} by name due to deployment error.")
containers[0].stop(timeout=5)
containers[0].remove(force=True)
logging.info(f"Successfully stopped and removed container {containers[0].name} after error.")
except Exception as cleanup_err:
logging.error(f"Failed during post-error container cleanup: {cleanup_err}")
raise # Re-raise the original exception to fail the task
# Removed the old monitor_health PythonOperator
# stop_service and cleanup_service are now defined directly in the DAG below.
def check_service_health(ti=None, **context):
"""
Periodically checks the service's /health endpoint using requests.
Acts as a long-running sentinel task. Fails if the health check fails
repeatedly or times out.
"""
# Get parameters from XCom
host_reg = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host_registration')
host_svc = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_host')
health_port = ti.xcom_pull(task_ids='prepare_and_deploy', key='service_health_port')
# Determine the host to use (prioritize registration host)
host = host_reg if host_reg and host_reg != '0.0.0.0' else host_svc
if not host or not health_port:
raise AirflowException("Could not retrieve host or health_port from XCom for health check.")
health_url = f"http://{host}:{health_port}/health"
logging.info(f"Starting health check for: {health_url}")
# Get configuration for polling
# Use task's execution_timeout if available, otherwise default to 1 year
task_timeout = ti.task.execution_timeout or timedelta(days=365)
poke_interval = 60 # Check every 60 seconds (adjust as needed)
start_time = time.monotonic()
timeout_seconds = task_timeout.total_seconds()
consecutive_error_start_time = None # Track start time of consecutive connection errors
error_retry_window = 10 # Seconds to retry connection errors before failing
while True:
current_time = time.monotonic()
if current_time - start_time > timeout_seconds:
raise AirflowException(f"Health check timed out after {timeout_seconds} seconds for {health_url}")
try:
# Use a reasonable timeout for the individual request
response = requests.get(health_url, timeout=15) # 15 second request timeout
response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
# Check response content if needed (optional)
# Example: Check for specific JSON content
# try:
# data = response.json()
# if data.get("status") == "healthy":
# logging.info(f"Health check successful: Status {response.status_code}")
# else:
# logging.warning(f"Health check OK (Status {response.status_code}), but content unexpected: {data}")
# except requests.exceptions.JSONDecodeError:
# logging.warning(f"Health check OK (Status {response.status_code}), but response is not valid JSON.")
# If we got a 2xx status, log success and reset error timer if needed
if consecutive_error_start_time is not None:
logging.info(f"Connection to {health_url} recovered.")
consecutive_error_start_time = None
logging.info(f"Health check successful: Status {response.status_code} for {health_url}")
except requests.exceptions.Timeout:
current_monotonic_time = time.monotonic()
if consecutive_error_start_time is None:
consecutive_error_start_time = current_monotonic_time
logging.warning(f"Health check request timed out for {health_url}. Starting {error_retry_window}s retry window...")
else:
elapsed_error_time = current_monotonic_time - consecutive_error_start_time
if elapsed_error_time > error_retry_window:
error_msg = f"Health check failed for {health_url}: Timeout persisted for over {error_retry_window} seconds."
logging.error(error_msg)
raise AirflowException(error_msg)
else:
logging.warning(f"Health check request timed out for {health_url}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")
except requests.exceptions.ConnectionError as e:
# Check if the error is specifically "Connection refused" - fail immediately
if "[Errno 111] Connection refused" in str(e):
logging.error(f"Health check failed for {health_url}: Connection refused. Failing task immediately.")
raise AirflowException(f"Health check failed for {health_url}: Connection refused")
else:
# Handle other connection errors with the retry window
current_monotonic_time = time.monotonic()
if consecutive_error_start_time is None:
consecutive_error_start_time = current_monotonic_time
logging.warning(f"Health check connection error for {health_url}: {e}. Starting {error_retry_window}s retry window...")
else:
elapsed_error_time = current_monotonic_time - consecutive_error_start_time
if elapsed_error_time > error_retry_window:
error_msg = f"Health check failed for {health_url}: Connection error persisted for over {error_retry_window} seconds. Last error: {e}"
logging.error(error_msg)
raise AirflowException(error_msg)
else:
logging.warning(f"Health check connection error for {health_url}: {e}. Retrying within {error_retry_window}s window ({elapsed_error_time:.1f}s elapsed)...")
except requests.exceptions.HTTPError as e:
# This catches 4xx/5xx errors - fail immediately
logging.error(f"Health check failed for {health_url}: Status {e.response.status_code}. Failing task.")
# Fail the task immediately on HTTP error
raise AirflowException(f"Health check failed for {health_url}: Status {e.response.status_code}")
except requests.exceptions.RequestException as e:
logging.error(f"Health check failed for {health_url} with unexpected error: {e}. Failing task.")
# Fail the task immediately on other request errors
raise AirflowException(f"Health check failed for {health_url}: {e}")
except Exception as e:
# Catch any other unexpected errors during the check
logging.error(f"Unexpected error during health check for {health_url}: {e}", exc_info=True)
raise AirflowException(f"Unexpected error during health check: {e}")
# Wait for the poke interval before the next check
time.sleep(poke_interval)
def _wait_forever():
"""Sleeps indefinitely (or until task timeout) to simulate a running service."""
logging.info("Sentinel task started. Sleeping in a loop...")
# Sleep in a loop with a reasonable interval to avoid OverflowError
# The task will keep running until it times out based on execution_timeout
# or is manually stopped/failed.
while True:
try:
# Sleep for a long interval (e.g., 1 day)
# You can adjust this interval if needed.
time.sleep(86400) # Sleep for 24 hours
except KeyboardInterrupt:
logging.info("Sentinel task interrupted. Exiting.")
break
except Exception as e:
# Log other potential errors during sleep, though unlikely
logging.error(f"Error during sentinel sleep loop: {e}")
# Optionally break or continue based on error handling strategy
break # Exit loop on unexpected error
def stop_service(**context):
"""Stop the running Docker container with verification."""
# Retrieve account_id from params or kwargs
account_id = context.get('params', {}).get('account_id') or context.get('account_id')
if not account_id:
raise ValueError("Account ID is missing.")
# Initialize Docker client to connect to docker-socket-proxy
client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
try:
# For testing, try to get container ID from environment if XCom is not available
container_id = None
if 'ti' in context:
# Use simplified XCom key
container_id = context['ti'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')
if not container_id:
# If not found in XCom, try to find container by account_id pattern (keep this fallback)
containers = client.containers.list(filters={"name": f"ytdlp_service_{account_id}"})
if containers:
container = containers[0]
container_id = container.id
logging.info(f"Found container by name pattern: {container.name} (ID: {container_id})")
else:
logging.warning(f"No container found for account {account_id} - nothing to stop")
return
if container_id:
# If found in XCom, stop by container ID
container = client.containers.get(container_id)
# Verify container is running before stopping
if container.status != 'running':
logging.warning(f"Container {container_id} is not running (status: {container.status})")
return
logging.info(f"Stopping container {container_id}...")
container.stop(timeout=10) # 10 second timeout
# Verify container is stopped
container.reload()
if container.status == 'exited':
logging.info(f"Successfully stopped container {container_id}")
else:
logging.error(f"Container {container_id} failed to stop (status: {container.status})")
raise RuntimeError(f"Container {container_id} failed to stop")
# Clear Redis entries only if redis_enabled is True
# Retrieve redis_enabled status from DAG run conf or params
redis_enabled = context['dag_run'].conf.get('redis_enabled', False) or context['params'].get('redis_enabled', False)
if redis_enabled:
redis_client = get_redis_connection()
try:
# Verify Redis connection
if not redis_client.ping():
raise ConnectionError("Failed to connect to Redis")
# Remove main metadata
redis_client.delete(f"ytdlp:{account_id}")
# Remove from accounts set
redis_client.srem("ytdlp:accounts", account_id)
logging.info(f"Successfully cleared Redis entries for account: {account_id}")
except Exception as e:
logging.error(f"Failed to clear Redis entries for account {account_id}: {e}")
# Do not raise here, allow container stop to be considered successful
# raise # Optional: re-raise if Redis cleanup failure should fail the task
return
logging.warning(f"No container found for account {account_id} - nothing to stop")
except docker.errors.NotFound as e:
logging.warning(f"Container for account {account_id} not found: {e}")
except Exception as e:
logging.error(f"Failed to stop container: {e}")
raise
def cleanup_service(**context):
"""Cleanup service resources including Redis entries and XCom data."""
# Note: This function is now called within the manual_stop_cleanup TaskGroup
try:
# Retrieve account_id from params first, then from XCom
account_id = context['params'].get('account_id')
if not account_id:
# Try to get it from XCom
account_id = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='account_id')
if not account_id:
logging.warning("Account ID not found in params or XCom - skipping resource cleanup")
return
# Redis cleanup (if redis_enabled=True) is handled in the 'stop_service' task.
logging.info(f"Redis cleanup for account {account_id} is handled by the 'stop_service' task if enabled.")
# Cleanup XCom data (using simplified keys where applicable)
# Note: XCom cleanup is generally not strictly necessary but can be good practice.
# Airflow manages XCom expiry. This code doesn't actually *delete* XComs.
# To truly delete, you'd use the Airflow API or DB directly.
# We'll leave the pull calls here as they don't harm anything.
ti = context['task_instance']
ti.xcom_pull(key='container_id', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='container_name', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_host_registration', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_host', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_port', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='service_health_port', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='work_id', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='context_dir', task_ids='prepare_and_deploy', include_prior_dates=True)
ti.xcom_pull(key='account_id', task_ids='prepare_and_deploy', include_prior_dates=True) # Keep account_id pull
logging.info(f"Pulled XCom data for potential cleanup logging for account: {account_id}")
# Initialize Docker client
client = docker.DockerClient(base_url='tcp://docker-socket-proxy:2375')
container_found_and_removed = False
# Attempt 1: Get container ID from XCom using simplified key
container_id_xcom = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='container_id')
if container_id_xcom:
logging.info(f"Attempting to remove container using XCom ID: {container_id_xcom}")
try:
container = client.containers.get(container_id_xcom)
logging.info(f"Found container {container.id} (Name: {container.name}). Removing...")
container.remove(force=True)
logging.info(f"Successfully removed container {container.id}")
container_found_and_removed = True
except docker.errors.NotFound:
logging.warning(f"Container with XCom ID {container_id_xcom} not found. Trying other methods.")
except Exception as e:
logging.error(f"Error removing container {container_id_xcom}: {e}")
# Attempt 2: Find container by labels if not found/removed via XCom ID
if not container_found_and_removed:
logging.info(f"Attempting to find and remove container by labels: service=ytdlp, account_id={account_id}")
try:
containers = client.containers.list(
filters={'label': [f'service=ytdlp', f'account_id={account_id}']},
all=True # Include stopped containers
)
if containers:
for container in containers:
logging.info(f"Found container {container.id} (Name: {container.name}) by labels. Removing...")
try:
container.remove(force=True)
logging.info(f"Successfully removed container {container.id}")
container_found_and_removed = True # Mark as found even if only one is removed
except Exception as e:
logging.error(f"Error removing container {container.id} found by labels: {e}")
else:
logging.info("No containers found matching labels.")
except Exception as e:
logging.error(f"Error searching for containers by labels: {e}")
# Attempt 3: Find container by name pattern if still not found/removed
if not container_found_and_removed:
container_name_pattern = f"ytdlp_service_{account_id}_*"
logging.info(f"Attempting to find and remove container by name pattern: {container_name_pattern}")
try:
containers = client.containers.list(filters={'name': container_name_pattern}, all=True)
if containers:
for container in containers:
logging.info(f"Found container {container.id} (Name: {container.name}) by name pattern. Removing...")
try:
container.remove(force=True)
logging.info(f"Successfully removed container {container.id}")
container_found_and_removed = True
except Exception as e:
logging.error(f"Error removing container {container.id} found by name: {e}")
else:
logging.info("No containers found matching name pattern.")
except Exception as e:
logging.error(f"Error searching for containers by name: {e}")
if not container_found_and_removed:
logging.warning(f"Could not find or remove any container for account {account_id} using ID, labels, or name.")
# Get context directory from XCom and remove it
context_dir = context['task_instance'].xcom_pull(task_ids='prepare_and_deploy', key='context_dir')
if context_dir and os.path.exists(context_dir):
shutil.rmtree(context_dir)
logging.info(f"Cleaned up working directory: {context_dir}")
except Exception as e:
logging.error(f"Error during cleanup: {e}")
raise
# Define the DAG
with DAG(
'ytdlp_service',
default_args=default_args,
description='Deploy YTDLP token service for ios, android, mweb',
schedule_interval=None,
start_date=days_ago(1), # Use dynamic start date for manually triggered DAG
catchup=False,
tags=['youtube', 'tokens', 'service', 'docker'],
# executor_config moved to default_args
is_paused_upon_creation=False,
params={
'account_id': Param(
'account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09',
type="string",
description="Required: The account ID for which the service is being deployed."
),
'proxy': Param(
'socks5://sslocal-rust-1084:1084',
type=["null", "string"],
description="Optional: The SOCKS5 proxy URL to use for the service (e.g., socks5://host:port)."
),
'clients': Param(
'ios,android,mweb',
type="string",
description="Comma-separated list of client types (e.g., ios,android,mweb)."
),
'redis_enabled': Param(
False,
type="boolean",
description="Use Redis for service discovery? If False, host/port must be provided or will be auto-assigned."
),
'host': Param(
None,
type=["null", "string"],
description="Optional: Host IP for the service. If redis_enabled=False and host is not provided, defaults to '0.0.0.0'. If redis_enabled=True and host is not provided, uses HOST_EXTERNAL_IP, then the external IP from api.ipify.org, falling back to the Docker bridge IP (172.17.0.1)."
),
'port': Param(
None,
type=["null", "integer"],
description="Optional: Port for the service. If None, a free port will be assigned automatically. If redis_enabled=False and a port is provided, it will be used (after checking availability)."
),
# redis_host and redis_port parameters are removed.
# If redis_enabled=True, the DAG will use the 'redis_default' Airflow connection.
'docker_network': Param(
'airflow_prod_proxynet',
type="string",
description="Optional: The Docker network to attach the container to. Defaults to 'airflow_prod_proxynet'."
),
'exit_on_proxy_fail': Param(
True,
type="boolean",
description="Exit the service container immediately if the initial proxy test fails?"
),
}
) as dag:
# Task to prepare and deploy the service
prepare_and_deploy = PythonOperator(
task_id='prepare_and_deploy',
python_callable=prepare_and_deploy_service,
provide_context=True,
trigger_rule='all_success' # Keep default trigger rule for prepare_and_deploy
)
# Combined Health Check and Sentinel Task using PythonOperator
# This task runs for a long time, checking health periodically using the 'requests' library.
# If the health check fails repeatedly or times out, the task fails, triggering 'stop_service'.
monitor_service_health = PythonOperator(
task_id='monitor_service_health',
python_callable=check_service_health,
provide_context=True,
# Set execution timeout for the task itself (acts as the overall timeout)
execution_timeout=timedelta(days=365), # Long timeout (e.g., 1 year)
# op_kwargs can pass static config, but host/port come from XCom inside the function
# poke_interval and request timeout are handled within check_service_health
)
monitor_service_health.doc_md = """
### Monitor Service Health Task (PythonOperator)
Uses a Python function to periodically check the service's `/health` endpoint using the `requests` library.
Acts as both a health check and a sentinel for the running service.
- **Pulls from XCom:** Reads `service_host_registration`, `service_host`, and `service_health_port` from the `prepare_and_deploy` task to construct the target URL.
- **Polling:** Checks the `/health` endpoint every 60 seconds.
- **Timeout:** Uses the task's `execution_timeout` (set to 1 year) as the overall maximum duration. Individual requests have a 15-second timeout.
- **Failure:** The task fails immediately on a 4xx/5xx response, a refused connection, or any unexpected request error; request timeouts and other connection errors are retried for a short window (10 seconds) before failing. If the overall `execution_timeout` is reached without a failure, the task would eventually time out and fail.
"""
# Task to stop the service (runs if monitor_service_health fails)
stop = PythonOperator(
task_id='stop_service',
python_callable=stop_service,
provide_context=True,
trigger_rule=TriggerRule.ONE_FAILED # Run only if monitor_service_health fails
)
stop.doc_md = """
### Stop Service Task
Stops the Docker container associated with the service.
- **Trigger Rule:** `one_failed` - This task only runs if the upstream `monitor_service_health` task fails.
- Pulls container ID/name from XCom or finds it using labels/name patterns.
- Clears Redis entries if `redis_enabled=True`.
"""
# Marker task to indicate that the deployment failed
prepare_failed_marker = EmptyOperator(
task_id='prepare_failed_marker',
trigger_rule=TriggerRule.ONE_FAILED # Run only if 'prepare_and_deploy' fails
)
# Task to cleanup resources (runs after stop sequence OR if prepare fails)
cleanup = PythonOperator(
task_id='cleanup_service',
python_callable=cleanup_service,
provide_context=True,
trigger_rule=TriggerRule.ALL_DONE # Run after upstream (stop or prepare_failed_marker) is done
)
cleanup.doc_md = """
### Cleanup Service Task
Removes the Docker container and cleans up related resources.
- **Trigger Rule:** `all_done` - Runs after the `stop_service` task finishes, whether it succeeded or failed.
- Removes the container using ID from XCom, labels, or name patterns.
- Cleans up XCom variables.
- Removes the context directory.
"""
# Define task dependencies
# Success Path: prepare -> monitor (runs indefinitely)
# Monitor Failure Path: monitor (fails) -> stop -> cleanup
# Prepare Failure Path: prepare (fails) -> prepare_failed_marker -> cleanup
prepare_and_deploy >> monitor_service_health
prepare_and_deploy >> prepare_failed_marker # Trigger marker if prepare fails
monitor_service_health >> stop # Trigger stop if monitor fails
# Cleanup is triggered after stop finishes OR after prepare_failed_marker finishes
stop >> cleanup
prepare_failed_marker >> cleanup

View File

@ -1,21 +1,21 @@
version: '3.8'
services: services:
camoufox: camoufox:
build: build:
context: ./camoufox # Path relative to the docker-compose file context: ./camoufox # Path relative to the docker-compose file
dockerfile: Dockerfile dockerfile: Dockerfile
args:
VNC_PASSWORD: ${VNC_PASSWORD:-supersecret} # Use environment variable or default
ports: ports:
# Optionally expose the camoufox port to the host for debugging # Optionally expose the camoufox port to the host for debugging
- "12345:12345" - "12345:12345"
- "5900:5900" # Expose VNC port to the host, still not working - "5900:5900" # Expose VNC port to the host
networks: networks:
- airflow_prod_proxynet - airflow_prod_proxynet
command: [ command: [
"--ws-host", "0.0.0.0", "--ws-host", "0.0.0.0",
"--port", "12345", "--port", "12345",
"--ws-path", "mypath", "--ws-path", "mypath",
"--proxy-url", "socks5://sslocal-rust-1082:1082", "--proxy-url", "socks5://sslocal-rust-1084:1084",
"--locale", "en-US", "--locale", "en-US",
"--geoip", "--geoip",
"--extensions", "/app/extensions/google_sign_in_popup_blocker-1.0.2.xpi,/app/extensions/spoof_timezone-0.3.4.xpi,/app/extensions/youtube_ad_auto_skipper-0.6.0.xpi" "--extensions", "/app/extensions/google_sign_in_popup_blocker-1.0.2.xpi,/app/extensions/spoof_timezone-0.3.4.xpi,/app/extensions/youtube_ad_auto_skipper-0.6.0.xpi"
@ -35,8 +35,6 @@ services:
networks: networks:
- airflow_prod_proxynet - airflow_prod_proxynet
command: command:
- "--script-dir"
- "/app/scripts"
- "--context-dir" - "--context-dir"
- "/app/context-data" - "/app/context-data"
- "--port" - "--port"
@ -44,8 +42,8 @@ services:
- "--clients" - "--clients"
# Add 'web' client since we now have camoufox, test firstly # Add 'web' client since we now have camoufox, test firstly
- "web,ios,android,mweb" - "web,ios,android,mweb"
- "--proxy" - "--proxies"
- "socks5://sslocal-rust-1082:1082" - "socks5://sslocal-rust-1081:1081,socks5://sslocal-rust-1082:1082,socks5://sslocal-rust-1083:1083,socks5://sslocal-rust-1084:1084,socks5://sslocal-rust-1085:1085"
# Add the endpoint argument pointing to the camoufox service # Add the endpoint argument pointing to the camoufox service
- "--endpoint" - "--endpoint"
- "ws://camoufox:12345/mypath" - "ws://camoufox:12345/mypath"
@ -54,6 +52,15 @@ services:
- "--camouflage-only" - "--camouflage-only"
# Add flag to print full tokens in logs by default # Add flag to print full tokens in logs by default
- "--print-tokens" - "--print-tokens"
# Add server identity and Redis connection details
- "--server-identity"
- "ytdlp-ops-airflow-service"
- "--redis-host"
- "${REDIS_HOST:-redis}"
- "--redis-port"
- "${REDIS_PORT:-6379}"
- "--redis-password"
- "${REDIS_PASSWORD}"
restart: unless-stopped restart: unless-stopped
pull_policy: always pull_policy: always

View File

@ -1,8 +1,11 @@
thrift>=0.16.0,<=0.20.0
python-dotenv==1.0.1
psutil
flask flask
psutil
PySocks>=1.7.0
python-dotenv==1.0.1
redis>=4.0.0
requests>=2.31.0
tabulate>=0.9.0
thrift>=0.16.0,<=0.20.0
waitress waitress
yt_dlp>=2025.3.27 yt_dlp>=2025.3.27
yt-dlp-get-pot==0.3.0 yt-dlp-get-pot==0.3.0
requests>=2.31.0