Commit: Updated current version of v1 and v2 dags, bin/ytops_client, ansible individual services
parent 52a2d6290d
commit f151ffee86

.gitignore (vendored) | 1 +
@@ -1 +1,2 @@
+**/__pycache__/*
 .aider*
@@ -18,54 +18,95 @@ RUN apt-get update && \
 iputils-ping \
 curl \
 traceroute \
-tcpdump && \
+tcpdump \
+unzip \
+git && \
 apt-get clean && \
 rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/man /usr/share/doc /usr/share/doc-base
+
+# Ensure the airflow user and group exist with the correct UID/GID and permissions.
+# This is done early to allow `COPY --chown` to work correctly.
+RUN if ! getent group airflow > /dev/null 2>&1; then \
+groupadd -g 50000 airflow; \
+fi && \
+if ! id -u airflow > /dev/null 2>&1; then \
+useradd -u 50000 -g 50000 -m -s /bin/bash airflow; \
+else \
+usermod -g 50000 airflow; \
+fi && \
+chown -R airflow:airflow /app && \
+chmod -R g+w /app
+
 # Download and install mc (MinIO client)
 RUN wget https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc && \
 chmod +x /usr/local/bin/mc
 
-# Download and install custom FFmpeg build from yt-dlp's recommended source
+# Install FFmpeg
 RUN FFMPEG_URL="https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest/ffmpeg-master-latest-linux64-gpl.tar.xz" && \
-echo "Downloading FFmpeg from $FFMPEG_URL" && \
 wget -qO /tmp/ffmpeg.tar.xz "$FFMPEG_URL" && \
 mkdir -p /opt/ffmpeg && \
 tar -xf /tmp/ffmpeg.tar.xz -C /opt/ffmpeg --strip-components=1 && \
 ln -sf /opt/ffmpeg/bin/ffmpeg /usr/local/bin/ffmpeg && \
 ln -sf /opt/ffmpeg/bin/ffprobe /usr/local/bin/ffprobe && \
-rm -rf /tmp/ffmpeg.tar.xz && \
-ffmpeg -version
+rm -rf /tmp/ffmpeg.tar.xz
 
-# Check if airflow group exists, create it if it doesn't, then ensure proper setup
-RUN if ! getent group airflow > /dev/null 2>&1; then \
-groupadd -g 1001 airflow; \
-fi && \
-# Check if airflow user exists and is in the airflow group
-if id -u airflow > /dev/null 2>&1; then \
-usermod -a -G airflow airflow; \
-else \
-useradd -u 1003 -g 1001 -m -s /bin/bash airflow; \
-fi && \
-chown -R airflow:airflow /app && \
-chmod g+w /app
-
-# Switch to airflow user for package installation
-USER airflow
-
-# Install base Airflow dependencies
+# Install yt-dlp from master
+# Temporarily rename pip to bypass the root check in the base image's pip wrapper,
+# ensuring a system-wide installation.
+RUN mv /usr/local/bin/pip /usr/local/bin/pip.orig && \
+python3 -m pip install --no-cache-dir -U pip hatchling wheel && \
+python3 -m pip install --no-cache-dir --force-reinstall "yt-dlp[default] @ https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz" && \
+chmod a+x "$(which yt-dlp)" && \
+mv /usr/local/bin/pip.orig /usr/local/bin/pip
+
+# Install Deno
+RUN curl -fsSL https://github.com/denoland/deno/releases/latest/download/deno-x86_64-unknown-linux-gnu.zip -o deno.zip && \
+unzip deno.zip && mv deno /usr/local/bin/ && rm deno.zip
+
+# Install aria2c and gost
+RUN curl -fsSL https://raw.githubusercontent.com/P3TERX/aria2-builder/master/aria2-install.sh | bash
+
+# Install gost (direct download of binary)
+RUN wget -q https://github.com/ginuerzh/gost/releases/download/v2.12.0/gost_2.12.0_linux_amd64.tar.gz && \
+tar -xzf gost_2.12.0_linux_amd64.tar.gz -C /usr/local/bin/ && \
+rm gost_2.12.0_linux_amd64.tar.gz
+
+# Verify installations
+RUN ffmpeg -version && deno --version && yt-dlp --version && aria2c --version && gost -V
+
+# Create version information files
+RUN ( \
+echo "--- yt-dlp ---" && \
+yt-dlp --version && \
+echo "" && \
+echo "--- deno ---" && \
+deno --version && \
+echo "" && \
+echo "--- ffmpeg ---" && \
+ffmpeg -version | head -n 1 \
+) > VERSION-airflow-latest.txt && \
+cp VERSION-airflow-latest.txt VERSION-airflow-$(date +%Y%m%d-%H%M%S).txt
+
+# Install base Airflow dependencies as root (system-wide)
 # [FIX] Explicitly install a version of botocore compatible with Python 3.12
 # to fix a RecursionError when handling S3 remote logs.
-RUN pip install --no-cache-dir \
+# Temporarily rename pip to bypass the root check in the base image's pip wrapper.
+RUN mv /usr/local/bin/pip /usr/local/bin/pip.orig && \
+python3 -m pip install --no-cache-dir \
 "apache-airflow==${AIRFLOW_VERSION}" \
 apache-airflow-providers-docker \
 apache-airflow-providers-http \
 apache-airflow-providers-amazon \
+"apache-airflow-providers-celery>=3.3.0" \
+apache-airflow-providers-redis \
 "botocore>=1.34.118" \
 psycopg2-binary \
 "gunicorn==20.1.0" \
 "python-ffmpeg==2.0.12" \
-"ffprobe3"
+"ffprobe3" \
+"python-dotenv" && \
+mv /usr/local/bin/pip.orig /usr/local/bin/pip
 
 # --- Install the custom yt_ops_services package ---
 # Copy all the necessary source code for the package.
@@ -78,17 +119,24 @@ COPY --chown=airflow:airflow pangramia ./pangramia/
 
 # Install the package in editable mode. This runs setup.py and installs all dependencies
 # listed in `install_requires`, making the `yt_ops_services` module available everywhere.
-RUN pip install --no-cache-dir -e .
+# Bypass the pip root check again.
+RUN mv /usr/local/bin/pip /usr/local/bin/pip.orig && \
+python3 -m pip install --no-cache-dir -e . && \
+mv /usr/local/bin/pip.orig /usr/local/bin/pip
 
 # Copy token generator scripts and utils with correct permissions
 # COPY --chown=airflow:airflow generate_tokens_direct.mjs ./
 # COPY --chown=airflow:airflow utils ./utils/
 # COPY --chown=airflow:airflow token_generator ./token_generator/
 
-# --- Always update yt-dlp to latest nightly on container start ---
-# This is done in the entrypoint so every worker run uses the freshest build
-COPY --chown=airflow:airflow update-yt-dlp.sh /usr/local/bin/update-yt-dlp.sh
-RUN chmod +x /usr/local/bin/update-yt-dlp.sh
+# Ensure the home directory and all its contents are owned by the airflow user before switching to it.
+# This fixes permission issues that can occur if previous RUN commands created files in /home/airflow as root.
+# We also make it world-writable to accommodate running the container with a different user ID, which can
+# happen in some environments (e.g., OpenShift or with docker-compose user overrides).
+RUN chown -R airflow:airflow /home/airflow && chmod -R 777 /home/airflow
+
+# Switch to airflow user for all subsequent operations
+USER airflow
 
 # Expose bgutil plugin to worker path
 ENV PYTHONPATH=/opt/bgutil-ytdlp-pot-provider/plugin:$PYTHONPATH
airflow/Dockerfile.old (new file, 125 lines)
@@ -0,0 +1,125 @@

FROM apache/airflow:2.10.3
ENV AIRFLOW_VERSION=2.10.3

WORKDIR /app

# Install system dependencies
USER root
RUN apt-get update && \
apt-get install -y --no-install-recommends \
vim \
mc \
jq \
build-essential \
python3-dev \
wget \
tar \
xz-utils \
iputils-ping \
curl \
traceroute \
tcpdump \
unzip \
git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/man /usr/share/doc /usr/share/doc-base

# Download and install mc (MinIO client)
RUN wget https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc && \
chmod +x /usr/local/bin/mc

# Install FFmpeg
RUN FFMPEG_URL="https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest/ffmpeg-master-latest-linux64-gpl.tar.xz" && \
wget -qO /tmp/ffmpeg.tar.xz "$FFMPEG_URL" && \
mkdir -p /opt/ffmpeg && \
tar -xf /tmp/ffmpeg.tar.xz -C /opt/ffmpeg --strip-components=1 && \
ln -sf /opt/ffmpeg/bin/ffmpeg /usr/local/bin/ffmpeg && \
ln -sf /opt/ffmpeg/bin/ffprobe /usr/local/bin/ffprobe && \
rm -rf /tmp/ffmpeg.tar.xz

# Install yt-dlp from master
RUN python3 -m pip install -U pip hatchling wheel && \
python3 -m pip install --force-reinstall "yt-dlp[default] @ https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz"

# Install Deno
RUN curl -fsSL https://github.com/denoland/deno/releases/latest/download/deno-x86_64-unknown-linux-gnu.zip -o deno.zip && \
unzip deno.zip && mv deno /usr/local/bin/ && rm deno.zip

# Install aria2c and gost
RUN curl -fsSL https://raw.githubusercontent.com/P3TERX/aria2-builder/master/aria2-install.sh | bash

# Install gost (direct download of binary)
RUN wget -q https://github.com/ginuerzh/gost/releases/download/v2.12.0/gost_2.12.0_linux_amd64.tar.gz && \
tar -xzf gost_2.12.0_linux_amd64.tar.gz -C /usr/local/bin/ && \
rm gost_2.12.0_linux_amd64.tar.gz

# Verify installations
RUN ffmpeg -version && deno --version && yt-dlp --version && aria2c --version && gost -V

# Check if airflow group exists, create it if it doesn't, then ensure proper setup
RUN if ! getent group airflow > /dev/null 2>&1; then \
groupadd -g 1001 airflow; \
fi && \
# Check if airflow user exists and is in the airflow group
if id -u airflow > /dev/null 2>&1; then \
usermod -a -G airflow airflow; \
else \
useradd -u 1003 -g 1001 -m -s /bin/bash airflow; \
fi && \
chown -R airflow:airflow /app && \
chmod g+w /app

# Install base Airflow dependencies
# [FIX] Explicitly install a version of botocore compatible with Python 3.12
# to fix a RecursionError when handling S3 remote logs.
RUN pip install --no-cache-dir \
"apache-airflow==${AIRFLOW_VERSION}" \
apache-airflow-providers-docker \
apache-airflow-providers-http \
apache-airflow-providers-amazon \
"botocore>=1.34.118" \
psycopg2-binary \
"gunicorn==20.1.0" \
"python-ffmpeg==2.0.12" \
"ffprobe3" \
"python-dotenv"

# Switch to airflow user for package installation
USER airflow

# --- Install the custom yt_ops_services package ---
# Copy all the necessary source code for the package.
# The deploy script ensures these files are in the build context.
COPY --chown=airflow:airflow setup.py ./
COPY --chown=airflow:airflow VERSION ./
COPY --chown=airflow:airflow yt_ops_services ./yt_ops_services/
COPY --chown=airflow:airflow thrift_model ./thrift_model/
COPY --chown=airflow:airflow pangramia ./pangramia/

# Install the package in editable mode. This runs setup.py and installs all dependencies
# listed in `install_requires`, making the `yt_ops_services` module available everywhere.
RUN pip install --no-cache-dir -e .

# Copy token generator scripts and utils with correct permissions
# COPY --chown=airflow:airflow generate_tokens_direct.mjs ./
# COPY --chown=airflow:airflow utils ./utils/
# COPY --chown=airflow:airflow token_generator ./token_generator/

# Create version information files
RUN ( \
echo "--- yt-dlp ---" && \
yt-dlp --version && \
echo "" && \
echo "--- deno ---" && \
deno --version && \
echo "" && \
echo "--- ffmpeg ---" && \
ffmpeg -version | head -n 1 \
) > VERSION-airflow-latest.txt && \
cp VERSION-airflow-latest.txt VERSION-airflow-$(date +%Y%m%d-%H%M%S).txt

# Expose bgutil plugin to worker path
ENV PYTHONPATH=/opt/bgutil-ytdlp-pot-provider/plugin:$PYTHONPATH
@@ -62,6 +62,9 @@ RUN conda run -n camo pip install --no-cache-dir -r requirements.txt
 # Install Playwright browsers for version 1.49
 RUN conda run -n camo playwright install --with-deps
 
+# Pre-download and cache Camoufox to speed up startup
+RUN conda run -n camo camoufox fetch
+
 # Copy the server script into the image
 COPY camoufox_server.py .
@@ -14,7 +14,8 @@ def task_instance_mutation_hook(ti):
     to be set by the dispatcher DAG. This avoids database race conditions.
     """
     logger.debug(f"MUTATION HOOK: Running for dag '{ti.dag_id}', task '{ti.task_id}'.")
-    if ti.dag_id == 'ytdlp_ops_worker_per_url':
+    # This hook targets all worker DAGs, which follow a naming convention.
+    if 'worker_per_url' in ti.dag_id:
         # If the run_id isn't populated yet, just return. The hook may be called again.
         if not ti.run_id:
             logger.debug(f"MUTATION HOOK: run_id not yet available for task '{ti.task_id}'. Skipping this invocation.")
@@ -26,7 +27,8 @@ def task_instance_mutation_hook(ti):
         if ti.run_id and '_q_' in ti.run_id:
             try:
                 parsed_queue = ti.run_id.split('_q_')[-1]
-                if parsed_queue.startswith('queue-dl-'):
+                # Check for valid v1 (dl) or v2 (auth/dl) queue prefixes.
+                if parsed_queue.startswith(('queue-dl-', 'queue-auth-')):
                     worker_queue = parsed_queue
             except Exception as e:
                 logger.error(f"MUTATION HOOK: CRITICAL: Error parsing queue from run_id '{ti.run_id}': {e}.", exc_info=True)
@@ -37,8 +39,9 @@ def task_instance_mutation_hook(ti):
         else:
             # If the queue is not found, it's a critical failure in the dispatching logic.
             # We fall back to the default queue but log it as a high-severity warning.
-            logger.warning(f"MUTATION HOOK: Could not find worker queue in run_id '{ti.run_id}'. Falling back to 'queue-dl'. Pinning will fail.")
-            ti.queue = 'queue-dl'
+            fallback_queue = 'queue-auth' if 'auth' in ti.dag_id else 'queue-dl'
+            logger.warning(f"MUTATION HOOK: Could not find worker queue in run_id '{ti.run_id}'. Falling back to '{fallback_queue}'. Pinning will fail.")
+            ti.queue = fallback_queue
 
 
 # --- Hook Registration ---
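For context, a minimal dispatcher-side sketch of the `_q_` run_id convention the hook above parses. It assumes Airflow 2.x's TriggerDagRunOperator; the helper name, timestamp format, and conf payload are illustrative, not the project's actual dispatcher code.

```python
# Hypothetical sketch: encode the dedicated Celery queue after '_q_' so
# task_instance_mutation_hook() can parse it and pin the worker tasks.
from datetime import datetime, timezone
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

def build_pinned_run_id(flavor: str, worker_hostname: str) -> str:
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
    # e.g. queue-dl-dl001 (v1 download worker) or queue-auth-auth001 (v2 auth worker)
    return f"manual__{stamp}_q_queue-{flavor}-{worker_hostname}"

dispatch = TriggerDagRunOperator(
    task_id="dispatch_to_worker",
    trigger_dag_id="ytdlp_ops_worker_per_url",   # matched by "'worker_per_url' in ti.dag_id"
    trigger_run_id=build_pinned_run_id("dl", "dl001"),
    conf={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"},  # illustrative payload
)
```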
@@ -14,7 +14,8 @@ x-airflow-common:
 # If you built a custom image for master, you need to push it to a registry
 # and reference it here.
 image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
-build: .
 # Add extra hosts here to allow workers to resolve other hosts by name.
 # This section is auto-generated by Ansible from the inventory.
 extra_hosts:
@@ -30,7 +29,7 @@ x-airflow-common:
 
 AIRFLOW__CORE__PARALLELISM: 128
 AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 64
-AIRFLOW__SCHEDULER__PARSING_PROCESSES: 4
+AIRFLOW__SCHEDULER__PARSING_PROCESSES: 8
 AIRFLOW__WEBSERVER__WORKERS: 5
 AIRFLOW__WEBSERVER__WORKER_CLASS: "gevent"
 
@@ -75,21 +74,21 @@ x-airflow-common:
 - ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
 - ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
 - ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
+# Mount the generated pangramia package to ensure workers have the latest version
+- ${AIRFLOW_PROJ_DIR:-.}/pangramia:/app/pangramia
 # Use AIRFLOW_UID from .env file to fix permission issues. GID is set to 0 for compatibility with the Airflow image.
 user: "${{ '{' }}AIRFLOW_UID:-50000{{ '}' }}:0"
 
 services:
-airflow-worker:
+airflow-worker-dl:
 <<: *airflow-common
-container_name: airflow-dl-worker-1
+container_name: airflow-worker-dl-1
 hostname: ${HOSTNAME:-dl001}
-# The worker now listens on the generic queue AND its own dedicated queue.
-# The hostname is dynamically inserted into the queue name.
+# The DL worker listens on the generic dl queue AND its own dedicated queue.
 command: airflow celery worker -q queue-dl,queue-dl-${HOSTNAME:-dl001}
 deploy:
 resources:
 limits:
-# Increased from 4G to 8G to support higher memory per child process.
 memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT:-8G}
 reservations:
 memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV:-2G}
@@ -103,26 +102,18 @@ services:
 start_period: 30s
 environment:
 <<: *airflow-common-env
-HOSTNAME: ${HOSTNAME:-dl001} # Explicitly set inside container
+HOSTNAME: ${HOSTNAME:-dl001}
 DUMB_INIT_SETSID: "0"
 AIRFLOW__CELERY__WORKER_QUEUES: "queue-dl,queue-dl-${HOSTNAME:-dl001}"
 AIRFLOW__CELERY__WORKER_TAGS: "dl"
 AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
-# Use autoscaling to adjust number of workers based on load.
-# Format is max_concurrency,min_concurrency.
-AIRFLOW__CELERY__WORKER_AUTOSCALE: "16,4"
-# Use prefork pool for better compatibility with blocking libraries.
+AIRFLOW__CELERY__WORKER_AUTOSCALE: "16,8"
 AIRFLOW__CELERY__POOL: "prefork"
 AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
 AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
 AIRFLOW__CELERY__WORKER_NAME: "worker-dl@%h"
 AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
-# Increased from 256MB to 512MB for memory-intensive yt-dlp tasks.
-# This value is in KB. 512 * 1024 = 524288.
 AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "524288" # 512MB
-# The hostname is now managed by Docker Compose to ensure uniqueness when scaling.
-# It will be generated based on project, service, and replica number (e.g., airflow-airflow-dl-worker-1).
-# hostname: "dl-worker-${HOSTNAME_SUFFIX:-$$(hostname)}"
 ports:
 - "8793:8793"
 networks:
@@ -130,6 +121,46 @@ services:
 - proxynet
 restart: always
+
+airflow-worker-auth:
+<<: *airflow-common
+container_name: airflow-worker-auth-1
+hostname: ${HOSTNAME:-auth001}
+# The Auth worker listens on the generic auth queue AND its own dedicated queue.
+command: airflow celery worker -q queue-auth,queue-auth-${HOSTNAME:-auth001}
+deploy:
+resources:
+limits:
+memory: ${AIRFLOW_WORKER_AUTH_MEM_LIMIT:-4G}
+reservations:
+memory: ${AIRFLOW_WORKER_AUTH_MEM_RESERV:-1G}
+healthcheck:
+test:
+- "CMD-SHELL"
+- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-auth@$$(hostname)"'
+interval: 30s
+timeout: 30s
+retries: 5
+start_period: 30s
+environment:
+<<: *airflow-common-env
+HOSTNAME: ${HOSTNAME:-auth001}
+DUMB_INIT_SETSID: "0"
+AIRFLOW__CELERY__WORKER_QUEUES: "queue-auth,queue-auth-${HOSTNAME:-auth001}"
+AIRFLOW__CELERY__WORKER_TAGS: "auth"
+AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
+# Auth tasks are less resource intensive but we want fewer of them to avoid service overload.
+AIRFLOW__CELERY__WORKER_AUTOSCALE: "2,1"
+AIRFLOW__CELERY__POOL: "prefork"
+AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
+AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
+AIRFLOW__CELERY__WORKER_NAME: "worker-auth@%h"
+AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
+AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "262144" # 256MB
+networks:
+- default
+- proxynet
+restart: always
 
 docker-socket-proxy:
 profiles:
 - disabled
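As a quick sanity check that the worker-dl / worker-auth split and queue pinning behave as intended, a minimal sketch that asks the same Celery app the compose healthchecks ping which queues each worker consumes. The expected output in the comments is an assumption based on the compose settings above.

```python
# Minimal sketch, run inside any Airflow container that shares the broker.
from airflow.providers.celery.executors.celery_executor import app

def active_queue_map(timeout: float = 5.0) -> dict[str, list[str]]:
    """Map each Celery worker name to the queues it currently consumes."""
    replies = app.control.inspect(timeout=timeout).active_queues() or {}
    return {worker: sorted(q["name"] for q in queues) for worker, queues in replies.items()}

if __name__ == "__main__":
    for worker, queues in sorted(active_queue_map().items()):
        # e.g. worker-dl@dl001: queue-dl, queue-dl-dl001
        #      worker-auth@auth001: queue-auth, queue-auth-auth001
        print(f"{worker}: {', '.join(queues)}")
```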
airflow/configs/docker-compose-dl.yaml.v1.j2 (new file, 151 lines)
@@ -0,0 +1,151 @@
# Airflow remote DL worker configuration.
# This file should be used on a remote machine to run a download worker.
# It requires a master Airflow instance running with services exposed.
#
# Before running, create a .env file in this directory with:
# MASTER_HOST_IP=... a.b.c.d ... # IP address of the machine running docker-compose-master.yaml
# POSTGRES_PASSWORD=... # The password for the PostgreSQL database from the master compose file
# REDIS_PASSWORD=... # The password for Redis from the master compose file
# AIRFLOW_UID=... # User ID for file permissions, should match master
---
x-airflow-common:
&airflow-common
# This should point to the same image used by the master.
# If you built a custom image for master, you need to push it to a registry
# and reference it here.
image: ${AIRFLOW_IMAGE_NAME:-pangramia/ytdlp-ops-airflow:latest}
# Add extra hosts here to allow workers to resolve other hosts by name.
# This section is auto-generated by Ansible from the inventory.
extra_hosts:
{% for host in groups['all'] %}
- "{{ hostvars[host]['inventory_hostname'] }}:{{ hostvars[host]['ansible_host'] | default(hostvars[host]['inventory_hostname']) }}"
{% endfor %}
env_file:
# The .env file is located in the project root (e.g., /srv/airflow_dl_worker),
# so we provide an absolute path to it.
- "{{ airflow_worker_dir }}/.env"
environment:
&airflow-common-env

AIRFLOW__CORE__PARALLELISM: 128
AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG: 64
AIRFLOW__SCHEDULER__PARSING_PROCESSES: 8
AIRFLOW__WEBSERVER__WORKERS: 5
AIRFLOW__WEBSERVER__WORKER_CLASS: "gevent"

AIRFLOW__LOGGING__SECRET_MASK_EXCEPTION_ARGS: False

# Prevent slow webserver when low memory?
GUNICORN_CMD_ARGS: --max-requests 20 --max-requests-jitter 3 --worker-tmp-dir /dev/shm

# Airflow Core
AIRFLOW__CORE__EXECUTOR: CeleryExecutor
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
AIRFLOW__CORE__FERNET_KEY: '' # Should be same as master, but worker does not need it.

# Backend connections - These should point to the master node
# Set MASTER_HOST_IP, POSTGRES_PASSWORD, and REDIS_PASSWORD in your .env file
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD{{ '}' }}@${{ '{' }}MASTER_HOST_IP{{ '}' }}:{{ postgres_port }}/airflow
AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql+psycopg2://airflow:${{ '{' }}POSTGRES_PASSWORD{{ '}' }}@${{ '{' }}MASTER_HOST_IP{{ '}' }}:{{ postgres_port }}/airflow
AIRFLOW__CELERY__BROKER_URL: redis://:${REDIS_PASSWORD}@${MASTER_HOST_IP}:{{ redis_port }}/0

# Remote Logging - connection is configured directly via environment variables
#_PIP_ADDITIONAL_REQUIREMENTS: ${{ '{' }}_PIP_ADDITIONAL_REQUIREMENTS:- apache-airflow-providers-docker apache-airflow-providers-http thrift>=0.16.0,<=0.20.0 backoff>=2.2.1 python-dotenv==1.0.1 psutil>=5.9.0 apache-airflow-providers-amazon{{ '}' }}
AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: "s3://airflow-logs"
AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio_default
AIRFLOW__LOGGING__ENCRYPT_S3_LOGS: "False"
#AIRFLOW__LOGGING__LOG_ID_TEMPLATE: "{dag_id}-{task_id}-{run_id}-{try_number}"
AIRFLOW__WEBSERVER__SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__INTERNAL_API_SECRET_KEY: 'qmALu5JCAW0518WGAqkVZQ=='
AIRFLOW__CORE__LOCAL_SETTINGS_PATH: "/opt/airflow/config/custom_task_hooks.py"

volumes:
# Mount dags to get any utility scripts, but the worker will pull the DAG from the DB
- ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
# Mount logs locally in case remote logging fails
- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
# Mount config for local settings and other configurations
- ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
- ${AIRFLOW_PROJ_DIR:-.}/config/airflow.cfg:/opt/airflow/airflow.cfg
# Mount download directories
- ${AIRFLOW_PROJ_DIR:-.}/downloadfiles:/opt/airflow/downloadfiles
- ${AIRFLOW_PROJ_DIR:-.}/addfiles:/opt/airflow/addfiles
- ${AIRFLOW_PROJ_DIR:-.}/inputfiles:/opt/airflow/inputfiles
# Mount the generated pangramia package to ensure workers have the latest version
- ${AIRFLOW_PROJ_DIR:-.}/pangramia:/app/pangramia
# Use AIRFLOW_UID from .env file to fix permission issues. GID is set to 0 for compatibility with the Airflow image.
user: "${{ '{' }}AIRFLOW_UID:-50000{{ '}' }}:0"

services:
airflow-worker:
<<: *airflow-common
container_name: airflow-dl-worker-1
hostname: ${HOSTNAME:-dl001}
# The worker now listens on the generic queue AND its own dedicated queue.
# The hostname is dynamically inserted into the queue name.
command: airflow celery worker -q queue-dl,queue-dl-${HOSTNAME:-dl001}
deploy:
resources:
limits:
# Increased from 4G to 8G to support higher memory per child process.
memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_LIMIT:-8G}
reservations:
memory: ${AIRFLOW_WORKER_DOWNLOAD_MEM_RESERV:-2G}
healthcheck:
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "worker-dl@$$(hostname)"'
interval: 30s
timeout: 30s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
HOSTNAME: ${HOSTNAME:-dl001} # Explicitly set inside container
DUMB_INIT_SETSID: "0"
AIRFLOW__CELERY__WORKER_QUEUES: "queue-dl,queue-dl-${HOSTNAME:-dl001}"
AIRFLOW__CELERY__WORKER_TAGS: "dl"
AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER: "1"
# Use autoscaling to adjust number of workers based on load.
# Format is max_concurrency,min_concurrency.
AIRFLOW__CELERY__WORKER_AUTOSCALE: "16,8"
# Use prefork pool for better compatibility with blocking libraries.
AIRFLOW__CELERY__POOL: "prefork"
AIRFLOW__CELERY__TASK_ACKS_LATE: "False"
AIRFLOW__CELERY__OPERATION_TIMEOUT: "2.0"
AIRFLOW__CELERY__WORKER_NAME: "worker-dl@%h"
AIRFLOW__CELERY__WORKER_MAX_TASKS_PER_CHILD: "100"
# Increased from 256MB to 512MB for memory-intensive yt-dlp tasks.
# This value is in KB. 512 * 1024 = 524288.
AIRFLOW__CELERY__WORKER_MAX_MEMORY_PER_CHILD: "524288" # 512MB
# The hostname is now managed by Docker Compose to ensure uniqueness when scaling.
# It will be generated based on project, service, and replica number (e.g., airflow-airflow-dl-worker-1).
# hostname: "dl-worker-${HOSTNAME_SUFFIX:-$$(hostname)}"
ports:
- "8793:8793"
networks:
- default
- proxynet
restart: always

docker-socket-proxy:
profiles:
- disabled
image: tecnativa/docker-socket-proxy:0.1.1
environment:
CONTAINERS: 1
IMAGES: 1
AUTH: 1
POST: 1
privileged: true
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
restart: always

networks:
proxynet:
name: airflow_proxynet
external: true
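Since the v1 DL-worker compose file above documents the `.env` entries it expects, a small illustrative pre-flight check can catch a missing value before `docker compose up`. python-dotenv is already installed in the image; the variable names come from the file header, while the script itself is an assumption and not part of the repository.

```python
# Hypothetical pre-flight check for the remote DL worker's .env file.
from dotenv import dotenv_values

REQUIRED = ("MASTER_HOST_IP", "POSTGRES_PASSWORD", "REDIS_PASSWORD", "AIRFLOW_UID")

def missing_env_entries(path: str = ".env") -> list[str]:
    """Return the required variable names that are absent or empty."""
    values = dotenv_values(path)
    return [name for name in REQUIRED if not values.get(name)]

if __name__ == "__main__":
    missing = missing_env_entries()
    if missing:
        raise SystemExit(f"Missing required .env entries: {', '.join(missing)}")
    print(".env looks complete for the remote DL worker.")
```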
@@ -112,6 +112,8 @@ x-airflow-common:
 - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/downloadfiles:/opt/airflow/downloadfiles
 - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/addfiles:/opt/airflow/addfiles
 - ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/inputfiles:/opt/airflow/inputfiles
+# Mount the generated pangramia package to ensure master services have the latest version
+- ${{ '{' }}AIRFLOW_PROJ_DIR:-.{{ '}' }}/pangramia:/app/pangramia
 user: "${{ '{' }}AIRFLOW_UID:-50000{{ '}' }}:0"
 depends_on:
 &airflow-common-depends-on
@@ -142,7 +144,7 @@ services:
 volumes:
 - ./postgres-data:/var/lib/postgresql/data
 ports:
-- "{{ postgres_port }}:5432"
+- "${{ '{' }}POSTGRES_PORT:-5432{{ '}' }}:5432"
 healthcheck:
 test: ["CMD", "pg_isready", "-U", "airflow"]
 interval: 10s
@@ -179,7 +181,7 @@ services:
 expose:
 - 6379
 ports:
-- "{{ redis_port }}:6379"
+- "${{ '{' }}REDIS_PORT:-6379{{ '}' }}:6379"
 healthcheck:
 test: ["CMD", "redis-cli", "-a", "${{ '{' }}REDIS_PASSWORD:-rOhTAIlTFFylXsjhqwxnYxDChFc{{ '}' }}", "ping"]
 interval: 10s
@@ -405,6 +407,20 @@ services:
 airflow-init:
 condition: service_completed_successfully
+
+airflow-regression-runner:
+<<: *airflow-common
+entrypoint: ""
+container_name: airflow-regression-runner
+command: ["tail", "-f", "/dev/null"]
+hostname: ${{ '{' }}HOSTNAME{{ '}' }}
+environment:
+<<: *airflow-common-env
+restart: always
+depends_on:
+<<: *airflow-common-depends-on
+airflow-init:
+condition: service_completed_successfully
 
 airflow-init:
 <<: *airflow-common
 depends_on:
@@ -8,6 +8,34 @@ include:
 {% endif %}
 
 services:
+bgutil-provider:
+image: brainicism/bgutil-ytdlp-pot-provider
+container_name: bgutil-provider
+init: true
+ports:
+- "4416:4416"
+restart: unless-stopped
+networks:
+- proxynet
+
+context-prepper:
+image: busybox:latest
+restart: "no"
+volumes:
+- ./context:/app/context
+networks:
+- proxynet
+command:
+- "/bin/sh"
+- "-c"
+- |
+set -e
+CONTEXT_BASE_DIR="/app/context"
+TIMESTAMP_DIR="$${CONTEXT_BASE_DIR}/context-data_$$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$${TIMESTAMP_DIR}"
+ln -sfn "$${TIMESTAMP_DIR}" "$${CONTEXT_BASE_DIR}/context-data"
+echo "Context prepper finished. Data will be in: $${TIMESTAMP_DIR}"
+
 envoy:
 image: envoyproxy/envoy:v1.29-latest
 {% if service_role != 'management' %}
@@ -35,16 +63,30 @@ services:
 # container_name is omitted; Docker will use the service name for DNS.
 # This service depends on the camoufox-group service, which ensures all camoufox
 # instances are started before this service.
-{% if service_role is defined and service_role != 'management' %}
 depends_on:
-- camoufox-group
+context-prepper:
+condition: service_completed_successfully
+{% if service_role is defined and service_role != 'management' %}
+camoufox-group:
+condition: service_started
 {% endif %}
 # Ports are no longer exposed directly. Envoy will connect to them on the internal network.
+# entrypoint:
+# - /bin/sh
+# - -c
+# - |
+# set -e
+# echo "[$(date)] Updating yt-dlp to latest nightly master..."
+# python3 -m pip install -U --pre "yt-dlp[default]" --upgrade-strategy eager --force-reinstall --no-cache-dir
+# echo "[$(date)] yt-dlp updated to:"
+# yt-dlp --version
+# echo "[$(date)] Starting original entrypoint..."
+# exec /usr/local/bin/docker-entrypoint.sh "$$@"
 env_file:
 - ./.env # Path is relative to the project directory
 volumes:
-- context-data:/app/context-data
+- ./context:/app/context
-- ./logs/communication_logs:/app/communication_logs
+- ./logs/yt-dlp-ops/communication_logs:/app/logs/yt-dlp-ops/communication_logs
 {% if service_role != 'management' %}
 # Mount the generated endpoints file to make it available to the server
 - ./configs/camoufox_endpoints.json:/app/config/camoufox_endpoints.json:ro
@@ -72,19 +114,24 @@ services:
 - "${REDIS_PORT:-52909}"
 - "--redis-password"
 - "${REDIS_PASSWORD}"
-- "--account-active-duration-min"
-- "${ACCOUNT_ACTIVE_DURATION_MIN:-30}"
-- "--account-cooldown-duration-min"
-- "${ACCOUNT_COOLDOWN_DURATION_MIN:-60}"
 - "--service-role"
 - "{{ service_role }}"
+
+# --- S3 Logging Parameters ---
+- "--s3-endpoint-url"
+- "${S3_ENDPOINT_URL}"
+- "--s3-access-key-id"
+- "${S3_ACCESS_KEY_ID}"
+- "--s3-secret-access-key"
+- "${S3_SECRET_ACCESS_KEY}"
+- "--s3-region-name"
+- "${S3_REGION_NAME}"
 {% if service_role is defined and service_role != 'management' %}
 # --- Parameters for worker/all-in-one roles ONLY ---
 - "--script-dir"
 - "/app"
 - "--context-dir"
-- "/app/context-data"
+- "/app/context/context-data"
 - "--clean-context-dir"
 - "--clients"
 - "${YT_CLIENTS:-web,mweb,ios,android}"
@@ -94,13 +141,13 @@ services:
 - "/app/config/camoufox_endpoints.json"
 - "--print-tokens"
 - "--stop-if-no-proxy"
+- "--comms-log-root-dir"
+- "/app/logs/yt-dlp-ops/communication_logs"
+- "--bgutils-no-innertube"
 {% endif %}
 restart: unless-stopped
 pull_policy: always
 
-volumes:
-context-data:
-
 networks:
 proxynet:
 name: airflow_proxynet
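The new `--s3-*` flags passed to the service mirror the usual parameters of an S3-compatible client (for example MinIO, which the stack already uses for remote logs). A hedged sketch of how a consumer of those flags typically builds a client; the function is illustrative only and not the service's actual code.

```python
# Illustrative only: map --s3-endpoint-url / --s3-access-key-id /
# --s3-secret-access-key / --s3-region-name onto a boto3 S3 client.
import boto3

def make_s3_client(endpoint_url: str, access_key_id: str,
                   secret_access_key: str, region_name: str):
    return boto3.client(
        "s3",
        endpoint_url=endpoint_url,              # e.g. a MinIO endpoint
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        region_name=region_name,
    )
```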
636
airflow/dags/scripts/regression.py
Normal file
636
airflow/dags/scripts/regression.py
Normal file
@ -0,0 +1,636 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Regression testing script for the ytdlp-ops system.
|
||||||
|
|
||||||
|
This script orchestrates a regression test by:
|
||||||
|
1. Populating a Redis queue with video URLs from an input file.
|
||||||
|
2. Triggering the `ytdlp_ops_orchestrator` Airflow DAG to start processing.
|
||||||
|
3. Monitoring the progress of the processing for a specified duration.
|
||||||
|
4. Generating a report of any failures.
|
||||||
|
5. Optionally cleaning up the Redis queues after the test.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
import subprocess
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import redis
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
# It's safe to import these as the script runs in the same container as Airflow
|
||||||
|
# where the yt_ops_services package is installed.
|
||||||
|
try:
|
||||||
|
from yt_ops_services.client_utils import get_thrift_client, format_timestamp
|
||||||
|
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
|
||||||
|
except ImportError:
|
||||||
|
logging.error("Could not import Thrift modules. Ensure this script is run in the 'airflow-regression-runner' container.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# --- Configuration ---
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="[%(asctime)s] [%(levelname)s] %(message)s",
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
)
|
||||||
|
|
||||||
|
INTERRUPTED = False
|
||||||
|
|
||||||
|
def signal_handler(sig, frame):
|
||||||
|
"""Handles Ctrl+C interruption."""
|
||||||
|
global INTERRUPTED
|
||||||
|
if not INTERRUPTED:
|
||||||
|
logging.warning("Ctrl+C detected. Initiating graceful shutdown...")
|
||||||
|
INTERRUPTED = True
|
||||||
|
else:
|
||||||
|
logging.warning("Second Ctrl+C detected. Forcing exit.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
|
||||||
|
def _get_redis_client(redis_url: str):
|
||||||
|
"""Gets a Redis client from a URL."""
|
||||||
|
try:
|
||||||
|
# from_url is the modern way to connect and handles password auth
|
||||||
|
client = redis.from_url(redis_url, decode_responses=True)
|
||||||
|
client.ping()
|
||||||
|
logging.info(f"Successfully connected to Redis at {client.connection_pool.connection_kwargs.get('host')}:{client.connection_pool.connection_kwargs.get('port')}")
|
||||||
|
return client
|
||||||
|
except redis.exceptions.ConnectionError as e:
|
||||||
|
logging.error(f"Failed to connect to Redis: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"An unexpected error occurred while connecting to Redis: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_webserver_url():
|
||||||
|
"""
|
||||||
|
Determines the Airflow webserver URL, prioritizing MASTER_HOST_IP from .env.
|
||||||
|
"""
|
||||||
|
master_host_ip = os.getenv("MASTER_HOST_IP")
|
||||||
|
if master_host_ip:
|
||||||
|
url = f"http://{master_host_ip}:8080"
|
||||||
|
logging.info(f"Using MASTER_HOST_IP for webserver URL: {url}")
|
||||||
|
return url
|
||||||
|
|
||||||
|
# Fallback to AIRFLOW_WEBSERVER_URL or the default service name
|
||||||
|
url = os.getenv("AIRFLOW_WEBSERVER_URL", "http://airflow-webserver:8080")
|
||||||
|
logging.info(f"Using default webserver URL: {url}")
|
||||||
|
return url
|
||||||
|
|
||||||
|
def _normalize_to_url(item: str) -> str | None:
|
||||||
|
"""
|
||||||
|
Validates if an item is a recognizable YouTube URL or video ID,
|
||||||
|
and normalizes it to a standard watch URL format.
|
||||||
|
"""
|
||||||
|
if not item:
|
||||||
|
return None
|
||||||
|
|
||||||
|
video_id_pattern = r"^[a-zA-Z0-9_-]{11}$"
|
||||||
|
if re.match(video_id_pattern, item):
|
||||||
|
return f"https://www.youtube.com/watch?v={item}"
|
||||||
|
|
||||||
|
url_patterns = [r"(?:v=|\/v\/|youtu\.be\/|embed\/|shorts\/)([a-zA-Z0-9_-]{11})"]
|
||||||
|
for pattern in url_patterns:
|
||||||
|
match = re.search(pattern, item)
|
||||||
|
if match:
|
||||||
|
return f"https://www.youtube.com/watch?v={match.group(1)}"
|
||||||
|
|
||||||
|
logging.warning(f"Could not recognize '{item}' as a valid YouTube URL or video ID.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _read_input_file(file_path: str) -> list[str]:
|
||||||
|
"""Reads video IDs/URLs from a file (CSV or JSON list)."""
|
||||||
|
path = Path(file_path)
|
||||||
|
if not path.is_file():
|
||||||
|
logging.error(f"Input file not found: {file_path}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
content = path.read_text(encoding='utf-8')
|
||||||
|
|
||||||
|
# Try parsing as JSON list first
|
||||||
|
if content.strip().startswith('['):
|
||||||
|
try:
|
||||||
|
data = json.loads(content)
|
||||||
|
if isinstance(data, list):
|
||||||
|
logging.info(f"Successfully parsed {file_path} as a JSON list.")
|
||||||
|
return [str(item) for item in data]
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logging.warning("File looks like JSON but failed to parse. Will try treating as CSV/text.")
|
||||||
|
|
||||||
|
# Fallback to CSV/text (one item per line)
|
||||||
|
items = []
|
||||||
|
# Use io.StringIO to handle the content as a file for the csv reader
|
||||||
|
from io import StringIO
|
||||||
|
# Sniff to see if it has a header
|
||||||
|
try:
|
||||||
|
has_header = csv.Sniffer().has_header(content)
|
||||||
|
except csv.Error:
|
||||||
|
has_header = False # Not a CSV, treat as plain text
|
||||||
|
|
||||||
|
reader = csv.reader(StringIO(content))
|
||||||
|
if has_header:
|
||||||
|
next(reader) # Skip header row
|
||||||
|
|
||||||
|
for row in reader:
|
||||||
|
if row:
|
||||||
|
items.append(row[0].strip()) # Assume the ID/URL is in the first column
|
||||||
|
|
||||||
|
logging.info(f"Successfully parsed {len(items)} items from {file_path} as CSV/text.")
|
||||||
|
return items
|
||||||
|
|
||||||
|
|
||||||
|
def _get_api_auth():
|
||||||
|
"""Gets Airflow API credentials from environment variables."""
|
||||||
|
username = os.getenv("AIRFLOW_ADMIN_USERNAME", "admin")
|
||||||
|
password = os.getenv("AIRFLOW_ADMIN_PASSWORD")
|
||||||
|
if not password:
|
||||||
|
logging.error("AIRFLOW_ADMIN_PASSWORD not found in environment. Cannot interact with API.")
|
||||||
|
return None, None
|
||||||
|
return username, password
|
||||||
|
|
||||||
|
def _pause_dag(dag_id: str, is_paused: bool = True):
|
||||||
|
"""Pauses or unpauses an Airflow DAG via the REST API."""
|
||||||
|
logging.info(f"Attempting to {'pause' if is_paused else 'unpause'} DAG: {dag_id}...")
|
||||||
|
username, password = _get_api_auth()
|
||||||
|
if not username:
|
||||||
|
return
|
||||||
|
|
||||||
|
webserver_url = _get_webserver_url()
|
||||||
|
endpoint = f"{webserver_url}/api/v1/dags/{dag_id}"
|
||||||
|
payload = {"is_paused": is_paused}
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = requests.patch(endpoint, auth=(username, password), json=payload, timeout=30)
|
||||||
|
response.raise_for_status()
|
||||||
|
logging.info(f"Successfully {'paused' if is_paused else 'unpaused'} DAG '{dag_id}'.")
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logging.error(f"Failed to {'pause' if is_paused else 'unpause'} DAG '{dag_id}': {e}")
|
||||||
|
if e.response is not None:
|
||||||
|
logging.error(f"Response: {e.response.text}")
|
||||||
|
|
||||||
|
def _fail_running_dag_runs(dag_id: str):
|
||||||
|
"""Finds all running DAG runs for a given DAG and marks them as failed."""
|
||||||
|
logging.info(f"Attempting to fail all running instances of DAG '{dag_id}'...")
|
||||||
|
username, password = _get_api_auth()
|
||||||
|
if not username:
|
||||||
|
return
|
||||||
|
|
||||||
|
webserver_url = _get_webserver_url()
|
||||||
|
list_endpoint = f"{webserver_url}/api/v1/dags/{dag_id}/dagRuns?state=running"
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Get running DAGs
|
||||||
|
response = requests.get(list_endpoint, auth=(username, password), timeout=30)
|
||||||
|
response.raise_for_status()
|
||||||
|
running_runs = response.json().get("dag_runs", [])
|
||||||
|
|
||||||
|
if not running_runs:
|
||||||
|
logging.info(f"No running DAG runs found for '{dag_id}'.")
|
||||||
|
return
|
||||||
|
|
||||||
|
logging.info(f"Found {len(running_runs)} running DAG run(s) to fail.")
|
||||||
|
|
||||||
|
for run in running_runs:
|
||||||
|
dag_run_id = run["dag_run_id"]
|
||||||
|
update_endpoint = f"{webserver_url}/api/v1/dags/{dag_id}/dagRuns/{dag_run_id}"
|
||||||
|
payload = {"state": "failed"}
|
||||||
|
try:
|
||||||
|
update_response = requests.patch(update_endpoint, auth=(username, password), json=payload, timeout=30)
|
||||||
|
update_response.raise_for_status()
|
||||||
|
logging.info(f" - Successfully marked DAG run '{dag_run_id}' as failed.")
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logging.error(f" - Failed to mark DAG run '{dag_run_id}' as failed: {e}")
|
||||||
|
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logging.error(f"Failed to list running DAG runs for '{dag_id}': {e}")
|
||||||
|
if e.response is not None:
|
||||||
|
logging.error(f"Response: {e.response.text}")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Core Logic Functions ---
|
||||||
|
|
||||||
|
def step_0_populate_queue(redis_client, queue_name: str, input_file: str):
|
||||||
|
"""Reads URLs from a file and populates the Redis inbox queue."""
|
||||||
|
logging.info("--- Step 0: Populating Redis Queue ---")
|
||||||
|
raw_items = _read_input_file(input_file)
|
||||||
|
if not raw_items:
|
||||||
|
logging.error("No items found in the input file. Aborting.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
valid_urls = []
|
||||||
|
for item in raw_items:
|
||||||
|
url = _normalize_to_url(item)
|
||||||
|
if url and url not in valid_urls:
|
||||||
|
valid_urls.append(url)
|
||||||
|
|
||||||
|
if not valid_urls:
|
||||||
|
logging.error("No valid YouTube URLs or IDs were found in the input file. Aborting.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
inbox_queue = f"{queue_name}_inbox"
|
||||||
|
logging.info(f"Adding {len(valid_urls)} unique and valid URLs to Redis queue '{inbox_queue}'...")
|
||||||
|
|
||||||
|
with redis_client.pipeline() as pipe:
|
||||||
|
for url in valid_urls:
|
||||||
|
pipe.rpush(inbox_queue, url)
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
logging.info(f"Successfully populated queue. Total items in '{inbox_queue}': {redis_client.llen(inbox_queue)}")
|
||||||
|
return len(valid_urls)
|
||||||
|
|
||||||
|
|
||||||
|
def step_1_trigger_orchestrator(args: argparse.Namespace):
|
||||||
|
"""Triggers the ytdlp_ops_orchestrator DAG using the Airflow REST API."""
|
||||||
|
logging.info("--- Step 1: Triggering Orchestrator DAG via REST API ---")
|
||||||
|
|
||||||
|
# Get API details from environment variables
|
||||||
|
webserver_url = _get_webserver_url()
|
||||||
|
api_endpoint = f"{webserver_url}/api/v1/dags/ytdlp_ops_orchestrator/dagRuns"
|
||||||
|
|
||||||
|
# Default admin user is 'admin'
|
||||||
|
username = os.getenv("AIRFLOW_ADMIN_USERNAME", "admin")
|
||||||
|
password = os.getenv("AIRFLOW_ADMIN_PASSWORD")
|
||||||
|
|
||||||
|
if not password:
|
||||||
|
logging.error("AIRFLOW_ADMIN_PASSWORD not found in environment. Please set it in your .env file.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Construct the configuration for the DAG run
|
||||||
|
conf = {
|
||||||
|
"total_workers": args.workers,
|
||||||
|
"workers_per_bunch": args.workers_per_bunch,
|
||||||
|
"clients": args.client,
|
||||||
|
}
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"conf": conf
|
||||||
|
}
|
||||||
|
|
||||||
|
logging.info(f"Triggering DAG at endpoint: {api_endpoint}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = requests.post(
|
||||||
|
api_endpoint,
|
||||||
|
auth=(username, password),
|
||||||
|
json=payload,
|
||||||
|
timeout=30 # 30 second timeout
|
||||||
|
)
|
||||||
|
response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
|
||||||
|
|
||||||
|
logging.info("Successfully triggered the orchestrator DAG.")
|
||||||
|
logging.debug(f"Airflow API response:\n{response.json()}")
|
||||||
|
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logging.error("Failed to trigger the orchestrator DAG via REST API.")
|
||||||
|
logging.error(f"Error: {e}")
|
||||||
|
if e.response is not None:
|
||||||
|
logging.error(f"Response status code: {e.response.status_code}")
|
||||||
|
logging.error(f"Response text: {e.response.text}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def step_2_monitor_progress(args: argparse.Namespace, redis_client, queue_name: str, total_urls: int, run_time_min: int, interval_min: int, show_status: bool):
    """Monitors the Redis queues for the duration of the test."""
    logging.info("--- Step 2: Monitoring Progress ---")

    end_time = datetime.now() + timedelta(minutes=run_time_min)
    inbox_q = f"{queue_name}_inbox"
    progress_q = f"{queue_name}_progress"
    result_q = f"{queue_name}_result"
    fail_q = f"{queue_name}_fail"

    while datetime.now() < end_time and not INTERRUPTED:
        try:
            inbox_len = redis_client.llen(inbox_q)
            progress_len = redis_client.hlen(progress_q)
            result_len = redis_client.hlen(result_q)
            fail_len = redis_client.hlen(fail_q)

            processed = result_len + fail_len
            success_len = 0
            if result_len > 0:
                # This is inefficient but gives a more accurate success count
                results = redis_client.hgetall(result_q)
                success_len = sum(1 for v in results.values() if '"status": "success"' in v)

            logging.info(
                f"Progress: {processed}/{total_urls} | "
                f"Success: {success_len} | Failed: {fail_len} | "
                f"In Progress: {progress_len} | Inbox: {inbox_len}"
            )
            if show_status:
                # This function connects directly to services to get status
                get_system_status(args, redis_client)
        except Exception as e:
            logging.error(f"Error while querying Redis for progress: {e}")

        # Wait for the interval, but check for interruption every second
        # for a more responsive shutdown.
        wait_until = time.time() + interval_min * 60
        while time.time() < wait_until and not INTERRUPTED:
            # Check if we are past the main end_time
            if datetime.now() >= end_time:
                break
            time.sleep(1)

    if INTERRUPTED:
        logging.info("Monitoring interrupted.")
    else:
        logging.info("Monitoring period has ended.")

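# Illustrative alternative (not part of the script): the success count above pulls the whole
# result hash with HGETALL on every poll. For large runs, redis-py's hscan_iter() streams the
# hash in chunks instead; same substring check, bounded memory. Assumes the client is created
# with decoded (str) responses, as the substring check above already implies.
def _count_successes(redis_client, result_q: str) -> int:
    return sum(
        1
        for _url, payload in redis_client.hscan_iter(result_q)
        if '"status": "success"' in payload
    )
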
# --- System Status Functions (Direct Connect) ---


def _list_proxy_statuses(client, server_identity=None):
    """Lists proxy statuses by connecting directly to the Thrift service."""
    logging.info(f"--- Proxy Statuses (Server: {server_identity or 'ALL'}) ---")
    try:
        statuses = client.getProxyStatus(server_identity)
        if not statuses:
            logging.info("No proxy statuses found.")
            return

        status_list = []
        headers = ["Server", "Proxy URL", "Status", "Success", "Failures", "Last Success", "Last Failure"]
        for s in statuses:
            status_list.append({
                "Server": s.serverIdentity, "Proxy URL": s.proxyUrl, "Status": s.status,
                "Success": s.successCount, "Failures": s.failureCount,
                "Last Success": format_timestamp(s.lastSuccessTimestamp),
                "Last Failure": format_timestamp(s.lastFailureTimestamp),
            })
        logging.info("\n" + tabulate(status_list, headers='keys', tablefmt='grid'))
    except (PBServiceException, PBUserException) as e:
        logging.error(f"Failed to get proxy statuses: {e.message}")
    except Exception as e:
        logging.error(f"An unexpected error occurred while getting proxy statuses: {e}", exc_info=True)


def _list_account_statuses(client, redis_client, account_id=None):
    """Lists account statuses from Thrift, enriched with live Redis data."""
    logging.info(f"--- Account Statuses (Account: {account_id or 'ALL'}) ---")
    try:
        statuses = client.getAccountStatus(accountId=account_id, accountPrefix=None)
        if not statuses:
            logging.info("No account statuses found.")
            return

        status_list = []
        for s in statuses:
            status_str = s.status
            if 'RESTING' in status_str:
                try:
                    expiry_ts_bytes = redis_client.hget(f"account_status:{s.accountId}", "resting_until")
                    if expiry_ts_bytes:
                        expiry_ts = float(expiry_ts_bytes)
                        now = datetime.now().timestamp()
                        if now < expiry_ts:
                            remaining_seconds = int(expiry_ts - now)
                            status_str = f"RESTING ({remaining_seconds}s left)"
                except Exception:
                    pass  # Ignore if parsing fails

            last_success = float(s.lastSuccessTimestamp) if s.lastSuccessTimestamp else 0
            last_failure = float(s.lastFailureTimestamp) if s.lastFailureTimestamp else 0
            last_activity = max(last_success, last_failure)

            status_list.append({
                "Account ID": s.accountId, "Status": status_str, "Success": s.successCount,
                "Failures": s.failureCount, "Last Success": format_timestamp(s.lastSuccessTimestamp),
                "Last Failure": format_timestamp(s.lastFailureTimestamp), "Last Proxy": s.lastUsedProxy or "N/A",
                "_last_activity": last_activity,
            })

        status_list.sort(key=lambda item: item.get('_last_activity', 0), reverse=True)
        for item in status_list:
            del item['_last_activity']

        logging.info("\n" + tabulate(status_list, headers='keys', tablefmt='grid'))
    except (PBServiceException, PBUserException) as e:
        logging.error(f"Failed to get account statuses: {e.message}")
    except Exception as e:
        logging.error(f"An unexpected error occurred while getting account statuses: {e}", exc_info=True)


def _list_client_statuses(redis_client):
    """Lists client statistics from Redis."""
    logging.info("--- Client Statuses ---")
    try:
        stats_key = "client_stats"
        all_stats_raw = redis_client.hgetall(stats_key)
        if not all_stats_raw:
            logging.info("No client stats found in Redis.")
            return

        status_list = []
        for client, stats_json in all_stats_raw.items():
            try:
                stats = json.loads(stats_json)

                def format_latest(data):
                    if not data: return "N/A"
                    ts = format_timestamp(data.get('timestamp'))
                    url = data.get('url', 'N/A')
                    video_id_match = re.search(r'v=([a-zA-Z0-9_-]{11})', url)
                    video_id = video_id_match.group(1) if video_id_match else 'N/A'
                    return f"{ts} ({video_id})"

                status_list.append({
                    "Client": client, "Success": stats.get('success_count', 0),
                    "Failures": stats.get('failure_count', 0),
                    "Last Success": format_latest(stats.get('latest_success')),
                    "Last Failure": format_latest(stats.get('latest_failure')),
                })
            except (json.JSONDecodeError, AttributeError):
                status_list.append({"Client": client, "Success": "ERROR", "Failures": "ERROR", "Last Success": "Parse Error", "Last Failure": "Parse Error"})

        status_list.sort(key=lambda item: item.get('Client', ''))
        logging.info("\n" + tabulate(status_list, headers='keys', tablefmt='grid'))
    except Exception as e:
        logging.error(f"An unexpected error occurred while getting client statuses: {e}", exc_info=True)


def get_system_status(args: argparse.Namespace, redis_client):
    """Connects to services and prints status tables."""
    logging.info("--- Getting System Status ---")
    client, transport = None, None
    try:
        client, transport = get_thrift_client(args.management_host, args.management_port)
        _list_proxy_statuses(client)
        _list_account_statuses(client, redis_client)
        _list_client_statuses(redis_client)
    except Exception as e:
        logging.error(f"Could not get system status: {e}")
    finally:
        if transport and transport.isOpen():
            transport.close()

def step_3_generate_report(redis_client, queue_name: str, report_file: str | None):
    """Generates a CSV report of failed items."""
    logging.info("--- Step 3: Generating Report ---")
    fail_q = f"{queue_name}_fail"

    failed_items = redis_client.hgetall(fail_q)
    if not failed_items:
        logging.info("No items found in the fail queue. No report will be generated.")
        return

    logging.info(f"Found {len(failed_items)} failed items. Writing to report...")

    report_data = []
    for url, data_json in failed_items.items():
        try:
            data = json.loads(data_json)
            error_details = data.get('error_details', {})
            report_data.append({
                'url': url,
                'video_id': _normalize_to_url(url).split('v=')[-1] if _normalize_to_url(url) else 'N/A',
                'error_message': error_details.get('error_message', 'N/A'),
                'error_code': error_details.get('error_code', 'N/A'),
                'proxy_url': error_details.get('proxy_url', 'N/A'),
                'timestamp': datetime.fromtimestamp(data.get('end_time', 0)).isoformat(),
            })
        except (json.JSONDecodeError, AttributeError):
            report_data.append({'url': url, 'video_id': 'N/A', 'error_message': 'Could not parse error data', 'error_code': 'PARSE_ERROR', 'proxy_url': 'N/A', 'timestamp': 'N/A'})

    if report_file:
        try:
            with open(report_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=report_data[0].keys())
                writer.writeheader()
                writer.writerows(report_data)
            logging.info(f"Successfully wrote report to {report_file}")
        except IOError as e:
            logging.error(f"Could not write report to file {report_file}: {e}")
    else:
        # Print to stdout if no file is specified
        logging.info("--- Failure Report (stdout) ---")
        for item in report_data:
            logging.info(f"URL: {item['url']}, Error: {item['error_code']} - {item['error_message']}")
        logging.info("--- End of Report ---")

def handle_interruption(redis_client, queue_name, report_file):
    """Graceful shutdown logic for when the script is interrupted."""
    logging.warning("--- Interruption Detected: Starting Shutdown Procedure ---")

    # 1. Pause DAGs
    _pause_dag("ytdlp_ops_orchestrator")
    _pause_dag("ytdlp_ops_dispatcher")

    # 2. Fail running per_url jobs
    _fail_running_dag_runs("ytdlp_ops_worker_per_url")

    # 3. Generate report
    logging.info("Generating final report due to interruption...")
    step_3_generate_report(redis_client, queue_name, report_file)
    # Also print to stdout if a file was specified, so the user sees it immediately
    if report_file:
        logging.info("Printing report to stdout as well...")
        step_3_generate_report(redis_client, queue_name, None)


def step_4_cleanup_queues(redis_client, queue_name: str):
    """Cleans up the Redis queues used by the test."""
    logging.info("--- Step 4: Cleaning Up Queues ---")
    queues_to_delete = [
        f"{queue_name}_inbox",
        f"{queue_name}_progress",
        f"{queue_name}_result",
        f"{queue_name}_fail",
    ]
    logging.warning(f"This will delete the following Redis keys: {queues_to_delete}")

    deleted_count = redis_client.delete(*queues_to_delete)
    logging.info(f"Cleanup complete. Deleted {deleted_count} key(s).")

def main():
    """Main function to parse arguments and run the regression test."""
    # Register the signal handler for Ctrl+C
    signal.signal(signal.SIGINT, signal_handler)

    parser = argparse.ArgumentParser(description="Run a regression test for the ytdlp-ops system.")

    # Environment
    parser.add_argument("--redis-host", type=str, default="redis", help="Hostname or IP address of the Redis server. Defaults to 'redis' for in-container execution.")
    parser.add_argument("--management-host", type=str, default=os.getenv("MANAGEMENT_SERVICE_HOST", "envoy-thrift-lb"), help="Hostname of the management Thrift service.")
    parser.add_argument("--management-port", type=int, default=int(os.getenv("MANAGEMENT_SERVICE_PORT", 9080)), help="Port of the management Thrift service.")

    # Test Configuration
    parser.add_argument("--client", type=str, required=True, help="Client persona to test (e.g., 'mweb').")
    parser.add_argument("--workers", type=int, required=True, help="Total number of worker loops to start.")
    parser.add_argument("--workers-per-bunch", type=int, default=1, help="Number of workers per bunch.")
    parser.add_argument("--run-time-min", type=int, required=True, help="How long to let the test run, in minutes.")
    parser.add_argument("--input-file", type=str, help="Path to a file containing video IDs/URLs. If not provided, the existing queue will be used.")

    # Monitoring & Reporting
    parser.add_argument("--progress-interval-min", type=int, default=2, help="How often to query and print progress, in minutes.")
    parser.add_argument("--report-file", type=str, help="Path to a CSV file to write the list of failed URLs to.")
    parser.add_argument("--show-status", action="store_true", help="If set, show proxy and account statuses during progress monitoring.")

    # Actions
    parser.add_argument("--cleanup", action="store_true", help="If set, clear the Redis queues after the test completes.")
    parser.add_argument("--skip-populate", action="store_true", help="If set, skip populating the queue (assumes it's already populated).")
    parser.add_argument("--skip-trigger", action="store_true", help="If set, skip triggering the orchestrator (assumes it's already running).")

    args = parser.parse_args()

    # --- Setup ---
    redis_password = os.getenv("REDIS_PASSWORD")
    if not redis_password:
        logging.error("REDIS_PASSWORD not found in environment. Please set it in your .env file.")
        sys.exit(1)

    # Use the provided redis-host, defaulting to 'redis' for in-container execution
    redis_url = f"redis://:{redis_password}@{args.redis_host}:6379/0"
    redis_client = _get_redis_client(redis_url)

    queue_name = "video_queue"  # Hardcoded for now, could be an arg
    total_urls = 0

    # --- Execution ---
    if not args.skip_populate:
        if args.input_file:
            total_urls = step_0_populate_queue(redis_client, queue_name, args.input_file)
        else:
            logging.info("No input file provided, using existing queue.")
            total_urls = redis_client.llen(f"{queue_name}_inbox")
            if total_urls == 0:
                logging.warning("Queue is empty and no input file was provided. The test may not have any work to do.")
    else:
        total_urls = redis_client.llen(f"{queue_name}_inbox")
        logging.info(f"Skipping population. Found {total_urls} URLs in the inbox.")

    if not args.skip_trigger:
        step_1_trigger_orchestrator(args)
    else:
        logging.info("Skipping orchestrator trigger.")

    step_2_monitor_progress(args, redis_client, queue_name, total_urls, args.run_time_min, args.progress_interval_min, args.show_status)

    if INTERRUPTED:
        handle_interruption(redis_client, queue_name, args.report_file)
    else:
        step_3_generate_report(redis_client, queue_name, args.report_file)

    if args.cleanup:
        step_4_cleanup_queues(redis_client, queue_name)

    if INTERRUPTED:
        logging.warning("Regression test script finished due to user interruption.")
        sys.exit(130)  # Standard exit code for Ctrl+C
    else:
        logging.info("Regression test script finished.")


if __name__ == "__main__":
    main()
@ -4,6 +4,9 @@ DAG to manage the state of proxies and accounts used by the ytdlp-ops-server.
from __future__ import annotations
from __future__ import annotations
import logging
import logging
import json
import re
import time
from datetime import datetime
from datetime import datetime
import socket
import socket
@ -208,6 +211,112 @@ def _list_account_statuses(client, account_id, redis_conn_id):
        print(f"\nERROR: An unexpected error occurred: {e}\n")
        print(f"\nERROR: An unexpected error occurred: {e}\n")


def _list_client_statuses(redis_conn_id):
    """Lists the status of different client types from Redis."""
    logger.info("Listing client statuses from Redis key 'client_stats'")

    try:
        redis_client = _get_redis_client(redis_conn_id)
        stats_key = "client_stats"
        all_stats_raw = redis_client.hgetall(stats_key)

        if not all_stats_raw:
            print("\n--- Client Statuses ---\nNo client stats found in Redis.\n-----------------------\n")
            return

        from tabulate import tabulate
        status_list = []

        for client_bytes, stats_json_bytes in all_stats_raw.items():
            client_name = client_bytes.decode('utf-8')
            try:
                stats = json.loads(stats_json_bytes.decode('utf-8'))

                def format_latest(data):
                    if not data: return "N/A"
                    ts = format_timestamp(data.get('timestamp'))
                    url = data.get('url') or 'N/A'
                    machine = data.get('machine_id', 'N/A')
                    video_id_match = re.search(r'v=([a-zA-Z0-9_-]{11})', url)
                    video_id = video_id_match.group(1) if video_id_match else 'N/A'
                    return f"{ts}\nMachine: {machine}\nVideo ID: {video_id}"

                status_item = {
                    "Client": client_name,
                    "Success": stats.get('success_count', 0),
                    "Failures": stats.get('failure_count', 0),
                    "Last Success": format_latest(stats.get('latest_success')),
                    "Last Failure": format_latest(stats.get('latest_failure')),
                }
                status_list.append(status_item)
            except (json.JSONDecodeError, AttributeError) as e:
                logger.error(f"Could not parse stats for client '{client_name}': {e}")
                status_list.append({
                    "Client": client_name, "Success": "ERROR", "Failures": "ERROR",
                    "Last Success": "Could not parse data", "Last Failure": "Could not parse data"
                })

        status_list.sort(key=lambda item: item.get('Client', ''))

        print("\n--- Client Statuses ---")
        print(f"\n{tabulate(status_list, headers='keys', tablefmt='grid')}")
        print("-----------------------\n")

    except Exception as e:
        logger.error(f"An unexpected error occurred while getting client statuses: {e}", exc_info=True)
        print(f"\nERROR: An unexpected error occurred: {e}\n")


def _list_activity_counters(redis_conn_id: str):
    """Lists current activity rates for proxies and accounts from Redis."""
    logger.info("Listing activity counters from Redis keys 'activity:per_proxy:*' and 'activity:per_account:*'")

    try:
        redis_client = _get_redis_client(redis_conn_id)
        from tabulate import tabulate
        now = time.time()

        def process_keys(pattern, entity_name):
            keys = redis_client.scan_iter(pattern)
            status_list = []
            for key_bytes in keys:
                key = key_bytes.decode('utf-8')
                entity_id = key.split(':', 2)[-1]

                # Clean up old entries before counting
                redis_client.zremrangebyscore(key, '-inf', now - 3660)  # Clean up > 1hr old

                count_1m = redis_client.zcount(key, now - 60, now)
                count_5m = redis_client.zcount(key, now - 300, now)
                count_1h = redis_client.zcount(key, now - 3600, now)

                if count_1h == 0:  # Don't show entities with no recent activity
                    continue

                status_list.append({
                    entity_name: entity_id,
                    "Activity (Last 1m)": count_1m,
                    "Activity (Last 5m)": count_5m,
                    "Activity (Last 1h)": count_1h,
                })

            status_list.sort(key=lambda item: item.get(entity_name, ''))

            print(f"\n--- {entity_name} Activity Counters ---")
            if not status_list:
                print(f"No recent activity found for {entity_name.lower()}s.")
            else:
                print(f"\n{tabulate(status_list, headers='keys', tablefmt='grid')}")
            print("-----------------------------------\n")

        process_keys("activity:per_proxy:*", "Proxy URL")
        process_keys("activity:per_account:*", "Account ID")

    except Exception as e:
        logger.error(f"An unexpected error occurred while getting activity counters: {e}", exc_info=True)
        print(f"\nERROR: An unexpected error occurred: {e}\n")

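# Illustrative sketch of the write side these counters assume (the producer code is not part
# of this DAG): each request appends a timestamped member to a per-entity sorted set, e.g.
# activity:per_proxy:<proxy_url>, which zcount() above then buckets into 1m/5m/1h windows.
# The member value only needs to be unique; the score is what the time windows are computed from.
def _record_activity(redis_client, entity_key):
    now = time.time()
    redis_client.zadd(entity_key, {f"{now}": now})
    redis_client.expire(entity_key, 2 * 3600)  # keep roughly two hours of history around
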
def manage_system_callable(**context):
|
def manage_system_callable(**context):
|
||||||
"""Main callable to interact with the system management endpoints."""
|
"""Main callable to interact with the system management endpoints."""
|
||||||
# Log version for debugging
|
# Log version for debugging
|
||||||
@ -218,7 +327,7 @@ def manage_system_callable(**context):
|
|||||||
action = params["action"]
|
action = params["action"]
|
||||||
|
|
||||||
# For Thrift actions, use the new management host/port
|
# For Thrift actions, use the new management host/port
|
||||||
if entity not in ["airflow_meta"]:
|
if entity not in ["airflow_meta", "activity_counters"]:
|
||||||
host = params["management_host"]
|
host = params["management_host"]
|
||||||
port = params["management_port"]
|
port = params["management_port"]
|
||||||
else:
|
else:
|
||||||
@ -232,8 +341,10 @@ def manage_system_callable(**context):
|
|||||||
valid_actions = {
|
valid_actions = {
|
||||||
"proxy": ["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
|
"proxy": ["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
|
||||||
"account": ["list_with_status", "ban", "unban", "unban_all", "delete_from_redis"],
|
"account": ["list_with_status", "ban", "unban", "unban_all", "delete_from_redis"],
|
||||||
|
"client": ["list_with_status", "delete_from_redis"],
|
||||||
"accounts_and_proxies": ["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
|
"accounts_and_proxies": ["list_with_status", "ban", "unban", "ban_all", "unban_all", "delete_from_redis"],
|
||||||
"airflow_meta": ["clear_dag_runs"],
|
"airflow_meta": ["clear_dag_runs"],
|
||||||
|
"activity_counters": ["list_with_status"],
|
||||||
}
|
}
|
||||||
|
|
||||||
if action not in valid_actions.get(entity, []):
|
if action not in valid_actions.get(entity, []):
|
||||||
@ -287,7 +398,15 @@ def manage_system_callable(**context):
|
|||||||
# The session is committed automatically by the `with create_session()` context manager.
|
# The session is committed automatically by the `with create_session()` context manager.
|
||||||
logger.info(f"Successfully deleted {deleted_count} DagRun(s) for DAG '{dag_id}'.")
|
logger.info(f"Successfully deleted {deleted_count} DagRun(s) for DAG '{dag_id}'.")
|
||||||
print(f"\nSuccessfully deleted {deleted_count} DagRun(s) for DAG '{dag_id}'.\n")
|
print(f"\nSuccessfully deleted {deleted_count} DagRun(s) for DAG '{dag_id}'.\n")
|
||||||
return # End execution
|
return # End execution
|
||||||
|
|
||||||
|
# --- Handle Activity Counter action ---
|
||||||
|
if entity == "activity_counters":
|
||||||
|
if action == "list_with_status":
|
||||||
|
_list_activity_counters(params["redis_conn_id"])
|
||||||
|
return # End execution
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Action '{action}' is not valid for entity 'activity_counters'. Only 'list_with_status' is supported.")
|
||||||
|
|
||||||
# Handle Thrift-based deletion actions
|
# Handle Thrift-based deletion actions
|
||||||
if action == "delete_from_redis":
|
if action == "delete_from_redis":
|
||||||
@ -355,6 +474,15 @@ def manage_system_callable(**context):
|
|||||||
print(f"\nSuccessfully deleted {proxy_result} proxy keys for server '{server_identity}' from Redis.\n")
|
print(f"\nSuccessfully deleted {proxy_result} proxy keys for server '{server_identity}' from Redis.\n")
|
||||||
else:
|
else:
|
||||||
print(f"\nSuccessfully deleted {proxy_result} proxy keys from Redis across ALL servers.\n")
|
print(f"\nSuccessfully deleted {proxy_result} proxy keys from Redis across ALL servers.\n")
|
||||||
|
|
||||||
|
elif entity == "client":
|
||||||
|
logger.info("Deleting all client stats from Redis...")
|
||||||
|
redis_client = _get_redis_client(params["redis_conn_id"])
|
||||||
|
result = redis_client.delete("client_stats")
|
||||||
|
if result > 0:
|
||||||
|
print(f"\nSuccessfully deleted 'client_stats' key from Redis.\n")
|
||||||
|
else:
|
||||||
|
print(f"\nKey 'client_stats' not found in Redis. Nothing to delete.\n")
|
||||||
|
|
||||||
except (PBServiceException, PBUserException) as e:
|
except (PBServiceException, PBUserException) as e:
|
||||||
logger.error(f"Thrift error performing delete action: {e.message}", exc_info=True)
|
logger.error(f"Thrift error performing delete action: {e.message}", exc_info=True)
|
||||||
@ -374,7 +502,10 @@ def manage_system_callable(**context):
|
|||||||
try:
|
try:
|
||||||
client, transport = get_thrift_client(host, port)
|
client, transport = get_thrift_client(host, port)
|
||||||
|
|
||||||
if entity == "proxy":
|
if entity == "client":
|
||||||
|
if action == "list_with_status":
|
||||||
|
_list_client_statuses(params["redis_conn_id"])
|
||||||
|
elif entity == "proxy":
|
||||||
if action == "list_with_status":
|
if action == "list_with_status":
|
||||||
_list_proxy_statuses(client, server_identity)
|
_list_proxy_statuses(client, server_identity)
|
||||||
elif action == "ban":
|
elif action == "ban":
|
||||||
@ -497,6 +628,13 @@ def manage_system_callable(**context):
|
|||||||
_list_account_statuses(client, account_prefix, params["redis_conn_id"])
|
_list_account_statuses(client, account_prefix, params["redis_conn_id"])
|
||||||
|
|
||||||
elif entity == "accounts_and_proxies":
|
elif entity == "accounts_and_proxies":
|
||||||
|
if action == "list_with_status":
|
||||||
|
print("\n--- Listing statuses for Proxies, Accounts, and Clients ---")
|
||||||
|
_list_proxy_statuses(client, server_identity)
|
||||||
|
_list_account_statuses(client, account_id, params["redis_conn_id"])
|
||||||
|
_list_client_statuses(params["redis_conn_id"])
|
||||||
|
return # End execution for list_with_status
|
||||||
|
|
||||||
print(f"\n--- Performing action '{action}' on BOTH Proxies and Accounts ---")
|
print(f"\n--- Performing action '{action}' on BOTH Proxies and Accounts ---")
|
||||||
|
|
||||||
# --- Proxy Action ---
|
# --- Proxy Action ---
|
||||||
@ -674,7 +812,7 @@ with DAG(
|
|||||||
"entity": Param(
|
"entity": Param(
|
||||||
"accounts_and_proxies",
|
"accounts_and_proxies",
|
||||||
type="string",
|
type="string",
|
||||||
enum=["account", "proxy", "accounts_and_proxies", "airflow_meta"],
|
enum=["account", "proxy", "client", "accounts_and_proxies", "activity_counters", "airflow_meta"],
|
||||||
description="The type of entity to manage.",
|
description="The type of entity to manage.",
|
||||||
),
|
),
|
||||||
"action": Param(
|
"action": Param(
|
||||||
@ -698,6 +836,13 @@ with DAG(
|
|||||||
- `unban_all`: Sets the status of all accounts (or those matching a prefix in `account_id`) to `ACTIVE`.
|
- `unban_all`: Sets the status of all accounts (or those matching a prefix in `account_id`) to `ACTIVE`.
|
||||||
- `delete_from_redis`: **(Destructive)** Deletes account status from Redis via Thrift service. This permanently removes the account from being tracked by the system. If `account_id` is provided, it deletes that specific account. If `account_id` is provided as a prefix, it deletes all accounts matching that prefix. If `account_id` is empty, it deletes ALL accounts.
|
- `delete_from_redis`: **(Destructive)** Deletes account status from Redis via Thrift service. This permanently removes the account from being tracked by the system. If `account_id` is provided, it deletes that specific account. If `account_id` is provided as a prefix, it deletes all accounts matching that prefix. If `account_id` is empty, it deletes ALL accounts.
|
||||||
|
|
||||||
|
#### Actions for `entity: client`
|
||||||
|
- `list_with_status`: View success/failure statistics for each client type.
|
||||||
|
- `delete_from_redis`: **(Destructive)** Deletes all client stats from Redis.
|
||||||
|
|
||||||
|
#### Actions for `entity: activity_counters`
|
||||||
|
- `list_with_status`: View current activity rates (ops/min, ops/hr) for proxies and accounts.
|
||||||
|
|
||||||
#### Actions for `entity: accounts_and_proxies`
|
#### Actions for `entity: accounts_and_proxies`
|
||||||
- This entity performs the selected action on **both** proxies and accounts where applicable.
|
- This entity performs the selected action on **both** proxies and accounts where applicable.
|
||||||
- `list_with_status`: View statuses for both proxies and accounts.
|
- `list_with_status`: View statuses for both proxies and accounts.
|
||||||
@ -735,9 +880,9 @@ with DAG(
|
|||||||
description="The Airflow connection ID for the Redis server (used for 'delete_from_redis' and for fetching detailed account status).",
|
description="The Airflow connection ID for the Redis server (used for 'delete_from_redis' and for fetching detailed account status).",
|
||||||
),
|
),
|
||||||
"dag_id_to_manage": Param(
|
"dag_id_to_manage": Param(
|
||||||
"ytdlp_ops_worker_per_url",
|
"ytdlp_ops_v01_worker_per_url",
|
||||||
type="string",
|
type="string",
|
||||||
enum=["ytdlp_ops_worker_per_url", "ytdlp_ops_orchestrator"],
|
enum=["ytdlp_ops_v01_orchestrator", "ytdlp_ops_v01_dispatcher", "ytdlp_ops_v01_worker_per_url", "ytdlp_ops_v02_orchestrator_auth", "ytdlp_ops_v02_dispatcher_auth", "ytdlp_ops_v02_worker_per_url_auth", "ytdlp_ops_v02_orchestrator_dl", "ytdlp_ops_v02_dispatcher_dl", "ytdlp_ops_v02_worker_per_url_dl"],
|
||||||
title="[Airflow Meta] DAG ID",
|
title="[Airflow Meta] DAG ID",
|
||||||
description="The DAG ID to perform the action on.",
|
description="The DAG ID to perform the action on.",
|
||||||
),
|
),
|
||||||
|
|||||||
@ -254,7 +254,18 @@ def clear_queue_callable(**context):
|
|||||||
ti = context['task_instance']
|
ti = context['task_instance']
|
||||||
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
|
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
|
||||||
redis_conn_id = params['redis_conn_id']
|
redis_conn_id = params['redis_conn_id']
|
||||||
queue_base_name = params['queue_base_name']
|
|
||||||
|
queue_system = params.get('queue_system', 'v1_monolithic')
|
||||||
|
if queue_system == 'v1_monolithic':
|
||||||
|
queue_base_name = params['queue_base_name']
|
||||||
|
elif queue_system == 'v2_separated_auth':
|
||||||
|
queue_base_name = 'queue2_auth'
|
||||||
|
elif queue_system == 'v2_separated_dl':
|
||||||
|
queue_base_name = 'queue2_dl'
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid queue_system: {queue_system}")
|
||||||
|
logger.info(f"Operating on queue system '{queue_system}' with base name '{queue_base_name}'.")
|
||||||
|
|
||||||
queues_to_clear_options = params.get('queues_to_clear_options', [])
|
queues_to_clear_options = params.get('queues_to_clear_options', [])
|
||||||
confirm_clear = params.get('confirm_clear', False)
|
confirm_clear = params.get('confirm_clear', False)
|
||||||
dump_queues = params['dump_queues']
|
dump_queues = params['dump_queues']
|
||||||
@ -386,50 +397,77 @@ def check_status_callable(**context):
|
|||||||
ti = context['task_instance']
|
ti = context['task_instance']
|
||||||
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
|
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
|
||||||
redis_conn_id = params['redis_conn_id']
|
redis_conn_id = params['redis_conn_id']
|
||||||
queue_name = params.get('queue_base_name', DEFAULT_QUEUE_NAME)
|
queue_system = params.get('queue_system', 'v1_monolithic')
|
||||||
queue_suffixes = ['_inbox', '_progress', '_result', '_fail']
|
|
||||||
|
queue_base_names_to_check = []
|
||||||
|
if queue_system == 'v1_monolithic':
|
||||||
|
queue_base_names_to_check.append(params.get('queue_base_name', DEFAULT_QUEUE_NAME))
|
||||||
|
elif queue_system.startswith('v2_'):
|
||||||
|
# For v2, always check both auth and dl queues for a complete picture.
|
||||||
|
queue_base_names_to_check.extend(['queue2_auth', 'queue2_dl'])
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid queue_system: {queue_system}")
|
||||||
|
|
||||||
logger.info(f"--- Checking Status for Queues with Base Name: '{queue_name}' ---")
|
queue_suffixes = ['_inbox', '_progress', '_result', '_fail']
|
||||||
|
|
||||||
|
logger.info(f"--- Checking Status for Queue System: '{queue_system}' ---")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
for suffix in queue_suffixes:
|
for queue_name in queue_base_names_to_check:
|
||||||
queue_to_check = f"{queue_name}{suffix}"
|
logger.info(f"--- Base Name: '{queue_name}' ---")
|
||||||
key_type = redis_client.type(queue_to_check).decode('utf-8')
|
for suffix in queue_suffixes:
|
||||||
size = 0
|
queue_to_check = f"{queue_name}{suffix}"
|
||||||
if key_type == 'list':
|
key_type = redis_client.type(queue_to_check).decode('utf-8')
|
||||||
size = redis_client.llen(queue_to_check)
|
size = 0
|
||||||
elif key_type == 'hash':
|
if key_type == 'list':
|
||||||
size = redis_client.hlen(queue_to_check)
|
size = redis_client.llen(queue_to_check)
|
||||||
|
elif key_type == 'hash':
|
||||||
|
size = redis_client.hlen(queue_to_check)
|
||||||
|
|
||||||
if key_type != 'none':
|
if key_type != 'none':
|
||||||
logger.info(f" - Queue '{queue_to_check}': Type='{key_type.upper()}', Size={size}")
|
logger.info(f" - Queue '{queue_to_check}': Type='{key_type.upper()}', Size={size}")
|
||||||
else:
|
else:
|
||||||
logger.info(f" - Queue '{queue_to_check}': Does not exist.")
|
logger.info(f" - Queue '{queue_to_check}': Does not exist.")
|
||||||
|
|
||||||
logger.info(f"--- End of Status Check ---")
|
logger.info(f"--- End of Status Check ---")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to check queue status for base name '{queue_name}': {e}", exc_info=True)
|
logger.error(f"Failed to check queue status for system '{queue_system}': {e}", exc_info=True)
|
||||||
raise AirflowException(f"Failed to check queue status: {e}")
|
raise AirflowException(f"Failed to check queue status: {e}")
|
||||||
|
|
||||||
|
|
||||||
def requeue_failed_callable(**context):
|
def requeue_failed_callable(**context):
|
||||||
"""
|
"""
|
||||||
Copies all URLs from the fail hash to the inbox list and optionally clears the fail hash.
|
Copies all URLs from the fail hash to the inbox list and optionally clears the fail hash.
|
||||||
|
Adapts behavior for v1 and v2 queue systems.
|
||||||
"""
|
"""
|
||||||
params = context['params']
|
params = context['params']
|
||||||
ti = context['task_instance']
|
ti = context['task_instance']
|
||||||
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
|
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
|
||||||
redis_conn_id = params['redis_conn_id']
|
redis_conn_id = params['redis_conn_id']
|
||||||
queue_name = params['queue_base_name']
|
|
||||||
clear_fail_queue = params['clear_fail_queue_after_requeue']
|
clear_fail_queue = params['clear_fail_queue_after_requeue']
|
||||||
|
queue_system = params.get('queue_system', 'v1_monolithic')
|
||||||
|
|
||||||
fail_queue_name = f"{queue_name}_fail"
|
fail_queue_name = ""
|
||||||
inbox_queue_name = f"{queue_name}_inbox"
|
inbox_queue_name = ""
|
||||||
|
|
||||||
logger.info(f"Requeuing failed URLs from '{fail_queue_name}' to '{inbox_queue_name}'.")
|
if queue_system == 'v1_monolithic':
|
||||||
|
queue_name = params['queue_base_name']
|
||||||
|
fail_queue_name = f"{queue_name}_fail"
|
||||||
|
inbox_queue_name = f"{queue_name}_inbox"
|
||||||
|
elif queue_system == 'v2_separated_auth':
|
||||||
|
fail_queue_name = "queue2_auth_fail"
|
||||||
|
inbox_queue_name = "queue2_auth_inbox"
|
||||||
|
elif queue_system == 'v2_separated_dl':
|
||||||
|
fail_queue_name = "queue2_dl_fail"
|
||||||
|
# DL failures must be re-authenticated, so they go back to the auth inbox.
|
||||||
|
inbox_queue_name = "queue2_auth_inbox"
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid queue_system: {queue_system}")
|
||||||
|
|
||||||
|
logger.info(f"Requeuing failed URLs from '{fail_queue_name}' to '{inbox_queue_name}' (system: {queue_system}).")
|
||||||
|
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
redis_client = _get_redis_client(redis_conn_id)
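# Illustrative refactor sketch (not in the DAG): the v1/v2 branching above could be centralized
# in one helper so clear_queue, check_status and requeue_failed all resolve queue names the same
# way. The names mirror the branches shown; anything beyond them would be an assumption.
def _resolve_queues(queue_system, queue_base_name):
    """Return (fail_queue, inbox_queue) for the selected queue system."""
    if queue_system == 'v1_monolithic':
        return f"{queue_base_name}_fail", f"{queue_base_name}_inbox"
    if queue_system == 'v2_separated_auth':
        return "queue2_auth_fail", "queue2_auth_inbox"
    if queue_system == 'v2_separated_dl':
        # DL failures are requeued for re-authentication.
        return "queue2_dl_fail", "queue2_auth_inbox"
    raise ValueError(f"Invalid queue_system: {queue_system}")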
|
||||||
|
|
||||||
@ -478,7 +516,15 @@ def add_videos_to_queue_callable(**context):
|
|||||||
params = context["params"]
|
params = context["params"]
|
||||||
ti = context['task_instance']
|
ti = context['task_instance']
|
||||||
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
|
logger.info(f"Task '{ti.task_id}' running on queue '{ti.queue}'.")
|
||||||
queue_name = params["queue_base_name"]
|
|
||||||
|
queue_system = params.get('queue_system', 'v1_monolithic')
|
||||||
|
if queue_system.startswith('v2_'):
|
||||||
|
# For v2 systems, raw URLs are always added to the auth queue.
|
||||||
|
queue_name = 'queue2_auth'
|
||||||
|
logger.info(f"Queue system is '{queue_system}'. Adding URLs to '{queue_name}_inbox'.")
|
||||||
|
else:
|
||||||
|
queue_name = params["queue_base_name"]
|
||||||
|
|
||||||
redis_conn_id = params["redis_conn_id"]
|
redis_conn_id = params["redis_conn_id"]
|
||||||
dry_run = params["dry_run"]
|
dry_run = params["dry_run"]
|
||||||
|
|
||||||
@ -565,11 +611,18 @@ with DAG(
|
|||||||
title="Action",
|
title="Action",
|
||||||
description="The management action to perform.",
|
description="The management action to perform.",
|
||||||
),
|
),
|
||||||
|
"queue_system": Param(
|
||||||
|
"v1_monolithic",
|
||||||
|
type="string",
|
||||||
|
enum=["v1_monolithic", "v2_separated_auth", "v2_separated_dl"],
|
||||||
|
title="Queue System",
|
||||||
|
description="Select the target queue system to manage. This choice affects which queues are targeted by actions.",
|
||||||
|
),
|
||||||
"queue_base_name": Param(
|
"queue_base_name": Param(
|
||||||
DEFAULT_QUEUE_NAME,
|
DEFAULT_QUEUE_NAME,
|
||||||
type="string",
|
type="string",
|
||||||
title="Queue Base Name",
|
title="Queue Base Name (v1 only)",
|
||||||
description="Base name for queues used in actions like 'add_videos', 'check_status', 'clear_queue', 'requeue_failed'.",
|
description="Base name for queues. Only used when 'Queue System' is 'v1_monolithic'.",
|
||||||
),
|
),
|
||||||
# --- Params for 'add_videos' ---
|
# --- Params for 'add_videos' ---
|
||||||
"input_source": Param(
|
"input_source": Param(
|
||||||
@ -644,7 +697,7 @@ with DAG(
|
|||||||
),
|
),
|
||||||
# --- Params for 'list_contents' ---
|
# --- Params for 'list_contents' ---
|
||||||
"queue_to_list": Param(
|
"queue_to_list": Param(
|
||||||
'video_queue_inbox,video_queue_result,video_queue_fail',
|
'video_queue_inbox,queue2_auth_inbox,queue2_dl_result',
|
||||||
type="string",
|
type="string",
|
||||||
title="[list_contents] Queues to List",
|
title="[list_contents] Queues to List",
|
||||||
description="Comma-separated list of exact Redis key names to list.",
|
description="Comma-separated list of exact Redis key names to list.",
|
||||||
|
|||||||
@ -22,6 +22,7 @@ from datetime import datetime
|
|||||||
from airflow.decorators import task
|
from airflow.decorators import task
|
||||||
from airflow.models import Variable
|
from airflow.models import Variable
|
||||||
from airflow.models.dag import DAG
|
from airflow.models.dag import DAG
|
||||||
|
from airflow.models.param import Param
|
||||||
from airflow.utils.dates import days_ago
|
from airflow.utils.dates import days_ago
|
||||||
|
|
||||||
# Import utility functions and Thrift modules
|
# Import utility functions and Thrift modules
|
||||||
@ -42,7 +43,7 @@ DEFAULT_ARGS = {
|
|||||||
'owner': 'airflow',
|
'owner': 'airflow',
|
||||||
'retries': 1,
|
'retries': 1,
|
||||||
'retry_delay': 30,
|
'retry_delay': 30,
|
||||||
'queue': 'maintenance',
|
'queue': 'default',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -61,38 +62,76 @@ def _get_thrift_client(host, port, timeout=60):
|
|||||||
|
|
||||||
|
|
||||||
@task
|
@task
|
||||||
def manage_account_states():
|
def manage_account_states(**context):
|
||||||
"""
|
"""
|
||||||
Fetches all account statuses and performs necessary state transitions.
|
Fetches all account statuses and performs necessary state transitions
|
||||||
|
based on time durations configured in the DAG parameters.
|
||||||
"""
|
"""
|
||||||
|
params = context['params']
|
||||||
|
requests_limit = params['account_requests_limit']
|
||||||
|
cooldown_duration_s = params['account_cooldown_duration_min'] * 60
|
||||||
|
ban_duration_s = params['account_ban_duration_hours'] * 3600
|
||||||
|
|
||||||
host = DEFAULT_YT_AUTH_SERVICE_IP
|
host = DEFAULT_YT_AUTH_SERVICE_IP
|
||||||
port = int(DEFAULT_YT_AUTH_SERVICE_PORT)
|
port = int(DEFAULT_YT_AUTH_SERVICE_PORT)
|
||||||
redis_conn_id = DEFAULT_REDIS_CONN_ID
|
redis_conn_id = DEFAULT_REDIS_CONN_ID
|
||||||
|
logger.info(f"Starting account maintenance. Service: {host}:{port}, Redis: {redis_conn_id}")
|
||||||
|
logger.info(f"Using limits: Requests={requests_limit}, Cooldown={params['account_cooldown_duration_min']}m, Ban={params['account_ban_duration_hours']}h")
|
||||||
|
|
||||||
client, transport = None, None
|
client, transport = None, None
|
||||||
try:
|
try:
|
||||||
client, transport = _get_thrift_client(host, port)
|
client, transport = _get_thrift_client(host, port)
|
||||||
redis_client = _get_redis_client(redis_conn_id)
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
logger.info("Fetching all account statuses from the service...")
|
logger.info(f"--- Step 1: Fetching all account statuses from the ytdlp-ops-server at {host}:{port}... ---")
|
||||||
all_accounts = client.getAccountStatus(accountPrefix=None)
|
all_accounts = client.getAccountStatus(accountId=None, accountPrefix=None)
|
||||||
logger.info(f"Found {len(all_accounts)} accounts to process.")
|
logger.info(f"Found {len(all_accounts)} total accounts to process.")
|
||||||
|
|
||||||
accounts_to_unban = []
|
accounts_to_unban = []
|
||||||
accounts_to_activate = []
|
accounts_to_activate = []
|
||||||
accounts_to_rest = []
|
accounts_to_rest = []
|
||||||
|
|
||||||
|
now_ts = int(time.time())
|
||||||
|
|
||||||
for acc in all_accounts:
|
for acc in all_accounts:
|
||||||
if acc.status == "BANNED (expired)":
|
# Thrift can return 0 for unset integer fields.
|
||||||
|
# The AccountStatus thrift object is missing status_changed_timestamp and active_since_timestamp.
|
||||||
|
# We use available timestamps as proxies.
|
||||||
|
last_failure_ts = int(acc.lastFailureTimestamp or 0)
|
||||||
|
last_success_ts = int(acc.lastSuccessTimestamp or 0)
|
||||||
|
last_usage_ts = max(last_failure_ts, last_success_ts)
|
||||||
|
|
||||||
|
if acc.status == "BANNED" and last_failure_ts > 0 and (now_ts - last_failure_ts) >= ban_duration_s:
|
||||||
accounts_to_unban.append(acc.accountId)
|
accounts_to_unban.append(acc.accountId)
|
||||||
elif acc.status == "RESTING (expired)":
|
elif acc.status == "RESTING" and last_usage_ts > 0 and (now_ts - last_usage_ts) >= cooldown_duration_s:
|
||||||
accounts_to_activate.append(acc.accountId)
|
accounts_to_activate.append(acc.accountId)
|
||||||
elif acc.status == "ACTIVE (should be resting)":
|
elif acc.status == "ACTIVE":
|
||||||
accounts_to_rest.append(acc.accountId)
|
# For ACTIVE -> RESTING, check how many requests have been made since activation.
|
||||||
|
count_at_activation_raw = redis_client.hget(f"account_status:{acc.accountId}", "success_count_at_activation")
|
||||||
|
|
||||||
|
if count_at_activation_raw is not None:
|
||||||
|
count_at_activation = int(count_at_activation_raw)
|
||||||
|
current_success_count = acc.successCount or 0
|
||||||
|
requests_made = current_success_count - count_at_activation
|
||||||
|
|
||||||
|
if requests_made >= requests_limit:
|
||||||
|
logger.info(f"Account {acc.accountId} reached request limit ({requests_made}/{requests_limit}). Moving to RESTING.")
|
||||||
|
accounts_to_rest.append(acc.accountId)
|
||||||
|
else:
|
||||||
|
# This is a fallback for accounts that were activated before this logic was deployed.
|
||||||
|
# We can activate them "fresh" by setting their baseline count now.
|
||||||
|
logger.info(f"Account {acc.accountId} is ACTIVE but has no 'success_count_at_activation'. Setting it now.")
|
||||||
|
redis_client.hset(f"account_status:{acc.accountId}", "success_count_at_activation", acc.successCount or 0)
|
||||||
|
|
||||||
|
logger.info("--- Step 2: Analyzing accounts for state transitions ---")
|
||||||
|
logger.info(f"Found {len(accounts_to_unban)} accounts with expired bans to un-ban.")
|
||||||
|
logger.info(f"Found {len(accounts_to_activate)} accounts with expired rest periods to activate.")
|
||||||
|
logger.info(f"Found {len(accounts_to_rest)} accounts with expired active periods to put to rest.")
|
||||||
|
|
||||||
# --- Perform State Transitions ---
|
# --- Perform State Transitions ---
|
||||||
|
|
||||||
# 1. Un-ban accounts via Thrift call
|
# 1. Un-ban accounts via Thrift call
|
||||||
|
logger.info("--- Step 3: Processing un-bans ---")
|
||||||
if accounts_to_unban:
|
if accounts_to_unban:
|
||||||
logger.info(f"Un-banning {len(accounts_to_unban)} accounts: {accounts_to_unban}")
|
logger.info(f"Un-banning {len(accounts_to_unban)} accounts: {accounts_to_unban}")
|
||||||
for acc_id in accounts_to_unban:
|
for acc_id in accounts_to_unban:
|
||||||
@ -101,21 +140,30 @@ def manage_account_states():
|
|||||||
logger.info(f"Successfully un-banned account '{acc_id}'.")
|
logger.info(f"Successfully un-banned account '{acc_id}'.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to un-ban account '{acc_id}': {e}")
|
logger.error(f"Failed to un-ban account '{acc_id}': {e}")
|
||||||
|
else:
|
||||||
|
logger.info("No accounts to un-ban.")
|
||||||
|
|
||||||
# 2. Activate resting accounts via direct Redis write
|
# 2. Activate resting accounts via direct Redis write
|
||||||
|
logger.info("--- Step 4: Processing activations ---")
|
||||||
if accounts_to_activate:
|
if accounts_to_activate:
|
||||||
logger.info(f"Activating {len(accounts_to_activate)} accounts: {accounts_to_activate}")
|
logger.info(f"Activating {len(accounts_to_activate)} accounts: {accounts_to_activate}")
|
||||||
now_ts = int(time.time())
|
now_ts = int(time.time())
|
||||||
|
account_map = {acc.accountId: acc for acc in all_accounts}
|
||||||
with redis_client.pipeline() as pipe:
|
with redis_client.pipeline() as pipe:
|
||||||
for acc_id in accounts_to_activate:
|
for acc_id in accounts_to_activate:
|
||||||
key = f"account_status:{acc_id}"
|
key = f"account_status:{acc_id}"
|
||||||
|
current_success_count = account_map[acc_id].successCount or 0
|
||||||
pipe.hset(key, "status", "ACTIVE")
|
pipe.hset(key, "status", "ACTIVE")
|
||||||
pipe.hset(key, "active_since_timestamp", now_ts)
|
pipe.hset(key, "active_since_timestamp", now_ts)
|
||||||
pipe.hset(key, "status_changed_timestamp", now_ts)
|
pipe.hset(key, "status_changed_timestamp", now_ts)
|
||||||
|
pipe.hset(key, "success_count_at_activation", current_success_count)
|
||||||
pipe.execute()
|
pipe.execute()
|
||||||
logger.info("Finished activating accounts.")
|
logger.info("Finished activating accounts.")
|
||||||
|
else:
|
||||||
|
logger.info("No accounts to activate.")
|
||||||
|
|
||||||
# 3. Rest active accounts via direct Redis write
|
# 3. Rest active accounts via direct Redis write
|
||||||
|
logger.info("--- Step 5: Processing rests ---")
|
||||||
if accounts_to_rest:
|
if accounts_to_rest:
|
||||||
logger.info(f"Putting {len(accounts_to_rest)} accounts to rest: {accounts_to_rest}")
|
logger.info(f"Putting {len(accounts_to_rest)} accounts to rest: {accounts_to_rest}")
|
||||||
now_ts = int(time.time())
|
now_ts = int(time.time())
|
||||||
@ -124,8 +172,13 @@ def manage_account_states():
|
|||||||
key = f"account_status:{acc_id}"
|
key = f"account_status:{acc_id}"
|
||||||
pipe.hset(key, "status", "RESTING")
|
pipe.hset(key, "status", "RESTING")
|
||||||
pipe.hset(key, "status_changed_timestamp", now_ts)
|
pipe.hset(key, "status_changed_timestamp", now_ts)
|
||||||
|
pipe.hdel(key, "success_count_at_activation")
|
||||||
pipe.execute()
|
pipe.execute()
|
||||||
logger.info("Finished putting accounts to rest.")
|
logger.info("Finished putting accounts to rest.")
|
||||||
|
else:
|
||||||
|
logger.info("No accounts to put to rest.")
|
||||||
|
|
||||||
|
logger.info("--- Account maintenance run complete. ---")
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
if transport and transport.isOpen():
|
if transport and transport.isOpen():
|
||||||
@ -139,6 +192,47 @@ with DAG(
|
|||||||
start_date=days_ago(1),
|
start_date=days_ago(1),
|
||||||
catchup=False,
|
catchup=False,
|
||||||
tags=['ytdlp', 'maintenance'],
|
tags=['ytdlp', 'maintenance'],
|
||||||
doc_md=__doc__,
|
doc_md="""
|
||||||
|
### YT-DLP Account Maintenance: Time-Based State Transitions
|
||||||
|
|
||||||
|
This DAG is the central authority for automated, **time-based** state management for ytdlp-ops accounts.
|
||||||
|
It runs periodically to fetch the status of all accounts and applies its own logic to determine if an account's state should change based on configurable time durations.
|
||||||
|
|
||||||
|
The thresholds are defined as DAG parameters and can be configured via the Airflow UI:
|
||||||
|
- **Requests Limit**: How many successful requests an account can perform before it needs to rest.
|
||||||
|
- **Cooldown Duration**: How long an account must rest before it can be used again.
|
||||||
|
- **Ban Duration**: How long a ban lasts before the account is automatically un-banned.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### Separation of Concerns: Time vs. Errors
|
||||||
|
|
||||||
|
It is critical to understand that this DAG primarily handles time-based state changes. Error-based banning may be handled by worker DAGs during URL processing. This separation ensures that maintenance is predictable and based on timers, while acute, error-driven actions are handled immediately by the workers that encounter them.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### State Transitions Performed by This DAG:
|
||||||
|
|
||||||
|
On each run, this DAG fetches the raw status and timestamps for all accounts and performs the following checks:
|
||||||
|
|
||||||
|
1. **Un-banning (`BANNED` -> `ACTIVE`)**:
|
||||||
|
- **Condition**: An account has been in the `BANNED` state for longer than the configured `account_ban_duration_hours`.
|
||||||
|
- **Action**: The DAG calls the `unbanAccount` service endpoint to lift the ban.
|
||||||
|
|
||||||
|
2. **Activation (`RESTING` -> `ACTIVE`)**:
|
||||||
|
- **Condition**: An account has been in the `RESTING` state for longer than the configured `account_cooldown_duration_min`.
|
||||||
|
- **Action**: The DAG updates the account's status to `ACTIVE` directly in Redis.
|
||||||
|
|
||||||
|
3. **Resting (`ACTIVE` -> `RESTING`)**:
|
||||||
|
- **Condition**: An account has performed more successful requests than the configured `account_requests_limit` since it was last activated.
|
||||||
|
- **Action**: The DAG updates the account's status to `RESTING` directly in Redis.
|
||||||
|
|
||||||
|
This process gives full control over time-based account lifecycle management to the Airflow orchestrator.
|
||||||
|
""",
|
||||||
|
params={
|
||||||
|
'account_requests_limit': Param(250, type="integer", description="Number of successful requests an account can make before it is rested."),
|
||||||
|
'account_cooldown_duration_min': Param(60, type="integer", description="Duration in minutes an account must rest before being activated again. Default is 1 hour."),
|
||||||
|
'account_ban_duration_hours': Param(24, type="integer", description="Duration in hours an account stays banned before it can be un-banned."),
|
||||||
|
}
|
||||||
) as dag:
|
) as dag:
|
||||||
manage_account_states()
|
manage_account_states()
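# Illustrative only: the shape of the per-account Redis hash this DAG manipulates. Field names
# are taken from the hset/hdel calls above; the values are made-up examples, not real data.
EXAMPLE_ACCOUNT_STATUS_HASH = {
    "status": "ACTIVE",                       # set to RESTING / ACTIVE by this DAG
    "status_changed_timestamp": 1718000000,   # updated on every transition
    "active_since_timestamp": 1718000000,     # set when an account is (re)activated
    "success_count_at_activation": 120,       # baseline for the ACTIVE -> RESTING check; removed on rest
}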
|
||||||
|
|||||||
@ -72,16 +72,16 @@ def dispatch_url_to_worker(**context):
|
|||||||
# The hook will parse the queue name from the run_id itself.
|
# The hook will parse the queue name from the run_id itself.
|
||||||
run_id = f"worker_run_{context['dag_run'].run_id}_{context['ts_nodash']}_q_{worker_queue}"
|
run_id = f"worker_run_{context['dag_run'].run_id}_{context['ts_nodash']}_q_{worker_queue}"
|
||||||
|
|
||||||
logger.info(f"Triggering 'ytdlp_ops_worker_per_url' with run_id '{run_id}'")
|
logger.info(f"Triggering 'ytdlp_ops_v01_worker_per_url' with run_id '{run_id}'")
|
||||||
trigger_dag(
|
trigger_dag(
|
||||||
dag_id='ytdlp_ops_worker_per_url',
|
dag_id='ytdlp_ops_v01_worker_per_url',
|
||||||
run_id=run_id,
|
run_id=run_id,
|
||||||
conf=conf_to_pass,
|
conf=conf_to_pass,
|
||||||
replace_microseconds=False
|
replace_microseconds=False
|
||||||
)
|
)
|
||||||
|
|
||||||
with DAG(
|
with DAG(
|
||||||
dag_id='ytdlp_ops_dispatcher',
|
dag_id='ytdlp_ops_v01_dispatcher',
|
||||||
default_args={'owner': 'airflow', 'retries': 0},
|
default_args={'owner': 'airflow', 'retries': 0},
|
||||||
schedule=None, # This DAG is only triggered by the orchestrator.
|
schedule=None, # This DAG is only triggered by the orchestrator.
|
||||||
start_date=days_ago(1),
|
start_date=days_ago(1),
|
||||||
@ -94,10 +94,10 @@ with DAG(
|
|||||||
1. It pulls a single URL from the Redis `_inbox` queue.
|
1. It pulls a single URL from the Redis `_inbox` queue.
|
||||||
2. It runs on the generic `queue-dl` to find any available worker.
|
2. It runs on the generic `queue-dl` to find any available worker.
|
||||||
3. It determines the worker's hostname and constructs a dedicated queue name (e.g., `queue-dl-dl-worker-1`).
|
3. It determines the worker's hostname and constructs a dedicated queue name (e.g., `queue-dl-dl-worker-1`).
|
||||||
4. It triggers the `ytdlp_ops_worker_per_url` DAG, passing the URL and the dedicated queue name in the configuration.
|
4. It triggers the `ytdlp_ops_v01_worker_per_url` DAG, passing the URL and the dedicated queue name in the configuration.
|
||||||
|
|
||||||
This dispatcher-led affinity, combined with the `task_instance_mutation_hook` cluster policy, ensures that all subsequent processing for that URL happens on the same machine.
|
This dispatcher-led affinity, combined with the `task_instance_mutation_hook` cluster policy, ensures that all subsequent processing for that URL happens on the same machine.
|
||||||
The `ytdlp_ops_orchestrator` is used to trigger a batch of these dispatcher runs.
|
The `ytdlp_ops_v01_orchestrator` is used to trigger a batch of these dispatcher runs.
|
||||||
""",
|
""",
|
||||||
# All params are passed through from the orchestrator
|
# All params are passed through from the orchestrator
|
||||||
render_template_as_native_obj=True,
|
render_template_as_native_obj=True,
|
||||||
444
airflow/dags/ytdlp_ops_v01_orchestrator.py
Normal file
@ -0,0 +1,444 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:fenc=utf-8
|
||||||
|
#
|
||||||
|
# Copyright © 2024 rl <rl@rlmbp>
|
||||||
|
#
|
||||||
|
# Distributed under terms of the MIT license.
|
||||||
|
|
||||||
|
"""
|
||||||
|
DAG to orchestrate ytdlp_ops_dispatcher DAG runs based on a defined policy.
|
||||||
|
It fetches URLs from a Redis queue and launches dispatchers in controlled bunches,
|
||||||
|
which in turn trigger workers with affinity.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from airflow import DAG
|
||||||
|
from airflow.exceptions import AirflowException, AirflowSkipException
|
||||||
|
from airflow.operators.python import PythonOperator
|
||||||
|
from airflow.models.param import Param
|
||||||
|
from airflow.models.variable import Variable
|
||||||
|
from airflow.utils.dates import days_ago
|
||||||
|
from airflow.api.common.trigger_dag import trigger_dag
|
||||||
|
from airflow.models.dagrun import DagRun
|
||||||
|
from airflow.models.dag import DagModel
|
||||||
|
from datetime import timedelta
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Import utility functions
|
||||||
|
from utils.redis_utils import _get_redis_client
|
||||||
|
|
||||||
|
# Import Thrift modules for proxy status check
|
||||||
|
from pangramia.yt.tokens_ops import YTTokenOpService
|
||||||
|
from thrift.protocol import TBinaryProtocol
|
||||||
|
from thrift.transport import TSocket, TTransport
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DEFAULT_REQUEST_PARAMS_JSON = """{
|
||||||
|
"context_reuse_policy": {
|
||||||
|
"enabled": true,
|
||||||
|
"max_age_seconds": 86400,
|
||||||
|
"reuse_visitor_id": true,
|
||||||
|
"reuse_cookies": true
|
||||||
|
},
|
||||||
|
"token_generation_strategy": {
|
||||||
|
"youtubei_js": {
|
||||||
|
"generate_po_token": true,
|
||||||
|
"generate_gvs_token": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ytdlp_params": {
|
||||||
|
"use_curl_prefetch": false,
|
||||||
|
"token_supplement_strategy": {
|
||||||
|
"youtubepot_bgutilhttp_extractor": {
|
||||||
|
"enabled": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"visitor_id_override": {
|
||||||
|
"enabled": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"session_params": {
|
||||||
|
"lang": "en-US",
|
||||||
|
"location": "US",
|
||||||
|
"deviceCategory": "MOBILE",
|
||||||
|
"user_agents": {
|
||||||
|
"youtubei_js": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
|
||||||
|
"yt_dlp": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}"""
|
||||||
|
|
||||||
|
# Default settings
|
||||||
|
DEFAULT_QUEUE_NAME = 'video_queue'
|
||||||
|
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
||||||
|
DEFAULT_TOTAL_WORKERS = 3
|
||||||
|
DEFAULT_WORKERS_PER_BUNCH = 1
|
||||||
|
DEFAULT_WORKER_DELAY_S = 5
|
||||||
|
DEFAULT_BUNCH_DELAY_S = 20
|
||||||
|
|
||||||
|
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
|
||||||
|
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
|
||||||
|
def _check_application_queue(redis_client, queue_base_name: str) -> int:
|
||||||
|
"""Checks and logs the length of the application's inbox queue."""
|
||||||
|
inbox_queue_name = f"{queue_base_name}_inbox"
|
||||||
|
logger.info(f"--- Checking Application Work Queue ---")
|
||||||
|
try:
|
||||||
|
q_len = redis_client.llen(inbox_queue_name)
|
||||||
|
logger.info(f"Application work queue '{inbox_queue_name}' has {q_len} item(s).")
|
||||||
|
return q_len
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to check application queue '{inbox_queue_name}': {e}", exc_info=True)
|
||||||
|
return -1 # Indicate an error
|
||||||
|
|
||||||
|
def _inspect_celery_queues(redis_client, queue_names: list):
|
||||||
|
"""Inspects Celery queues in Redis and logs their status."""
|
||||||
|
logger.info("--- Inspecting Celery Queues in Redis ---")
|
||||||
|
for queue_name in queue_names:
|
||||||
|
try:
|
||||||
|
q_len = redis_client.llen(queue_name)
|
||||||
|
logger.info(f"Queue '{queue_name}': Length = {q_len}")
|
||||||
|
|
||||||
|
if q_len > 0:
|
||||||
|
logger.info(f"Showing up to 10 tasks in '{queue_name}':")
|
||||||
|
# Fetch up to 10 items from the start of the list (queue)
|
||||||
|
items_bytes = redis_client.lrange(queue_name, 0, 9)
|
||||||
|
for i, item_bytes in enumerate(items_bytes):
|
||||||
|
try:
|
||||||
|
# Celery tasks are JSON-encoded strings
|
||||||
|
task_data = json.loads(item_bytes.decode('utf-8'))
|
||||||
|
# Pretty print for readability in logs
|
||||||
|
pretty_task_data = json.dumps(task_data, indent=2)
|
||||||
|
logger.info(f" Task {i+1}:\n{pretty_task_data}")
|
||||||
|
except (json.JSONDecodeError, UnicodeDecodeError) as e:
|
||||||
|
logger.warning(f" Task {i+1}: Could not decode/parse task data. Error: {e}. Raw: {item_bytes!r}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to inspect queue '{queue_name}': {e}", exc_info=True)
|
||||||
|
logger.info("--- End of Queue Inspection ---")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Main Orchestration Callable ---
|
||||||
|
|
||||||
|
def orchestrate_workers_ignition_callable(**context):
|
||||||
|
"""
|
||||||
|
Main orchestration logic. Triggers a specified number of dispatcher DAGs
|
||||||
|
to initiate self-sustaining processing loops.
|
||||||
|
"""
|
||||||
|
params = context['params']
|
||||||
|
ti = context['task_instance']
|
||||||
|
logger.info(f"Orchestrator task '{ti.task_id}' running on queue '{ti.queue}'.")
|
||||||
|
logger.info("Starting dispatcher ignition sequence.")
|
||||||
|
|
||||||
|
dispatcher_dag_id = 'ytdlp_ops_v01_dispatcher'
|
||||||
|
worker_queue = 'queue-dl'
|
||||||
|
app_queue_name = params['queue_name']
|
||||||
|
|
||||||
|
logger.info(f"Running in v1 (monolithic) mode. Dispatcher DAG: '{dispatcher_dag_id}', Worker Queue: '{worker_queue}'")
|
||||||
|
|
||||||
|
dag_model = DagModel.get_dagmodel(dispatcher_dag_id)
|
||||||
|
if dag_model and dag_model.is_paused:
|
||||||
|
logger.warning(f"Dispatcher DAG '{dispatcher_dag_id}' is paused. Skipping dispatcher ignition.")
|
||||||
|
raise AirflowSkipException(f"Dispatcher DAG '{dispatcher_dag_id}' is paused.")
|
||||||
|
|
||||||
|
total_workers = int(params['total_workers'])
|
||||||
|
workers_per_bunch = int(params['workers_per_bunch'])
|
||||||
|
|
||||||
|
# --- Input Validation ---
|
||||||
|
if total_workers <= 0:
|
||||||
|
logger.warning(f"'total_workers' is {total_workers}. No workers will be started. Skipping ignition.")
|
||||||
|
raise AirflowSkipException(f"No workers to start (total_workers={total_workers}).")
|
||||||
|
|
||||||
|
if workers_per_bunch <= 0:
|
||||||
|
logger.error(f"'workers_per_bunch' must be a positive integer, but got {workers_per_bunch}. Aborting.")
|
||||||
|
raise AirflowException(f"'workers_per_bunch' must be a positive integer, but got {workers_per_bunch}.")
|
||||||
|
# --- End Input Validation ---
|
||||||
|
|
||||||
|
worker_delay = int(params['delay_between_workers_s'])
|
||||||
|
bunch_delay = int(params['delay_between_bunches_s'])
|
||||||
|
|
||||||
|
# Create a list of worker numbers to trigger
|
||||||
|
worker_indices = list(range(total_workers))
|
||||||
|
bunches = [worker_indices[i:i + workers_per_bunch] for i in range(0, len(worker_indices), workers_per_bunch)]
|
||||||
|
|
||||||
|
# --- Inspect Queues before starting ---
|
||||||
|
try:
|
||||||
|
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
||||||
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
|
# First, check the application queue for work
|
||||||
|
app_queue_len = _check_application_queue(redis_client, app_queue_name)
|
||||||
|
|
||||||
|
if params.get('skip_if_queue_empty') and app_queue_len == 0:
|
||||||
|
logger.info("'skip_if_queue_empty' is True and application queue is empty. Skipping worker ignition.")
|
||||||
|
raise AirflowSkipException("Application work queue is empty.")
|
||||||
|
|
||||||
|
# Then, inspect the target Celery queue for debugging
|
||||||
|
_inspect_celery_queues(redis_client, [worker_queue])
|
||||||
|
except AirflowSkipException:
|
||||||
|
raise # Re-raise to let Airflow handle the skip
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not inspect queues due to an error: {e}. Continuing with ignition sequence.")
|
||||||
|
# --- End of Inspection ---
|
||||||
|
|
||||||
|
logger.info(f"Plan: Triggering {total_workers} total dispatcher runs in {len(bunches)} bunches. Each run will attempt to process one URL.")
|
||||||
|
|
||||||
|
dag_run_id = context['dag_run'].run_id
|
||||||
|
total_triggered = 0
|
||||||
|
|
||||||
|
for i, bunch in enumerate(bunches):
|
||||||
|
logger.info(f"--- Triggering Bunch {i+1}/{len(bunches)} (contains {len(bunch)} dispatcher(s)) ---")
|
||||||
|
for j, _ in enumerate(bunch):
|
||||||
|
# Create a unique run_id for each dispatcher run
|
||||||
|
run_id = f"dispatched_{dag_run_id}_{total_triggered}"
|
||||||
|
|
||||||
|
# Pass all orchestrator params to the dispatcher, which will then pass them to the worker.
|
||||||
|
conf_to_pass = {p: params[p] for p in params}
|
||||||
|
|
||||||
|
logger.info(f"Triggering dispatcher {j+1}/{len(bunch)} in bunch {i+1} (run {total_triggered + 1}/{total_workers}) (Run ID: {run_id})")
|
||||||
|
logger.debug(f"Full conf for dispatcher run {run_id}: {conf_to_pass}")
|
||||||
|
|
||||||
|
trigger_dag(
|
||||||
|
dag_id=dispatcher_dag_id,
|
||||||
|
run_id=run_id,
|
||||||
|
conf=conf_to_pass,
|
||||||
|
replace_microseconds=False
|
||||||
|
)
|
||||||
|
total_triggered += 1
|
||||||
|
|
||||||
|
# Delay between dispatches in a bunch
|
||||||
|
if j < len(bunch) - 1:
|
||||||
|
logger.info(f"Waiting {worker_delay}s before next dispatcher in bunch...")
|
||||||
|
time.sleep(worker_delay)
|
||||||
|
|
||||||
|
# Delay between bunches
|
||||||
|
if i < len(bunches) - 1:
|
||||||
|
logger.info(f"--- Bunch {i+1} triggered. Waiting {bunch_delay}s before next bunch... ---")
|
||||||
|
time.sleep(bunch_delay)
|
||||||
|
|
||||||
|
logger.info(f"--- Ignition sequence complete. Total dispatcher runs triggered: {total_triggered}. ---")
|
||||||
|
|
||||||
|
# --- Final Queue Inspection ---
|
||||||
|
final_check_delay = 30 # seconds
|
||||||
|
logger.info(f"Waiting {final_check_delay}s for a final queue status check to see if workers picked up tasks...")
|
||||||
|
time.sleep(final_check_delay)
|
||||||
|
|
||||||
|
try:
|
||||||
|
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
||||||
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
|
# Log connection details for debugging broker mismatch issues
|
||||||
|
conn_kwargs = redis_client.connection_pool.connection_kwargs
|
||||||
|
logger.info(f"Final check using Redis connection '{redis_conn_id}': "
|
||||||
|
f"host={conn_kwargs.get('host')}, "
|
||||||
|
f"port={conn_kwargs.get('port')}, "
|
||||||
|
f"db={conn_kwargs.get('db')}")
|
||||||
|
|
||||||
|
_inspect_celery_queues(redis_client, [worker_queue])
|
||||||
|
logger.info("Final queue inspection complete. If queues are not empty, workers have not picked up tasks yet. "
|
||||||
|
"If queues are empty, workers have started processing.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not perform final queue inspection: {e}. This does not affect worker ignition.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# DAG Definition
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
default_args = {
|
||||||
|
'owner': 'airflow',
|
||||||
|
'depends_on_past': False,
|
||||||
|
'email_on_failure': False,
|
||||||
|
'email_on_retry': False,
|
||||||
|
'retries': 1,
|
||||||
|
'retry_delay': timedelta(minutes=1),
|
||||||
|
'start_date': days_ago(1),
|
||||||
|
}
|
||||||
|
|
||||||
|
with DAG(
|
||||||
|
dag_id='ytdlp_ops_v01_orchestrator',
|
||||||
|
default_args=default_args,
|
||||||
|
schedule=None, # This DAG runs only when triggered.
|
||||||
|
max_active_runs=1, # Only one ignition process should run at a time.
|
||||||
|
catchup=False,
|
||||||
|
description='Ignition system for ytdlp_ops_v01_dispatcher DAGs. Starts self-sustaining worker loops via dispatchers.',
|
||||||
|
doc_md="""
|
||||||
|
### YT-DLP v1 (Monolithic) Worker Ignition System
|
||||||
|
|
||||||
|
This DAG acts as an "ignition system" to start one or more self-sustaining worker loops for the **v1 monolithic worker**.
|
||||||
|
It does **not** process URLs itself. Its only job is to trigger a specified number of `ytdlp_ops_v01_dispatcher` DAGs,
|
||||||
|
which in turn pull URLs and trigger `ytdlp_ops_v01_worker_per_url` with worker affinity.
|
||||||
|
|
||||||
|
#### How it Works:
|
||||||
|
|
||||||
|
1. **Manual Trigger:** You manually trigger this DAG with parameters defining how many dispatcher loops to start (`total_workers`), in what configuration (`workers_per_bunch`, delays).
|
||||||
|
2. **Ignition:** The orchestrator triggers the initial set of dispatcher DAGs in a "fire-and-forget" manner, passing all its configuration parameters to them.
|
||||||
|
3. **Completion:** Once all initial dispatchers have been triggered, the orchestrator's job is complete.
|
||||||
|
|
||||||
|
The dispatchers then take over, each pulling a URL, determining affinity, and triggering a worker DAG.
|
||||||
|
|
||||||
|
#### Client Selection (`clients` parameter):
|
||||||
|
The `clients` parameter determines which YouTube client persona is used for token generation. Different clients have different capabilities and requirements.
|
||||||
|
|
||||||
|
**Supported Clients:**
|
||||||
|
|
||||||
|
| Client | Visitor ID | Player poToken | GVS poToken | Cookies Support | Notes |
|
||||||
|
| ---------------- | ------------ | -------------- | ------------ | --------------- | ------------------------------------------------------------------ |
|
||||||
|
| `tv` | Required | Not Required | Not Required | Supported | All formats may have DRM if you request too much. |
|
||||||
|
| `web_safari` | Required | Required | Required* | Supported | *Provides HLS (m3u8) formats which may not require a GVS token. |
|
||||||
|
| `mweb` | Required | Required | Required | Supported | |
|
||||||
|
| `web_camoufox` | Required | Required | Required | Supported | Camoufox variant of `web`. |
|
||||||
|
|
||||||
|
**Untested / Not Recommended Clients:**
|
||||||
|
|
||||||
|
| Client | Visitor ID | Player poToken | GVS poToken | Cookies Support | Notes |
|
||||||
|
| ---------------- | ------------ | -------------- | ------------ | --------------- | ------------------------------------------------------------------ |
|
||||||
|
| `web` | Required | Required | Required | Supported | Only SABR formats available. |
|
||||||
|
| `tv_simply` | Required | Not Required | Not Required | Not Supported | |
|
||||||
|
| `tv_embedded` | Required | Not Required | Not Required | Supported | Requires account cookies for most videos. |
|
||||||
|
| `web_embedded` | Required | Not Required | Not Required | Supported | Only for embeddable videos. |
|
||||||
|
| `web_music` | Required | Required | Required | Supported | |
|
||||||
|
| `web_creator` | Required | Required | Required | Supported | Requires account cookies. |
|
||||||
|
| `android` | Required | Required | Required | Not Supported | |
|
||||||
|
| `android_vr` | Required | Not Required | Not Required | Not Supported | YouTube Kids videos are not available. |
|
||||||
|
| `ios` | Required | Required | Required | Not Supported | |
|
||||||
|
|
||||||
|
Other `_camoufox` variants are also available but untested.
|
||||||
|
""",
|
||||||
|
tags=['ytdlp', 'mgmt', 'master'],
|
||||||
|
params={
|
||||||
|
# --- Ignition Control Parameters ---
|
||||||
|
'total_workers': Param(DEFAULT_TOTAL_WORKERS, type="integer", description="Total number of dispatcher loops to start."),
|
||||||
|
'workers_per_bunch': Param(DEFAULT_WORKERS_PER_BUNCH, type="integer", description="Number of dispatchers to start in each bunch."),
|
||||||
|
'delay_between_workers_s': Param(DEFAULT_WORKER_DELAY_S, type="integer", description="Delay in seconds between starting each dispatcher within a bunch."),
|
||||||
|
'delay_between_bunches_s': Param(DEFAULT_BUNCH_DELAY_S, type="integer", description="Delay in seconds between starting each bunch."),
|
||||||
|
'skip_if_queue_empty': Param(False, type="boolean", title="[Ignition Control] Skip if Queue Empty", description="If True, the orchestrator will not start any dispatchers if the application's work queue is empty."),
|
||||||
|
|
||||||
|
# --- Worker Passthrough Parameters ---
|
||||||
|
'on_auth_failure': Param(
|
||||||
|
'retry_with_new_account',
|
||||||
|
type="string",
|
||||||
|
enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'proceed_loop_under_manual_inspection'],
|
||||||
|
title="[Worker Param] On Authentication Failure Policy",
|
||||||
|
description="Policy for a worker when a bannable authentication error occurs. "
|
||||||
|
"'stop_loop': Ban the account, mark URL as failed, and stop the worker's loop. "
|
||||||
|
"'retry_with_new_account': (Default) Ban the failed account, retry ONCE with a new account. If retry fails, ban the second account and stop."
|
||||||
|
"'retry_without_ban': If a connection error (e.g. SOCKS timeout) occurs, retry with a new account but do NOT ban the first account/proxy. If retry fails, stop the loop without banning."
|
||||||
|
"'proceed_loop_under_manual_inspection': **BEWARE: MANUAL SUPERVISION REQUIRED.** Marks the URL as failed but continues the processing loop. Use this only when you can manually intervene."
|
||||||
|
),
|
||||||
|
'on_download_failure': Param(
|
||||||
|
'proceed_loop',
|
||||||
|
type="string",
|
||||||
|
enum=['stop_loop', 'proceed_loop', 'retry_with_new_token'],
|
||||||
|
title="[Worker Param] On Download Failure Policy",
|
||||||
|
description="Policy for a worker when a download or probe error occurs. "
|
||||||
|
"'stop_loop': Mark URL as failed and stop the worker's loop. "
|
||||||
|
"'proceed_loop': (Default) Mark URL as failed but continue the processing loop with a new URL. "
|
||||||
|
"'retry_with_new_token': Attempt to get a new token with a new account and retry the download once. If it fails again, proceed loop."
|
||||||
|
),
|
||||||
|
'request_params_json': Param(DEFAULT_REQUEST_PARAMS_JSON, type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service.", render_kwargs={"rows": 20, "cols": 120}),
|
||||||
|
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."),
|
||||||
|
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
|
||||||
|
'clients': Param(
|
||||||
|
'mweb,web_camoufox,tv',
|
||||||
|
type="string",
|
||||||
|
enum=[
|
||||||
|
'mweb,web_camoufox,tv',
|
||||||
|
'mweb',
|
||||||
|
'web_camoufox',
|
||||||
|
'tv',
|
||||||
|
'custom',
|
||||||
|
'tv,web_safari,mweb,web_camoufox',
|
||||||
|
'web_safari',
|
||||||
|
'web',
|
||||||
|
'web_embedded',
|
||||||
|
'web_music',
|
||||||
|
'web_creator',
|
||||||
|
'web_safari_camoufox',
|
||||||
|
'web_embedded_camoufox',
|
||||||
|
'web_music_camoufox',
|
||||||
|
'web_creator_camoufox',
|
||||||
|
'mweb_camoufox',
|
||||||
|
'android',
|
||||||
|
'android_music',
|
||||||
|
'android_creator',
|
||||||
|
'android_vr',
|
||||||
|
'ios',
|
||||||
|
'ios_music',
|
||||||
|
'ios_creator',
|
||||||
|
'tv_simply',
|
||||||
|
'tv_embedded',
|
||||||
|
],
|
||||||
|
title="[Worker Param] Clients",
|
||||||
|
description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details."
|
||||||
|
),
|
||||||
|
'account_pool': Param('ytdlp_account', type="string", description="[Worker Param] Account pool prefix or comma-separated list."),
|
||||||
|
'account_pool_size': Param(10, type=["integer", "null"], description="[Worker Param] If using a prefix for 'account_pool', this specifies the number of accounts to generate (e.g., 10 for 'prefix_01' through 'prefix_10'). Required when using a prefix."),
|
||||||
|
'prepend_client_to_account': Param(True, type="boolean", title="[Worker Param] Prepend Client to Account", description="If True, prepends client and timestamp to account names in prefix mode. Format: prefix_YYYYMMDDHHMMSS_client_XX."),
|
||||||
|
'service_ip': Param(DEFAULT_YT_AUTH_SERVICE_IP, type="string", description="[Worker Param] IP of the ytdlp-ops-server. Default is from Airflow variable YT_AUTH_SERVICE_IP or hardcoded."),
|
||||||
|
'service_port': Param(DEFAULT_YT_AUTH_SERVICE_PORT, type="integer", description="[Worker Param] Port of the Envoy load balancer. Default is from Airflow variable YT_AUTH_SERVICE_PORT or hardcoded."),
|
||||||
|
'machine_id': Param("ytdlp-ops-airflow-service", type="string", description="[Worker Param] Identifier for the client machine."),
|
||||||
|
'assigned_proxy_url': Param(None, type=["string", "null"], title="[Worker Param] Assigned Proxy URL", description="A specific proxy URL to use for the request, overriding the server's proxy pool logic."),
|
||||||
|
'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean", description="[Worker Param] If True and all accounts in a prefix-based pool are exhausted, create a new one automatically."),
|
||||||
|
# --- Download Control Parameters ---
|
||||||
|
'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
|
||||||
|
'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
|
||||||
|
'skip_probe': Param(True, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
|
||||||
|
'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
|
||||||
|
'fragment_retries': Param(2, type="integer", title="[Worker Param] Fragment Retries", description="Number of retries for a fragment before giving up."),
|
||||||
|
'limit_rate': Param('5M', type=["string", "null"], title="[Worker Param] Limit Rate", description="Download speed limit (e.g., 50K, 4.2M)."),
|
||||||
|
'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
|
||||||
|
'min_sleep_interval': Param(5, type="integer", title="[Worker Param] Min Sleep Interval", description="Minimum time to sleep between downloads (seconds)."),
|
||||||
|
'max_sleep_interval': Param(10, type="integer", title="[Worker Param] Max Sleep Interval", description="Maximum time to sleep between downloads (seconds)."),
|
||||||
|
'download_format_preset': Param(
|
||||||
|
'formats_2',
|
||||||
|
type="string",
|
||||||
|
enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'],
|
||||||
|
title="[Worker Param] Download Format Preset",
|
||||||
|
description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
|
||||||
|
),
|
||||||
|
'download_format_custom': Param(
|
||||||
|
'18,140,299/298/137/136/135/134/133',
|
||||||
|
type="string",
|
||||||
|
title="[Worker Param] Custom Download Format",
|
||||||
|
description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'."
|
||||||
|
),
|
||||||
|
'downloader': Param(
|
||||||
|
'default',
|
||||||
|
type="string",
|
||||||
|
enum=['default', 'aria2c'],
|
||||||
|
title="[Worker Param] Downloader",
|
||||||
|
description="Choose the downloader for yt-dlp."
|
||||||
|
),
|
||||||
|
'downloader_args_aria2c': Param(
|
||||||
|
'aria2c:-x 4 -k 2M --max-download-limit=3M',
|
||||||
|
type="string",
|
||||||
|
title="[Worker Param] Aria2c Downloader Arguments",
|
||||||
|
description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'."
|
||||||
|
),
|
||||||
|
'yt_dlp_extra_args': Param(
|
||||||
|
'--restrict-filenames',
|
||||||
|
type=["string", "null"],
|
||||||
|
title="[Worker Param] Extra yt-dlp arguments",
|
||||||
|
description="Extra command-line arguments for yt-dlp during download."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
) as dag:
|
||||||
|
|
||||||
|
orchestrate_task = PythonOperator(
|
||||||
|
task_id='start_worker_loops',
|
||||||
|
python_callable=orchestrate_workers_ignition_callable,
|
||||||
|
)
|
||||||
|
orchestrate_task.doc_md = """
|
||||||
|
### Start Worker Loops
|
||||||
|
This is the main task that executes the ignition policy.
|
||||||
|
- It triggers `ytdlp_ops_dispatcher` DAGs according to the batch settings.
|
||||||
|
- It passes all its parameters down to the dispatchers, which will use them to trigger workers.
|
||||||
|
"""
|
||||||
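The bunch planning in `ytdlp_ops_v01_orchestrator.py` above is a plain list slice. A standalone sketch, using the same variable names as the DAG code, shows how `total_workers` and `workers_per_bunch` map to trigger batches:

```python
total_workers = 5
workers_per_bunch = 2

worker_indices = list(range(total_workers))
bunches = [worker_indices[i:i + workers_per_bunch]
           for i in range(0, len(worker_indices), workers_per_bunch)]

print(bunches)  # [[0, 1], [2, 3], [4]] -> three bunches; the delays apply within and between them
```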
airflow/dags/ytdlp_ops_v01_worker_per_url.py (new file, 1794 lines added)
(File diff suppressed because it is too large.)
airflow/dags/ytdlp_ops_v02_dispatcher_auth.py (new file, 98 lines added)
@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
"""
DAG to dispatch work to ytdlp_ops_worker_per_url_auth DAGs.
It pulls a URL from Redis and triggers an auth worker with a pinned queue.
"""

from __future__ import annotations
import logging
import os
import socket
from datetime import timedelta

from airflow.decorators import task
from airflow.exceptions import AirflowSkipException
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.api.common.trigger_dag import trigger_dag
from airflow.utils.dates import days_ago

from utils.redis_utils import _get_redis_client

logger = logging.getLogger(__name__)

DEFAULT_QUEUE_NAME = 'queue2_auth'
DEFAULT_REDIS_CONN_ID = 'redis_default'


@task(queue='queue-auth')
def dispatch_url_to_auth_worker(**context):
    """
    Pulls one URL from Redis, determines the current worker's dedicated queue,
    and triggers the auth worker DAG to process the URL on that specific queue.
    """
    ti = context['task_instance']
    logger.info(f"Auth Dispatcher task '{ti.task_id}' running on queue '{ti.queue}'.")

    # --- Check for worker pause lock file ---
    lock_file_path = '/opt/airflow/inputfiles/AIRFLOW.PREVENT_URL_PULL.lockfile'
    hostname = socket.gethostname()
    if os.path.exists(lock_file_path):
        logger.info(f"Worker '{hostname}' is paused. Lock file found at '{lock_file_path}'. Skipping URL pull.")
        raise AirflowSkipException(f"Worker '{hostname}' is paused.")
    else:
        logger.info(f"Worker '{hostname}' is active (no lock file found at '{lock_file_path}'). Proceeding to pull URL.")

    params = context['params']
    redis_conn_id = params['redis_conn_id']
    queue_name = params['queue_name']
    inbox_queue = f"{queue_name}_inbox"

    logger.info(f"Attempting to pull one URL from Redis queue '{inbox_queue}'...")
    client = _get_redis_client(redis_conn_id)
    url_bytes = client.lpop(inbox_queue)

    if not url_bytes:
        logger.info("Redis auth inbox queue is empty. No work to dispatch. Skipping task.")
        raise AirflowSkipException("Redis auth inbox queue is empty. No work to dispatch.")

    url_to_process = url_bytes.decode('utf-8')
    logger.info(f"Pulled URL '{url_to_process}' from the queue.")

    # Determine the worker-specific queue for affinity
    hostname = socket.gethostname()
    worker_queue = f"queue-auth-{hostname}"
    logger.info(f"Running on worker '{hostname}'. Dispatching job to its dedicated queue '{worker_queue}'.")

    conf_to_pass = {**params, 'url_to_process': url_to_process, 'worker_queue': worker_queue}

    run_id = f"worker_run_auth_{context['dag_run'].run_id}_{context['ts_nodash']}_q_{worker_queue}"

    logger.info(f"Triggering 'ytdlp_ops_v02_worker_per_url_auth' with run_id '{run_id}'")
    trigger_dag(
        dag_id='ytdlp_ops_v02_worker_per_url_auth',
        run_id=run_id,
        conf=conf_to_pass,
        replace_microseconds=False
    )


with DAG(
    dag_id='ytdlp_ops_v02_dispatcher_auth',
    default_args={'owner': 'airflow', 'retries': 0},
    schedule=None,
    start_date=days_ago(1),
    catchup=False,
    tags=['ytdlp', 'worker', 'dispatcher', 'auth'],
    is_paused_upon_creation=True,
    doc_md="""
    ### YT-DLP Auth URL Dispatcher

    This DAG dispatches a single URL to an auth worker with a pinned queue.
    It pulls from the `queue2_auth_inbox` Redis queue and triggers the `ytdlp_ops_v02_worker_per_url_auth` DAG.
    """,
    render_template_as_native_obj=True,
    params={
        'queue_name': Param(DEFAULT_QUEUE_NAME, type='string', title='Queue Name', description='The base name of the Redis queue to pull URLs from.'),
        'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type='string', title='Redis Connection ID'),
    },
) as dag:
    dispatch_url_to_auth_worker()
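Not part of the commit: a minimal sketch of seeding this dispatcher's inbox with redis-py, assuming the `redis_default` Airflow connection points at a Redis instance on localhost:6379, db 0 (adjust to your deployment). The queue name comes from the DAG's default `queue_name` parameter.

```python
import redis

r = redis.Redis(host="localhost", port=6379, db=0)  # assumption: matches 'redis_default'
urls = [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # any video URL
]
for url in urls:
    r.rpush("queue2_auth_inbox", url)  # dispatcher LPOPs, so RPUSH keeps FIFO order

print(r.llen("queue2_auth_inbox"), "URL(s) queued")
```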
airflow/dags/ytdlp_ops_v02_dispatcher_dl.py (new file, 89 lines added)
@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
"""
DAG to dispatch download jobs to ytdlp_ops_worker_per_url_dl DAGs.
It pulls a job payload from Redis and triggers a download worker.
"""

from __future__ import annotations
import logging
import os
import socket
from datetime import timedelta

from airflow.decorators import task
from airflow.exceptions import AirflowSkipException
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.api.common.trigger_dag import trigger_dag
from airflow.utils.dates import days_ago

from utils.redis_utils import _get_redis_client

logger = logging.getLogger(__name__)

DEFAULT_QUEUE_NAME = 'queue2_dl'
DEFAULT_REDIS_CONN_ID = 'redis_default'


@task(queue='queue-dl')
def dispatch_job_to_dl_worker(**context):
    """
    Pulls one job payload from Redis, determines the current worker's dedicated queue,
    and triggers the download worker DAG to process the job on that specific queue.
    """
    ti = context['task_instance']
    logger.info(f"Download Dispatcher task '{ti.task_id}' running on queue '{ti.queue}'.")

    params = context['params']
    redis_conn_id = params['redis_conn_id']
    queue_name = params['queue_name']
    inbox_queue = f"{queue_name}_inbox"

    logger.info(f"Attempting to pull one job from Redis queue '{inbox_queue}'...")
    client = _get_redis_client(redis_conn_id)
    job_bytes = client.lpop(inbox_queue)

    if not job_bytes:
        logger.info("Redis download inbox queue is empty. No work to dispatch. Skipping task.")
        raise AirflowSkipException("Redis download inbox queue is empty. No work to dispatch.")

    job_data_str = job_bytes.decode('utf-8')
    logger.info(f"Pulled job from the queue.")

    # Determine the worker-specific queue for affinity
    hostname = socket.gethostname()
    worker_queue = f"queue-dl-{hostname}"
    logger.info(f"Running on worker '{hostname}'. Dispatching job to its dedicated queue '{worker_queue}'.")

    conf_to_pass = {**params, 'job_data': job_data_str, 'worker_queue': worker_queue}

    run_id = f"worker_run_dl_{context['dag_run'].run_id}_{context['ts_nodash']}_q_{worker_queue}"

    logger.info(f"Triggering 'ytdlp_ops_v02_worker_per_url_dl' with run_id '{run_id}'")
    trigger_dag(
        dag_id='ytdlp_ops_v02_worker_per_url_dl',
        run_id=run_id,
        conf=conf_to_pass,
        replace_microseconds=False
    )


with DAG(
    dag_id='ytdlp_ops_v02_dispatcher_dl',
    default_args={'owner': 'airflow', 'retries': 0},
    schedule=None,
    start_date=days_ago(1),
    catchup=False,
    tags=['ytdlp', 'worker', 'dispatcher', 'download'],
    is_paused_upon_creation=True,
    doc_md="""
    ### YT-DLP Download Job Dispatcher

    This DAG dispatches a single download job to a download worker with a pinned queue.
    It pulls a JSON payload from the `queue2_dl_inbox` Redis queue and triggers the `ytdlp_ops_v02_worker_per_url_dl` DAG.
    """,
    render_template_as_native_obj=True,
    params={
        'queue_name': Param(DEFAULT_QUEUE_NAME, type='string', title='Queue Name', description='The base name of the Redis queue to pull job payloads from.'),
        'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type='string', title='Redis Connection ID'),
    },
) as dag:
    dispatch_job_to_dl_worker()
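Not part of the commit: a sketch of enqueuing a job for this download dispatcher. The actual payload schema is defined by `ytdlp_ops_v02_worker_per_url_dl` (its diff is not shown here), so the field names below are placeholders only; the dispatcher itself just passes the raw JSON string through as `job_data`.

```python
import json
import redis

r = redis.Redis(host="localhost", port=6379, db=0)  # assumption: matches 'redis_default'

# Placeholder payload: 'url' and 'token_data' are hypothetical field names,
# not taken from the worker DAG in this diff.
job = {
    "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "token_data": {"example": "placeholder"},
}
r.rpush("queue2_dl_inbox", json.dumps(job))
```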
@@ -6,9 +6,7 @@
 # Distributed under terms of the MIT license.

 """
-DAG to orchestrate ytdlp_ops_dispatcher DAG runs based on a defined policy.
-It fetches URLs from a Redis queue and launches dispatchers in controlled bunches,
-which in turn trigger workers with affinity.
+DAG to orchestrate ytdlp_ops_dispatcher_v2_auth DAG runs based on a defined policy.
 """

 from airflow import DAG
@@ -37,8 +35,42 @@ from thrift.transport import TSocket, TTransport
 # Configure logging
 logger = logging.getLogger(__name__)

+DEFAULT_REQUEST_PARAMS_JSON = """{
+    "context_reuse_policy": {
+        "enabled": true,
+        "max_age_seconds": 86400,
+        "reuse_visitor_id": true,
+        "reuse_cookies": true
+    },
+    "token_generation_strategy": {
+        "youtubei_js": {
+            "generate_po_token": true,
+            "generate_gvs_token": true
+        }
+    },
+    "ytdlp_params": {
+        "use_curl_prefetch": false,
+        "token_supplement_strategy": {
+            "youtubepot_bgutilhttp_extractor": {
+                "enabled": true
+            }
+        },
+        "visitor_id_override": {
+            "enabled": true
+        }
+    },
+    "session_params": {
+        "lang": "en-US",
+        "location": "US",
+        "deviceCategory": "MOBILE",
+        "user_agents": {
+            "youtubei_js": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
+            "yt_dlp": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
+        }
+    }
+}"""
+
 # Default settings
-DEFAULT_QUEUE_NAME = 'video_queue'
 DEFAULT_REDIS_CONN_ID = 'redis_default'
 DEFAULT_TOTAL_WORKERS = 3
 DEFAULT_WORKERS_PER_BUNCH = 1
@@ -100,7 +132,12 @@ def orchestrate_workers_ignition_callable(**context):
     logger.info(f"Orchestrator task '{ti.task_id}' running on queue '{ti.queue}'.")
     logger.info("Starting dispatcher ignition sequence.")

-    dispatcher_dag_id = 'ytdlp_ops_dispatcher'
+    dispatcher_dag_id = 'ytdlp_ops_v02_dispatcher_auth'
+    worker_queue = 'queue-auth'
+    app_queue_name = 'queue2_auth'
+
+    logger.info(f"Running in v2 (auth) mode. Dispatcher DAG: '{dispatcher_dag_id}', Worker Queue: '{worker_queue}'")
+
     dag_model = DagModel.get_dagmodel(dispatcher_dag_id)
     if dag_model and dag_model.is_paused:
         logger.warning(f"Dispatcher DAG '{dispatcher_dag_id}' is paused. Skipping dispatcher ignition.")
@@ -127,13 +164,12 @@ def orchestrate_workers_ignition_callable(**context):
     bunches = [worker_indices[i:i + workers_per_bunch] for i in range(0, len(worker_indices), workers_per_bunch)]

     # --- Inspect Queues before starting ---
-    worker_queue = 'queue-dl'  # The static queue the worker DAG uses.
     try:
         redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
         redis_client = _get_redis_client(redis_conn_id)

         # First, check the application queue for work
-        app_queue_len = _check_application_queue(redis_client, params['queue_name'])
+        app_queue_len = _check_application_queue(redis_client, app_queue_name)

         if params.get('skip_if_queue_empty') and app_queue_len == 0:
             logger.info("'skip_if_queue_empty' is True and application queue is empty. Skipping worker ignition.")
@@ -224,26 +260,17 @@ default_args = {
 }

 with DAG(
-    dag_id='ytdlp_ops_orchestrator',
+    dag_id='ytdlp_ops_v02_orchestrator_auth',
     default_args=default_args,
-    schedule_interval=None,  # This DAG runs only when triggered.
+    schedule=None,  # This DAG runs only when triggered.
     max_active_runs=1,  # Only one ignition process should run at a time.
     catchup=False,
-    description='Ignition system for ytdlp_ops_dispatcher DAGs. Starts self-sustaining worker loops via dispatchers.',
+    description='Ignition system for ytdlp_ops_v02_dispatcher_auth DAGs.',
     doc_md="""
-    ### YT-DLP Worker Ignition System
+    ### YT-DLP v2 (Auth) Worker Ignition System

-    This DAG acts as an "ignition system" to start one or more self-sustaining worker loops.
-    It does **not** process URLs itself. Its only job is to trigger a specified number of `ytdlp_ops_dispatcher` DAGs,
-    which in turn pull URLs and trigger `ytdlp_ops_worker_per_url` with worker affinity.
-
-    #### How it Works:
-
-    1. **Manual Trigger:** You manually trigger this DAG with parameters defining how many dispatcher loops to start (`total_workers`), in what configuration (`workers_per_bunch`, delays).
-    2. **Ignition:** The orchestrator triggers the initial set of dispatcher DAGs in a "fire-and-forget" manner, passing all its configuration parameters to them.
-    3. **Completion:** Once all initial dispatchers have been triggered, the orchestrator's job is complete.
-
-    The dispatchers then take over, each pulling a URL, determining affinity, and triggering a worker DAG.
+    This DAG acts as an "ignition system" to start one or more self-sustaining worker loops for the **v2 authentication worker**.
+    It triggers `ytdlp_ops_v02_dispatcher_auth` DAGs, which pull raw URLs from `queue2_auth_inbox` and trigger `ytdlp_ops_v02_worker_per_url_auth` workers.
     """,
     tags=['ytdlp', 'mgmt', 'master'],
     params={
@@ -256,25 +283,60 @@ with DAG(

         # --- Worker Passthrough Parameters ---
         'on_bannable_failure': Param(
-            'stop_loop',
+            'stop_loop_on_auth_proceed_on_download_error',
             type="string",
-            enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error'],
+            enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error'],
             title="[Worker Param] On Bannable Failure Policy",
             description="Policy for a worker when a bannable error occurs. "
-                        "'stop_loop': Ban the account, mark URL as failed, and stop the worker's loop. "
+                        "'stop_loop': Ban the account, mark URL as failed, and stop the worker's loop on any failure (auth or download). "
                         "'retry_with_new_account': Ban the failed account, retry ONCE with a new account. If retry fails, ban the second account and proxy, then stop."
                         "'retry_on_connection_error': If a connection error (e.g. SOCKS timeout) occurs, retry with a new account but do NOT ban the first account/proxy. If retry fails, stop the loop without banning."
+                        "'proceed_loop_under_manual_inspection': **BEWARE: MANUAL SUPERVISION REQUIRED.** Marks the URL as failed but continues the processing loop. Use this only when you can manually intervene by pausing the dispatcher DAG or creating a lock file (`/opt/airflow/inputfiles/AIRFLOW.PREVENT_URL_PULL.lockfile`) to prevent a runaway failure loop."
+                        "'stop_loop_on_auth_proceed_on_download_error': **(Default)** Stops the loop on an authentication/token error (like 'stop_loop'), but continues the loop on a download/probe error (like 'proceed...')."
         ),
-        'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."),
+        'request_params_json': Param(DEFAULT_REQUEST_PARAMS_JSON, type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service.", render_kwargs={"rows": 20, "cols": 120}),
         'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
-        'clients': Param('tv_sample,mweb,web_camoufox', type="string", description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_sample, tv_embedded"),
+        'clients': Param(
+            'mweb,web_camoufox,tv',
+            type="string",
+            enum=[
+                'mweb,web_camoufox,tv',
+                'mweb',
+                'web_camoufox',
+                'tv',
+                'custom',
+                'tv,web_safari,mweb,web_camoufox',
+                'web_safari',
+                'web',
+                'web_embedded',
+                'web_music',
+                'web_creator',
+                'web_safari_camoufox',
+                'web_embedded_camoufox',
+                'web_music_camoufox',
+                'web_creator_camoufox',
+                'mweb_camoufox',
+                'android',
+                'android_music',
+                'android_creator',
+                'android_vr',
+                'ios',
+                'ios_music',
+                'ios_creator',
+                'tv_simply',
+                'tv_embedded',
+            ],
+            title="[Worker Param] Clients",
+            description="[Worker Param] Comma-separated list of clients for token generation. Full list: web, web_safari, web_embedded, web_music, web_creator, mweb, web_camoufox, web_safari_camoufox, web_embedded_camoufox, web_music_camoufox, web_creator_camoufox, mweb_camoufox, android, android_music, android_creator, android_vr, ios, ios_music, ios_creator, tv, tv_simply, tv_embedded. See DAG documentation for details."
+        ),
         'account_pool': Param('ytdlp_account', type="string", description="[Worker Param] Account pool prefix or comma-separated list."),
         'account_pool_size': Param(10, type=["integer", "null"], description="[Worker Param] If using a prefix for 'account_pool', this specifies the number of accounts to generate (e.g., 10 for 'prefix_01' through 'prefix_10'). Required when using a prefix."),
+        'prepend_client_to_account': Param(True, type="boolean", title="[Worker Param] Prepend Client to Account", description="If True, prepends client and timestamp to account names in prefix mode. Format: prefix_YYYYMMDDHHMMSS_client_XX."),
         'service_ip': Param(DEFAULT_YT_AUTH_SERVICE_IP, type="string", description="[Worker Param] IP of the ytdlp-ops-server. Default is from Airflow variable YT_AUTH_SERVICE_IP or hardcoded."),
         'service_port': Param(DEFAULT_YT_AUTH_SERVICE_PORT, type="integer", description="[Worker Param] Port of the Envoy load balancer. Default is from Airflow variable YT_AUTH_SERVICE_PORT or hardcoded."),
         'machine_id': Param("ytdlp-ops-airflow-service", type="string", description="[Worker Param] Identifier for the client machine."),
+        'assigned_proxy_url': Param(None, type=["string", "null"], title="[Worker Param] Assigned Proxy URL", description="If provided, forces the token service to use this specific proxy for the request."),
         'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean", description="[Worker Param] If True and all accounts in a prefix-based pool are exhausted, create a new one automatically."),
-        'retrigger_delay_on_empty_s': Param(60, type="integer", description="[Worker Param] Delay in seconds before a worker re-triggers itself if the queue is empty. Set to -1 to stop the loop."),
     }
 ) as dag:

@@ -285,6 +347,6 @@ with DAG(
     orchestrate_task.doc_md = """
     ### Start Worker Loops
     This is the main task that executes the ignition policy.
-    - It triggers `ytdlp_ops_dispatcher` DAGs according to the batch settings.
+    - It triggers `ytdlp_ops_v02_dispatcher_auth` DAGs according to the batch settings.
     - It passes all its parameters down to the dispatchers, which will use them to trigger workers.
     """
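Not part of the commit: a sketch of igniting the v2 auth orchestrator from inside the Airflow environment, overriding a few of the ignition parameters defined above. It reuses the same `trigger_dag` helper the DAGs themselves import; the run_id and conf values are examples.

```python
from airflow.api.common.trigger_dag import trigger_dag

trigger_dag(
    dag_id="ytdlp_ops_v02_orchestrator_auth",
    run_id="manual_ignition_example",   # example run_id
    conf={
        "total_workers": 5,             # start five dispatcher loops
        "workers_per_bunch": 2,         # in bunches of two
        "skip_if_queue_empty": True,    # bail out if queue2_auth_inbox is empty
    },
    replace_microseconds=False,
)
```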
airflow/dags/ytdlp_ops_v02_orchestrator_dl.py (new file, 302 lines added)
@@ -0,0 +1,302 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:fenc=utf-8
|
||||||
|
#
|
||||||
|
# Copyright © 2024 rl <rl@rlmbp>
|
||||||
|
#
|
||||||
|
# Distributed under terms of the MIT license.
|
||||||
|
|
||||||
|
"""
|
||||||
|
DAG to orchestrate ytdlp_ops_dispatcher_v2_dl DAG runs based on a defined policy.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from airflow import DAG
|
||||||
|
from airflow.exceptions import AirflowException, AirflowSkipException
|
||||||
|
from airflow.operators.python import PythonOperator
|
||||||
|
from airflow.models.param import Param
|
||||||
|
from airflow.models.variable import Variable
|
||||||
|
from airflow.utils.dates import days_ago
|
||||||
|
from airflow.api.common.trigger_dag import trigger_dag
|
||||||
|
from airflow.models.dagrun import DagRun
|
||||||
|
from airflow.models.dag import DagModel
|
||||||
|
from datetime import timedelta
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Import utility functions
|
||||||
|
from utils.redis_utils import _get_redis_client
|
||||||
|
|
||||||
|
# Import Thrift modules for proxy status check
|
||||||
|
from pangramia.yt.tokens_ops import YTTokenOpService
|
||||||
|
from thrift.protocol import TBinaryProtocol
|
||||||
|
from thrift.transport import TSocket, TTransport
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Default settings
|
||||||
|
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
||||||
|
DEFAULT_TOTAL_WORKERS = 3
|
||||||
|
DEFAULT_WORKERS_PER_BUNCH = 1
|
||||||
|
DEFAULT_WORKER_DELAY_S = 5
|
||||||
|
DEFAULT_BUNCH_DELAY_S = 20
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
|
||||||
|
def _check_application_queue(redis_client, queue_base_name: str) -> int:
|
||||||
|
"""Checks and logs the length of the application's inbox queue."""
|
||||||
|
inbox_queue_name = f"{queue_base_name}_inbox"
|
||||||
|
logger.info(f"--- Checking Application Work Queue ---")
|
||||||
|
try:
|
||||||
|
q_len = redis_client.llen(inbox_queue_name)
|
||||||
|
logger.info(f"Application work queue '{inbox_queue_name}' has {q_len} item(s).")
|
||||||
|
return q_len
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to check application queue '{inbox_queue_name}': {e}", exc_info=True)
|
||||||
|
return -1 # Indicate an error
|
||||||
|
|
||||||
|
def _inspect_celery_queues(redis_client, queue_names: list):
|
||||||
|
"""Inspects Celery queues in Redis and logs their status."""
|
||||||
|
logger.info("--- Inspecting Celery Queues in Redis ---")
|
||||||
|
for queue_name in queue_names:
|
||||||
|
try:
|
||||||
|
q_len = redis_client.llen(queue_name)
|
||||||
|
logger.info(f"Queue '{queue_name}': Length = {q_len}")
|
||||||
|
|
||||||
|
if q_len > 0:
|
||||||
|
logger.info(f"Showing up to 10 tasks in '{queue_name}':")
|
||||||
|
# Fetch up to 10 items from the start of the list (queue)
|
||||||
|
items_bytes = redis_client.lrange(queue_name, 0, 9)
|
||||||
|
for i, item_bytes in enumerate(items_bytes):
|
||||||
|
try:
|
||||||
|
# Celery tasks are JSON-encoded strings
|
||||||
|
task_data = json.loads(item_bytes.decode('utf-8'))
|
||||||
|
# Pretty print for readability in logs
|
||||||
|
pretty_task_data = json.dumps(task_data, indent=2)
|
||||||
|
logger.info(f" Task {i+1}:\n{pretty_task_data}")
|
||||||
|
except (json.JSONDecodeError, UnicodeDecodeError) as e:
|
||||||
|
logger.warning(f" Task {i+1}: Could not decode/parse task data. Error: {e}. Raw: {item_bytes!r}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to inspect queue '{queue_name}': {e}", exc_info=True)
|
||||||
|
logger.info("--- End of Queue Inspection ---")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Main Orchestration Callable ---
|
||||||
|
|
||||||
|
def orchestrate_workers_ignition_callable(**context):
|
||||||
|
"""
|
||||||
|
Main orchestration logic. Triggers a specified number of dispatcher DAGs
|
||||||
|
to initiate self-sustaining processing loops.
|
||||||
|
"""
|
||||||
|
params = context['params']
|
||||||
|
ti = context['task_instance']
|
||||||
|
logger.info(f"Orchestrator task '{ti.task_id}' running on queue '{ti.queue}'.")
|
||||||
|
logger.info("Starting dispatcher ignition sequence.")
|
||||||
|
|
||||||
|
dispatcher_dag_id = 'ytdlp_ops_v02_dispatcher_dl'
|
||||||
|
worker_queue = 'queue-dl'
|
||||||
|
app_queue_name = 'queue2_dl'
|
||||||
|
|
||||||
|
logger.info(f"Running in v2 (download) mode. Dispatcher DAG: '{dispatcher_dag_id}', Worker Queue: '{worker_queue}'")
|
||||||
|
|
||||||
|
dag_model = DagModel.get_dagmodel(dispatcher_dag_id)
|
||||||
|
if dag_model and dag_model.is_paused:
|
||||||
|
logger.warning(f"Dispatcher DAG '{dispatcher_dag_id}' is paused. Skipping dispatcher ignition.")
|
||||||
|
raise AirflowSkipException(f"Dispatcher DAG '{dispatcher_dag_id}' is paused.")
|
||||||
|
|
||||||
|
total_workers = int(params['total_workers'])
|
||||||
|
workers_per_bunch = int(params['workers_per_bunch'])
|
||||||
|
|
||||||
|
# --- Input Validation ---
|
||||||
|
if total_workers <= 0:
|
||||||
|
logger.warning(f"'total_workers' is {total_workers}. No workers will be started. Skipping ignition.")
|
||||||
|
raise AirflowSkipException(f"No workers to start (total_workers={total_workers}).")
|
||||||
|
|
||||||
|
if workers_per_bunch <= 0:
|
||||||
|
logger.error(f"'workers_per_bunch' must be a positive integer, but got {workers_per_bunch}. Aborting.")
|
||||||
|
raise AirflowException(f"'workers_per_bunch' must be a positive integer, but got {workers_per_bunch}.")
|
||||||
|
# --- End Input Validation ---
|
||||||
|
|
||||||
|
worker_delay = int(params['delay_between_workers_s'])
|
||||||
|
bunch_delay = int(params['delay_between_bunches_s'])
|
||||||
|
|
||||||
|
# Create a list of worker numbers to trigger
|
||||||
|
worker_indices = list(range(total_workers))
|
||||||
|
bunches = [worker_indices[i:i + workers_per_bunch] for i in range(0, len(worker_indices), workers_per_bunch)]
|
||||||
|
|
||||||
|
# --- Inspect Queues before starting ---
|
||||||
|
try:
|
||||||
|
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
||||||
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
|
# First, check the application queue for work
|
||||||
|
app_queue_len = _check_application_queue(redis_client, app_queue_name)
|
||||||
|
|
||||||
|
if params.get('skip_if_queue_empty') and app_queue_len == 0:
|
||||||
|
logger.info("'skip_if_queue_empty' is True and application queue is empty. Skipping worker ignition.")
|
||||||
|
raise AirflowSkipException("Application work queue is empty.")
|
||||||
|
|
||||||
|
# Then, inspect the target Celery queue for debugging
|
||||||
|
_inspect_celery_queues(redis_client, [worker_queue])
|
||||||
|
except AirflowSkipException:
|
||||||
|
raise # Re-raise to let Airflow handle the skip
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not inspect queues due to an error: {e}. Continuing with ignition sequence.")
|
||||||
|
# --- End of Inspection ---
|
||||||
|
|
||||||
|
logger.info(f"Plan: Triggering {total_workers} total dispatcher runs in {len(bunches)} bunches. Each run will attempt to process one URL.")
|
||||||
|
|
||||||
|
dag_run_id = context['dag_run'].run_id
|
||||||
|
total_triggered = 0
|
||||||
|
|
||||||
|
for i, bunch in enumerate(bunches):
|
||||||
|
logger.info(f"--- Triggering Bunch {i+1}/{len(bunches)} (contains {len(bunch)} dispatcher(s)) ---")
|
||||||
|
for j, _ in enumerate(bunch):
|
||||||
|
# Create a unique run_id for each dispatcher run
|
||||||
|
run_id = f"dispatched_{dag_run_id}_{total_triggered}"
|
||||||
|
|
||||||
|
# Pass all orchestrator params to the dispatcher, which will then pass them to the worker.
|
||||||
|
conf_to_pass = {p: params[p] for p in params}
|
||||||
|
|
||||||
|
logger.info(f"Triggering dispatcher {j+1}/{len(bunch)} in bunch {i+1} (run {total_triggered + 1}/{total_workers}) (Run ID: {run_id})")
|
||||||
|
logger.debug(f"Full conf for dispatcher run {run_id}: {conf_to_pass}")
|
||||||
|
|
||||||
|
trigger_dag(
|
||||||
|
dag_id=dispatcher_dag_id,
|
||||||
|
run_id=run_id,
|
||||||
|
conf=conf_to_pass,
|
||||||
|
replace_microseconds=False
|
||||||
|
)
|
||||||
|
total_triggered += 1
|
||||||
|
|
||||||
|
# Delay between dispatches in a bunch
|
||||||
|
if j < len(bunch) - 1:
|
||||||
|
logger.info(f"Waiting {worker_delay}s before next dispatcher in bunch...")
|
||||||
|
time.sleep(worker_delay)
|
||||||
|
|
||||||
|
# Delay between bunches
|
||||||
|
if i < len(bunches) - 1:
|
||||||
|
logger.info(f"--- Bunch {i+1} triggered. Waiting {bunch_delay}s before next bunch... ---")
|
||||||
|
time.sleep(bunch_delay)
|
||||||
|
|
||||||
|
logger.info(f"--- Ignition sequence complete. Total dispatcher runs triggered: {total_triggered}. ---")
|
||||||
|
|
||||||
|
# --- Final Queue Inspection ---
|
||||||
|
final_check_delay = 30 # seconds
|
||||||
|
logger.info(f"Waiting {final_check_delay}s for a final queue status check to see if workers picked up tasks...")
|
||||||
|
time.sleep(final_check_delay)
|
||||||
|
|
||||||
|
try:
|
||||||
|
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
||||||
|
redis_client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
|
# Log connection details for debugging broker mismatch issues
|
||||||
|
conn_kwargs = redis_client.connection_pool.connection_kwargs
|
||||||
|
logger.info(f"Final check using Redis connection '{redis_conn_id}': "
|
||||||
|
f"host={conn_kwargs.get('host')}, "
|
||||||
|
f"port={conn_kwargs.get('port')}, "
|
||||||
|
f"db={conn_kwargs.get('db')}")
|
||||||
|
|
||||||
|
_inspect_celery_queues(redis_client, [worker_queue])
|
||||||
|
logger.info("Final queue inspection complete. If queues are not empty, workers have not picked up tasks yet. "
|
||||||
|
"If queues are empty, workers have started processing.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not perform final queue inspection: {e}. This does not affect worker ignition.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# DAG Definition
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
default_args = {
|
||||||
|
'owner': 'airflow',
|
||||||
|
'depends_on_past': False,
|
||||||
|
'email_on_failure': False,
|
||||||
|
'email_on_retry': False,
|
||||||
|
'retries': 1,
|
||||||
|
'retry_delay': timedelta(minutes=1),
|
||||||
|
'start_date': days_ago(1),
|
||||||
|
}
|
||||||
|
|
||||||
|
with DAG(
|
||||||
|
dag_id='ytdlp_ops_v02_orchestrator_dl',
|
||||||
|
default_args=default_args,
|
||||||
|
schedule=None, # This DAG runs only when triggered.
|
||||||
|
max_active_runs=1, # Only one ignition process should run at a time.
|
||||||
|
catchup=False,
|
||||||
|
description='Ignition system for ytdlp_ops_v02_dispatcher_dl DAGs.',
|
||||||
|
doc_md="""
|
||||||
|
### YT-DLP v2 (Download) Worker Ignition System
|
||||||
|
|
||||||
|
This DAG acts as an "ignition system" to start one or more self-sustaining worker loops for the **v2 download worker**.
|
||||||
|
It triggers `ytdlp_ops_v02_dispatcher_dl` DAGs, which pull job payloads from `queue2_dl_inbox` and trigger `ytdlp_ops_v02_worker_per_url_dl` workers.
|
||||||
|
""",
|
||||||
|
tags=['ytdlp', 'mgmt', 'master'],
|
||||||
|
params={
|
||||||
|
# --- Ignition Control Parameters ---
|
||||||
|
'total_workers': Param(DEFAULT_TOTAL_WORKERS, type="integer", description="Total number of dispatcher loops to start."),
|
||||||
|
'workers_per_bunch': Param(DEFAULT_WORKERS_PER_BUNCH, type="integer", description="Number of dispatchers to start in each bunch."),
|
||||||
|
'delay_between_workers_s': Param(DEFAULT_WORKER_DELAY_S, type="integer", description="Delay in seconds between starting each dispatcher within a bunch."),
|
||||||
|
'delay_between_bunches_s': Param(DEFAULT_BUNCH_DELAY_S, type="integer", description="Delay in seconds between starting each bunch."),
|
||||||
|
'skip_if_queue_empty': Param(False, type="boolean", title="[Ignition Control] Skip if Queue Empty", description="If True, the orchestrator will not start any dispatchers if the application's work queue is empty."),
|
||||||
|
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
|
||||||
|
'clients': Param('mweb,web_camoufox,tv', type="string", title="[Worker Param] Clients", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
|
||||||
|
|
||||||
|
# --- Download Control Parameters ---
|
||||||
|
'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
|
||||||
|
'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
|
||||||
|
'skip_probe': Param(True, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
|
||||||
|
'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
|
||||||
|
'fragment_retries': Param(2, type="integer", title="[Worker Param] Fragment Retries", description="Number of retries for a fragment before giving up."),
|
||||||
|
'limit_rate': Param('5M', type=["string", "null"], title="[Worker Param] Limit Rate", description="Download speed limit (e.g., 50K, 4.2M)."),
|
||||||
|
'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
|
||||||
|
'min_sleep_interval': Param(5, type="integer", title="[Worker Param] Min Sleep Interval", description="Minimum time to sleep between downloads (seconds)."),
|
||||||
|
'max_sleep_interval': Param(10, type="integer", title="[Worker Param] Max Sleep Interval", description="Maximum time to sleep between downloads (seconds)."),
|
||||||
|
'download_format_preset': Param(
|
||||||
|
'formats_2',
|
||||||
|
type="string",
|
||||||
|
enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'],
|
||||||
|
title="[Worker Param] Download Format Preset",
|
||||||
|
description="Select a predefined format string or choose 'custom' to use the value from 'Custom Download Format'.\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
|
||||||
|
),
|
||||||
|
'download_format_custom': Param(
|
||||||
|
'18,140,299/298/137/136/135/134/133',
|
||||||
|
type="string",
|
||||||
|
title="[Worker Param] Custom Download Format",
|
||||||
|
description="Custom yt-dlp format string. Used when preset is 'custom'. E.g., 'ba[ext=m4a]/bestaudio/best'."
|
||||||
|
),
|
||||||
|
'downloader': Param(
|
||||||
|
'default',
|
||||||
|
type="string",
|
||||||
|
enum=['default', 'aria2c'],
|
||||||
|
title="[Worker Param] Downloader",
|
||||||
|
description="Choose the downloader for yt-dlp."
|
||||||
|
),
|
||||||
|
'downloader_args_aria2c': Param(
|
||||||
|
'aria2c:-x 4 -k 2M --max-download-limit=3M',
|
||||||
|
type="string",
|
||||||
|
title="[Worker Param] Aria2c Downloader Arguments",
|
||||||
|
description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'."
|
||||||
|
),
|
||||||
|
'yt_dlp_extra_args': Param(
|
||||||
|
'--restrict-filenames',
|
||||||
|
type=["string", "null"],
|
||||||
|
title="[Worker Param] Extra yt-dlp arguments",
|
||||||
|
description="Extra command-line arguments for yt-dlp during download."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
) as dag:
|
||||||
|
|
||||||
|
orchestrate_task = PythonOperator(
|
||||||
|
task_id='start_worker_loops',
|
||||||
|
python_callable=orchestrate_workers_ignition_callable,
|
||||||
|
)
|
||||||
|
orchestrate_task.doc_md = """
|
||||||
|
### Start Worker Loops
|
||||||
|
This is the main task that executes the ignition policy.
|
||||||
|
- It triggers `ytdlp_ops_v02_dispatcher_dl` DAGs according to the batch settings.
|
||||||
|
- It passes all its parameters down to the dispatchers, which will use them to trigger workers.
|
||||||
|
"""
|
||||||
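A worked example of the bunching arithmetic described in the doc above can help when sizing an ignition run. This is an illustrative sketch only, not part of the committed DAG, and the parameter values are hypothetical.

# Illustrative sketch of the orchestrator's bunching behaviour (not part of the DAG).
# Assumed inputs: total_workers=5, workers_per_bunch=2, 10s/60s delays.
total_workers = 5
workers_per_bunch = 2
delay_between_workers_s = 10
delay_between_bunches_s = 60

worker_indices = list(range(total_workers))
bunches = [worker_indices[i:i + workers_per_bunch]
           for i in range(0, len(worker_indices), workers_per_bunch)]
print(bunches)  # [[0, 1], [2, 3], [4]] -> 5 dispatcher runs in 3 bunches

# Rough wall-clock estimate of the ignition sequence itself (trigger latency ignored):
# len(bunch)-1 worker delays inside each bunch, plus one bunch delay between bunches.
total_s = (sum((len(b) - 1) * delay_between_workers_s for b in bunches)
           + (len(bunches) - 1) * delay_between_bunches_s)
print(total_s)  # 140 seconds for this example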
@ -6,10 +6,10 @@
|
|||||||
# Distributed under terms of the MIT license.
|
# Distributed under terms of the MIT license.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
DAG for processing a single YouTube URL passed via DAG run configuration.
|
DAG for authenticating a single YouTube URL passed via DAG run configuration.
|
||||||
This is the "Worker" part of a Sensor/Worker pattern.
|
This is the "Auth Worker" part of a separated Auth/Download pattern.
|
||||||
This DAG has been refactored to use the TaskFlow API to implement worker affinity,
|
It acquires a token, saves the info.json, and pushes the token data to a
|
||||||
ensuring all tasks for a single URL run on the same machine.
|
Redis queue for the download worker.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@ -24,12 +24,15 @@ from airflow.operators.dummy import DummyOperator
|
|||||||
from airflow.utils.dates import days_ago
|
from airflow.utils.dates import days_ago
|
||||||
from airflow.utils.task_group import TaskGroup
|
from airflow.utils.task_group import TaskGroup
|
||||||
from airflow.api.common.trigger_dag import trigger_dag
|
from airflow.api.common.trigger_dag import trigger_dag
|
||||||
|
from copy import copy
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
import concurrent.futures
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
|
import redis
|
||||||
import socket
|
import socket
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
@ -37,7 +40,7 @@ import uuid
|
|||||||
|
|
||||||
# Import utility functions and Thrift modules
|
# Import utility functions and Thrift modules
|
||||||
from utils.redis_utils import _get_redis_client
|
from utils.redis_utils import _get_redis_client
|
||||||
from pangramia.yt.common.ttypes import TokenUpdateMode
|
from pangramia.yt.common.ttypes import TokenUpdateMode, AirflowLogContext
|
||||||
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
|
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
|
||||||
from pangramia.yt.tokens_ops import YTTokenOpService
|
from pangramia.yt.tokens_ops import YTTokenOpService
|
||||||
from thrift.protocol import TBinaryProtocol
|
from thrift.protocol import TBinaryProtocol
|
||||||
@ -47,20 +50,114 @@ from thrift.transport.TTransport import TTransportException
|
|||||||
# Configure logging
|
# Configure logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Client Stats Helper ---
|
||||||
|
|
||||||
|
def _update_client_stats(redis_client, clients_str: str, status: str, url: str, machine_id: str, dag_run_id: str):
|
||||||
|
"""Updates success/failure statistics for a client type in Redis."""
|
||||||
|
if not clients_str:
|
||||||
|
logger.warning("Cannot update client stats: 'clients' string is empty.")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Assumption: The service tries clients in the order provided.
|
||||||
|
# We attribute the result to the first client in the list.
|
||||||
|
primary_client = clients_str.split(',')[0].strip()
|
||||||
|
if not primary_client:
|
||||||
|
logger.warning("Cannot update client stats: could not determine primary client.")
|
||||||
|
return
|
||||||
|
|
||||||
|
stats_key = "client_stats"
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Using a pipeline with WATCH for safe concurrent updates.
|
||||||
|
with redis_client.pipeline() as pipe:
|
||||||
|
pipe.watch(stats_key)
|
||||||
|
|
||||||
|
current_stats_json = redis_client.hget(stats_key, primary_client)
|
||||||
|
stats = {}
|
||||||
|
if current_stats_json:
|
||||||
|
try:
|
||||||
|
stats = json.loads(current_stats_json)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logger.warning(f"Could not parse existing stats for client '{primary_client}'. Resetting stats.")
|
||||||
|
stats = {}
|
||||||
|
|
||||||
|
stats.setdefault('success_count', 0)
|
||||||
|
stats.setdefault('failure_count', 0)
|
||||||
|
|
||||||
|
details = {
|
||||||
|
'timestamp': time.time(), 'url': url,
|
||||||
|
'machine_id': machine_id, 'dag_run_id': dag_run_id,
|
||||||
|
}
|
||||||
|
|
||||||
|
if status == 'success':
|
||||||
|
stats['success_count'] += 1
|
||||||
|
stats['latest_success'] = details
|
||||||
|
elif status == 'failure':
|
||||||
|
stats['failure_count'] += 1
|
||||||
|
stats['latest_failure'] = details
|
||||||
|
|
||||||
|
pipe.multi()
|
||||||
|
pipe.hset(stats_key, primary_client, json.dumps(stats))
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
logger.info(f"Successfully updated '{status}' stats for client '{primary_client}'.")
|
||||||
|
|
||||||
|
except redis.exceptions.WatchError:
|
||||||
|
logger.warning(f"WatchError updating stats for client '{primary_client}'. Another process updated it. Skipping this update.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to update client stats for '{primary_client}': {e}", exc_info=True)
|
||||||
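To inspect the statistics written by _update_client_stats from outside Airflow, a minimal read-side sketch is shown below. Only the 'client_stats' hash name and the success_count/failure_count/latest_* fields come from the code above; the Redis connection details are illustrative placeholders.

# Sketch: dump the per-client stats hash maintained by _update_client_stats.
# host/port/db are placeholders; point them at the Redis instance used by the DAGs.
import json
import redis

r = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)
for client_name, raw in r.hgetall("client_stats").items():
    stats = json.loads(raw)
    print(client_name,
          "success:", stats.get("success_count", 0),
          "failure:", stats.get("failure_count", 0),
          "last_failure_url:", stats.get("latest_failure", {}).get("url"))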
|
|
||||||
|
|
||||||
# Default settings from Airflow Variables or hardcoded fallbacks
|
# Default settings from Airflow Variables or hardcoded fallbacks
|
||||||
DEFAULT_QUEUE_NAME = 'video_queue'
|
DEFAULT_QUEUE_NAME = 'queue2_auth'
|
||||||
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
||||||
DEFAULT_TIMEOUT = 3600
|
DEFAULT_TIMEOUT = 3600
|
||||||
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
|
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
|
||||||
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
|
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
|
||||||
|
|
||||||
|
DEFAULT_REQUEST_PARAMS = {
|
||||||
|
"context_reuse_policy": {
|
||||||
|
"enabled": True,
|
||||||
|
"max_age_seconds": 86400,
|
||||||
|
"reuse_visitor_id": True,
|
||||||
|
"reuse_cookies": True
|
||||||
|
},
|
||||||
|
"token_generation_strategy": {
|
||||||
|
"youtubei_js": {
|
||||||
|
"generate_po_token": True,
|
||||||
|
"generate_gvs_token": True
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ytdlp_params": {
|
||||||
|
"use_curl_prefetch": False,
|
||||||
|
"token_supplement_strategy": {
|
||||||
|
"youtubepot_bgutilhttp_extractor": {
|
||||||
|
"enabled": True
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"visitor_id_override": {
|
||||||
|
"enabled": True
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"session_params": {
|
||||||
|
"lang": "en-US",
|
||||||
|
"location": "US",
|
||||||
|
"deviceCategory": "MOBILE",
|
||||||
|
"user_agents": {
|
||||||
|
"youtubei_js": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)",
|
||||||
|
"yt_dlp": "Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
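Because the worker exposes these defaults through the 'request_params_json' DAG param as a JSON string, a caller can override individual fields without retyping the whole structure. The snippet below is a hedged sketch meant to run in the context of this DAG module; the overridden values are arbitrary examples.

# Sketch: build a custom request_params_json for a manual run of the auth worker.
# Assumes DEFAULT_REQUEST_PARAMS (defined above) is in scope; overrides are examples only.
import copy
import json

custom = copy.deepcopy(DEFAULT_REQUEST_PARAMS)
custom["session_params"]["lang"] = "en-GB"           # example override
custom["ytdlp_params"]["use_curl_prefetch"] = True   # example override

dag_run_conf = {"request_params_json": json.dumps(custom)}
# dag_run_conf can then be supplied as the conf payload when triggering the worker DAG.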
|
|
||||||
# The queue is set to a fallback here. The actual worker-specific queue is
|
# The queue is set to a fallback here. The actual worker-specific queue is
|
||||||
# assigned just-in-time by the task_instance_mutation_hook (see: airflow/config/custom_task_hooks.py),
|
# assigned just-in-time by the task_instance_mutation_hook (see: airflow/config/custom_task_hooks.py),
|
||||||
# which parses the target queue from the DAG run_id.
|
# which parses the target queue from the DAG run_id.
|
||||||
DEFAULT_ARGS = {
|
DEFAULT_ARGS = {
|
||||||
'owner': 'airflow',
|
'owner': 'airflow',
|
||||||
'retries': 0,
|
'retries': 0,
|
||||||
'queue': 'queue-dl', # Fallback queue. Will be overridden by the policy hook.
|
'queue': 'queue-auth', # Fallback queue. Will be overridden by the policy hook.
|
||||||
}
|
}
|
||||||
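The comment above points at airflow/config/custom_task_hooks.py, which is not part of this diff. Purely to illustrate the mechanism, a cluster-policy hook of that kind could look like the sketch below; the run_id layout ('worker_run_<queue>_...') is an assumption made for the example, not something this commit defines.

# Hypothetical sketch of a task_instance_mutation_hook; the real implementation lives in
# airflow/config/custom_task_hooks.py and is not shown in this commit.
# ASSUMPTION: the dispatcher encodes the target Celery queue in the run_id,
# e.g. "worker_run_queue-auth_20240101T000000_abcd1234".
def task_instance_mutation_hook(task_instance):
    run_id = getattr(task_instance, "run_id", "") or ""
    if run_id.startswith("worker_run_"):
        parts = run_id.split("_")
        if len(parts) >= 3:
            task_instance.queue = parts[2]  # "queue-auth" for the example run_id above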
|
|
||||||
|
|
||||||
@ -105,7 +202,15 @@ def _get_account_pool(params: dict) -> list:
|
|||||||
if pool_size_param is not None:
|
if pool_size_param is not None:
|
||||||
is_prefix_mode = True
|
is_prefix_mode = True
|
||||||
pool_size = int(pool_size_param)
|
pool_size = int(pool_size_param)
|
||||||
accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
|
|
||||||
|
if params.get('prepend_client_to_account', True):
|
||||||
|
clients_str = params.get('clients', '')
|
||||||
|
primary_client = clients_str.split(',')[0].strip() if clients_str else 'unknown'
|
||||||
|
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
|
||||||
|
new_prefix = f"{prefix}_{timestamp}_{primary_client}"
|
||||||
|
accounts = [f"{new_prefix}_{i:02d}" for i in range(1, pool_size + 1)]
|
||||||
|
else:
|
||||||
|
accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
|
||||||
else:
|
else:
|
||||||
accounts = [prefix]
|
accounts = [prefix]
|
||||||
|
|
||||||
@ -140,6 +245,61 @@ def _get_account_pool(params: dict) -> list:
|
|||||||
logger.info(f"Final active account pool with {len(accounts)} accounts.")
|
logger.info(f"Final active account pool with {len(accounts)} accounts.")
|
||||||
return accounts
|
return accounts
|
||||||
|
|
||||||
|
@task
|
||||||
|
def list_available_formats(token_data: dict, **context):
|
||||||
|
"""
|
||||||
|
Lists available formats for the given video using the info.json.
|
||||||
|
This is for debugging and informational purposes.
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import shlex
|
||||||
|
|
||||||
|
info_json_path = token_data.get('info_json_path')
|
||||||
|
if not (info_json_path and os.path.exists(info_json_path)):
|
||||||
|
logger.warning(f"Cannot list formats: info.json path is missing or file does not exist ({info_json_path}).")
|
||||||
|
return []
|
||||||
|
|
||||||
|
try:
|
||||||
|
cmd = [
|
||||||
|
'yt-dlp',
|
||||||
|
'--verbose',
|
||||||
|
'--list-formats',
|
||||||
|
'--load-info-json', info_json_path,
|
||||||
|
]
|
||||||
|
|
||||||
|
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
|
||||||
|
logger.info(f"Executing yt-dlp command to list formats: {copy_paste_cmd}")
|
||||||
|
|
||||||
|
process = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
|
||||||
|
|
||||||
|
if process.stderr:
|
||||||
|
logger.info(f"yt-dlp --list-formats STDERR:\n{process.stderr}")
|
||||||
|
|
||||||
|
if process.returncode != 0:
|
||||||
|
logger.error(f"yt-dlp --list-formats failed with exit code {process.returncode}")
|
||||||
|
|
||||||
|
available_formats = []
|
||||||
|
if process.stdout:
|
||||||
|
logger.info(f"--- Available Formats ---\n{process.stdout}\n--- End of Formats ---")
|
||||||
|
# Parse the output to get format IDs
|
||||||
|
lines = process.stdout.split('\n')
|
||||||
|
header_found = False
|
||||||
|
for line in lines:
|
||||||
|
if line.startswith('ID '):
|
||||||
|
header_found = True
|
||||||
|
continue
|
||||||
|
if header_found and line.strip() and line.strip()[0].isdigit():
|
||||||
|
format_id = line.split()[0]
|
||||||
|
available_formats.append(format_id)
|
||||||
|
logger.info(f"Parsed available format IDs: {available_formats}")
|
||||||
|
|
||||||
|
return available_formats
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"An error occurred while trying to list formats: {e}", exc_info=True)
|
||||||
|
return []
|
||||||
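Below is a small, self-contained illustration of the parsing loop in list_available_formats, run against a fabricated excerpt of yt-dlp's format table. Note that the leading-digit check means IDs that do not start with a digit (storyboard 'sb0' or HLS-style IDs, for instance) are not collected; the sample lines are made up for the example.

# Illustrative run of the parsing logic above on a fabricated --list-formats excerpt.
sample_stdout = """\
[info] Available formats for iPwdia3gAnk:
ID      EXT   RESOLUTION FPS | FILESIZE   TBR PROTO | VCODEC
sb0     mhtml 48x27        1 |                 mhtml | images
18      mp4   640x360     30 |   3.5MiB  500k https | avc1
140     m4a   audio only     |   3.1MiB  129k https | audio only
"""

available_formats = []
header_found = False
for line in sample_stdout.split('\n'):
    if line.startswith('ID '):
        header_found = True
        continue
    if header_found and line.strip() and line.strip()[0].isdigit():
        available_formats.append(line.split()[0])

print(available_formats)  # ['18', '140'] -- 'sb0' is skipped by the leading-digit check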
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# TASK DEFINITIONS (TaskFlow API)
|
# TASK DEFINITIONS (TaskFlow API)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@ -178,12 +338,36 @@ def get_url_and_assign_account(**context):
|
|||||||
logger.info(f"Worker pinning verified. Task is correctly running on queue '{ti.queue}'.")
|
logger.info(f"Worker pinning verified. Task is correctly running on queue '{ti.queue}'.")
|
||||||
# --- End Verification ---
|
# --- End Verification ---
|
||||||
|
|
||||||
# The URL is passed by the dispatcher DAG.
|
# The URL is passed by the dispatcher DAG via 'url_to_process'.
|
||||||
|
# For manual runs, we fall back to 'manual_url_to_process'.
|
||||||
url_to_process = params.get('url_to_process')
|
url_to_process = params.get('url_to_process')
|
||||||
if not url_to_process:
|
if not url_to_process:
|
||||||
raise AirflowException("'url_to_process' was not found in the DAG run configuration.")
|
url_to_process = params.get('manual_url_to_process')
|
||||||
|
if url_to_process:
|
||||||
|
logger.info(f"Using URL from manual run parameter: '{url_to_process}'")
|
||||||
|
|
||||||
|
if not url_to_process:
|
||||||
|
raise AirflowException("No URL to process. For manual runs, please provide a URL in the 'manual_url_to_process' parameter.")
|
||||||
logger.info(f"Received URL '{url_to_process}' to process.")
|
logger.info(f"Received URL '{url_to_process}' to process.")
|
||||||
|
|
||||||
|
# Mark the URL as in-progress in Redis
|
||||||
|
try:
|
||||||
|
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
||||||
|
queue_name = params.get('queue_name', DEFAULT_QUEUE_NAME)
|
||||||
|
progress_queue = f"{queue_name}_progress"
|
||||||
|
client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
|
progress_data = {
|
||||||
|
'status': 'in_progress',
|
||||||
|
'start_time': time.time(),
|
||||||
|
'dag_run_id': context['dag_run'].run_id,
|
||||||
|
'hostname': socket.gethostname(),
|
||||||
|
}
|
||||||
|
client.hset(progress_queue, url_to_process, json.dumps(progress_data))
|
||||||
|
logger.info(f"Marked URL '{url_to_process}' as in-progress.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not mark URL as in-progress in Redis: {e}", exc_info=True)
|
||||||
|
|
||||||
# Account assignment logic is the same as before.
|
# Account assignment logic is the same as before.
|
||||||
account_id = random.choice(_get_account_pool(params))
|
account_id = random.choice(_get_account_pool(params))
|
||||||
logger.info(f"Selected account '{account_id}' for this run.")
|
logger.info(f"Selected account '{account_id}' for this run.")
|
||||||
@ -206,22 +390,100 @@ def get_token(initial_data: dict, **context):
|
|||||||
|
|
||||||
host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT))
|
host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT))
|
||||||
machine_id = params.get('machine_id') or socket.gethostname()
|
machine_id = params.get('machine_id') or socket.gethostname()
|
||||||
|
clients = params.get('clients')
|
||||||
|
request_params_json = params.get('request_params_json', '{}')
|
||||||
|
assigned_proxy_url = params.get('assigned_proxy_url')
|
||||||
|
|
||||||
|
# Pretty-print the request parameters for debugging
|
||||||
|
try:
|
||||||
|
pretty_request_params = json.dumps(json.loads(request_params_json), indent=2)
|
||||||
|
logger.info(f"\n--- Request Parameters ---\n{pretty_request_params}\n--- End of Request Parameters ---")
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
logger.warning("Could not parse request_params_json. Using raw content.")
|
||||||
|
logger.info(f"\n--- Raw Request Parameters ---\n{request_params_json}\n--- End of Raw Request Parameters ---")
|
||||||
|
|
||||||
|
# Construct Airflow log context to pass to the service
|
||||||
|
try:
|
||||||
|
from airflow.configuration import conf
|
||||||
|
remote_base = conf.get('logging', 'remote_base_log_folder')
|
||||||
|
log_path = (
|
||||||
|
f"{remote_base}/dag_id={ti.dag_id}/run_id={ti.run_id}/"
|
||||||
|
f"task_id={ti.task_id}/attempt={ti.try_number}.log"
|
||||||
|
)
|
||||||
|
airflow_log_context = AirflowLogContext(
|
||||||
|
logS3Path=log_path,
|
||||||
|
dagId=ti.dag_id,
|
||||||
|
runId=ti.run_id,
|
||||||
|
taskId=ti.task_id,
|
||||||
|
tryNumber=ti.try_number,
|
||||||
|
workerHostname=socket.gethostname(),
|
||||||
|
queue=ti.queue
|
||||||
|
)
|
||||||
|
logger.info(f"Constructed Airflow log context for yt-ops service: {airflow_log_context}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Could not construct full Airflow log context: {e}. Creating a basic one.")
|
||||||
|
airflow_log_context = AirflowLogContext(
|
||||||
|
dagId=ti.dag_id,
|
||||||
|
runId=ti.run_id,
|
||||||
|
taskId=ti.task_id,
|
||||||
|
tryNumber=ti.try_number,
|
||||||
|
workerHostname=socket.gethostname(),
|
||||||
|
queue=ti.queue
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' ---")
|
logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' (Clients: {clients}) ---")
|
||||||
client, transport = None, None
|
client, transport = None, None
|
||||||
try:
|
try:
|
||||||
client, transport = _get_thrift_client(host, port, timeout)
|
client, transport = _get_thrift_client(host, port, timeout)
|
||||||
token_data = client.getOrRefreshToken(accountId=account_id, updateType=TokenUpdateMode.AUTO, url=url, clients=params.get('clients'), machineId=machine_id)
|
token_data = client.getOrRefreshToken(
|
||||||
|
accountId=account_id,
|
||||||
|
updateType=TokenUpdateMode.AUTO,
|
||||||
|
url=url,
|
||||||
|
clients=clients,
|
||||||
|
machineId=machine_id,
|
||||||
|
airflowLogContext=airflow_log_context,
|
||||||
|
requestParamsJson=request_params_json,
|
||||||
|
assignedProxyUrl=assigned_proxy_url
|
||||||
|
)
|
||||||
|
|
||||||
|
# Log a compact summary of the Thrift response, omitting large/detailed fields.
|
||||||
|
summary_token_data = copy(token_data)
|
||||||
|
if hasattr(summary_token_data, 'infoJson') and summary_token_data.infoJson:
|
||||||
|
summary_token_data.infoJson = f"... ({len(summary_token_data.infoJson)} bytes) ..."
|
||||||
|
if hasattr(summary_token_data, 'cookiesBlob') and summary_token_data.cookiesBlob:
|
||||||
|
summary_token_data.cookiesBlob = f"... ({len(summary_token_data.cookiesBlob)} bytes) ..."
|
||||||
|
# These will be logged separately below.
|
||||||
|
if hasattr(summary_token_data, 'requestSummary'):
|
||||||
|
summary_token_data.requestSummary = "..."
|
||||||
|
if hasattr(summary_token_data, 'communicationLogPaths'):
|
||||||
|
summary_token_data.communicationLogPaths = "..."
|
||||||
|
logger.info(f"Thrift service response summary: {summary_token_data}")
|
||||||
|
|
||||||
|
request_summary = getattr(token_data, 'requestSummary', None)
|
||||||
|
if request_summary:
|
||||||
|
# Prepending a newline for better separation in logs.
|
||||||
|
logger.info(f"\n--- Request Summary ---\n{request_summary}")
|
||||||
|
|
||||||
|
communication_log_paths = getattr(token_data, 'communicationLogPaths', None)
|
||||||
|
if communication_log_paths:
|
||||||
|
logger.info("--- Communication Log Paths ---")
|
||||||
|
for path in communication_log_paths:
|
||||||
|
logger.info(f" - {path}")
|
||||||
|
|
||||||
info_json = getattr(token_data, 'infoJson', None)
|
info_json = getattr(token_data, 'infoJson', None)
|
||||||
if not (info_json and json.loads(info_json)):
|
if not (info_json and json.loads(info_json)):
|
||||||
raise AirflowException("Service returned success but info.json was empty or invalid.")
|
raise AirflowException("Service returned success but info.json was empty or invalid.")
|
||||||
|
|
||||||
video_id = _extract_video_id(url)
|
video_id = _extract_video_id(url)
|
||||||
os.makedirs(info_json_dir, exist_ok=True)
|
|
||||||
# Use a readable timestamp for a unique filename on each attempt.
|
|
||||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
|
|
||||||
|
# Create a unique directory for this job's artifacts
|
||||||
|
job_dir_name = f"{timestamp}-{video_id or 'unknown'}"
|
||||||
|
job_dir_path = os.path.join(info_json_dir, job_dir_name)
|
||||||
|
os.makedirs(job_dir_path, exist_ok=True)
|
||||||
|
|
||||||
|
info_json_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json"
|
||||||
|
info_json_path = os.path.join(job_dir_path, info_json_filename)
|
||||||
with open(info_json_path, 'w', encoding='utf-8') as f:
|
with open(info_json_path, 'w', encoding='utf-8') as f:
|
||||||
f.write(info_json)
|
f.write(info_json)
|
||||||
|
|
||||||
@ -232,6 +494,7 @@ def get_token(initial_data: dict, **context):
|
|||||||
'ytdlp_command': getattr(token_data, 'ytdlpCommand', None),
|
'ytdlp_command': getattr(token_data, 'ytdlpCommand', None),
|
||||||
'successful_account_id': account_id,
|
'successful_account_id': account_id,
|
||||||
'original_url': url, # Include original URL for fallback
|
'original_url': url, # Include original URL for fallback
|
||||||
|
'clients': clients, # Pass clients string for accurate stats
|
||||||
}
|
}
|
||||||
except (PBServiceException, PBUserException, TTransportException) as e:
|
except (PBServiceException, PBUserException, TTransportException) as e:
|
||||||
error_context = getattr(e, 'context', None)
|
error_context = getattr(e, 'context', None)
|
||||||
@ -297,8 +560,11 @@ def handle_bannable_error_branch(task_id_to_check: str, **context):
|
|||||||
return 'ban_account_and_prepare_for_retry'
|
return 'ban_account_and_prepare_for_retry'
|
||||||
if policy in ['retry_on_connection_error', 'retry_without_ban']:
|
if policy in ['retry_on_connection_error', 'retry_without_ban']:
|
||||||
return 'assign_new_account_for_direct_retry'
|
return 'assign_new_account_for_direct_retry'
|
||||||
if policy == 'stop_loop':
|
if policy in ['stop_loop', 'stop_loop_on_auth_proceed_on_download_error']:
|
||||||
return 'ban_and_report_immediately'
|
return 'ban_and_report_immediately'
|
||||||
|
if policy == 'proceed_loop_under_manual_inspection':
|
||||||
|
logger.warning(f"Bannable error with 'proceed_loop_under_manual_inspection' policy. Reporting failure and continuing loop. MANUAL INTERVENTION IS LIKELY REQUIRED.")
|
||||||
|
return 'report_bannable_and_continue'
|
||||||
|
|
||||||
# Any other error is considered fatal for this run.
|
# Any other error is considered fatal for this run.
|
||||||
logger.error(f"Unhandled or non-retryable error '{error_code}' from '{task_id_to_check}'. Marking as fatal.")
|
logger.error(f"Unhandled or non-retryable error '{error_code}' from '{task_id_to_check}'. Marking as fatal.")
|
||||||
@ -447,121 +713,43 @@ def ban_and_report_immediately(initial_data: dict, reason: str, **context):
|
|||||||
return initial_data # Pass data along if needed by reporting
|
return initial_data # Pass data along if needed by reporting
|
||||||
|
|
||||||
@task
|
@task
|
||||||
def download_and_probe(token_data: dict, **context):
|
def push_auth_success_to_redis(initial_data: dict, token_data: dict, **context):
|
||||||
"""
|
"""
|
||||||
Uses the retrieved token data to download and probe the media file.
|
On successful token acquisition, pushes the complete token data to the
|
||||||
This version uses subprocess directly with an argument list for better security and clarity.
|
Redis queue for the download worker and records the auth success.
|
||||||
"""
|
"""
|
||||||
import subprocess
|
|
||||||
import shlex
|
|
||||||
|
|
||||||
params = context['params']
|
|
||||||
info_json_path = token_data.get('info_json_path')
|
|
||||||
proxy = token_data.get('socks_proxy')
|
|
||||||
original_url = token_data.get('original_url')
|
|
||||||
download_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles/video')
|
|
||||||
|
|
||||||
download_format = params.get('download_format', 'ba[ext=m4a]/bestaudio/best')
|
|
||||||
output_template = params.get('output_path_template', "%(title)s [%(id)s].%(ext)s")
|
|
||||||
full_output_path = os.path.join(download_dir, output_template)
|
|
||||||
retry_on_probe_failure = params.get('retry_on_probe_failure', False)
|
|
||||||
|
|
||||||
if not (info_json_path and os.path.exists(info_json_path)):
|
|
||||||
raise AirflowException(f"Error: info.json path is missing or file does not exist ({info_json_path}).")
|
|
||||||
|
|
||||||
def run_yt_dlp():
|
|
||||||
"""Constructs and runs the yt-dlp command, returning the final filename."""
|
|
||||||
cmd = [
|
|
||||||
'yt-dlp',
|
|
||||||
'--verbose',
|
|
||||||
'--load-info-json', info_json_path,
|
|
||||||
'-f', download_format,
|
|
||||||
'-o', full_output_path,
|
|
||||||
'--print', 'filename',
|
|
||||||
'--continue',
|
|
||||||
'--no-progress',
|
|
||||||
'--no-simulate',
|
|
||||||
'--no-write-info-json',
|
|
||||||
'--ignore-errors',
|
|
||||||
'--no-playlist',
|
|
||||||
]
|
|
||||||
if proxy:
|
|
||||||
cmd.extend(['--proxy', proxy])
|
|
||||||
|
|
||||||
# Crucially, add the original URL to allow yt-dlp to refresh expired download links,
|
|
||||||
# which is the most common cause of HTTP 403 errors.
|
|
||||||
if original_url:
|
|
||||||
cmd.append(original_url)
|
|
||||||
|
|
||||||
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
|
|
||||||
logger.info(f"Executing yt-dlp command: {copy_paste_cmd}")
|
|
||||||
|
|
||||||
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
|
|
||||||
|
|
||||||
if process.returncode != 0:
|
|
||||||
logger.error(f"yt-dlp failed with exit code {process.returncode}")
|
|
||||||
logger.error(f"STDOUT: {process.stdout}")
|
|
||||||
logger.error(f"STDERR: {process.stderr}")
|
|
||||||
raise AirflowException("yt-dlp command failed.")
|
|
||||||
|
|
||||||
# Get the last line of stdout, which should be the filename
|
|
||||||
final_filename = process.stdout.strip().split('\n')[-1]
|
|
||||||
if not (final_filename and os.path.exists(final_filename)):
|
|
||||||
logger.error(f"Download command finished but the output file does not exist: '{final_filename}'")
|
|
||||||
logger.error(f"Full STDOUT:\n{process.stdout}")
|
|
||||||
logger.error(f"Full STDERR:\n{process.stderr}")
|
|
||||||
raise AirflowException(f"Download failed or did not produce a file: {final_filename}")
|
|
||||||
|
|
||||||
logger.info(f"SUCCESS: Download complete. Final file at: {final_filename}")
|
|
||||||
return final_filename
|
|
||||||
|
|
||||||
def run_ffmpeg_probe(filename):
|
|
||||||
"""Probes the given file with ffmpeg to check for corruption."""
|
|
||||||
logger.info(f"Probing downloaded file: {filename}")
|
|
||||||
try:
|
|
||||||
subprocess.run(['ffmpeg', '-v', 'error', '-i', filename, '-f', 'null', '-'], check=True, capture_output=True, text=True)
|
|
||||||
logger.info("SUCCESS: Probe confirmed valid media file.")
|
|
||||||
except subprocess.CalledProcessError as e:
|
|
||||||
logger.error(f"ffmpeg probe check failed for '{filename}'. The file might be corrupt.")
|
|
||||||
logger.error(f"ffmpeg STDERR: {e.stderr}")
|
|
||||||
raise AirflowException("ffmpeg probe failed.")
|
|
||||||
|
|
||||||
# --- Main Execution Logic ---
|
|
||||||
final_filename = run_yt_dlp()
|
|
||||||
try:
|
|
||||||
run_ffmpeg_probe(final_filename)
|
|
||||||
return final_filename
|
|
||||||
except AirflowException as e:
|
|
||||||
if "probe failed" in str(e) and retry_on_probe_failure:
|
|
||||||
logger.warning("Probe failed. Attempting one re-download...")
|
|
||||||
try:
|
|
||||||
# Rename the failed file to allow for a fresh download attempt
|
|
||||||
part_file = f"{final_filename}.part"
|
|
||||||
os.rename(final_filename, part_file)
|
|
||||||
logger.info(f"Renamed corrupted file to {part_file}")
|
|
||||||
except OSError as rename_err:
|
|
||||||
logger.error(f"Could not rename corrupted file: {rename_err}")
|
|
||||||
|
|
||||||
final_filename_retry = run_yt_dlp()
|
|
||||||
run_ffmpeg_probe(final_filename_retry)
|
|
||||||
return final_filename_retry
|
|
||||||
else:
|
|
||||||
# Re-raise the original exception if no retry is attempted
|
|
||||||
raise
|
|
||||||
|
|
||||||
@task
|
|
||||||
def mark_url_as_success(initial_data: dict, downloaded_file_path: str, token_data: dict, **context):
|
|
||||||
"""Records the successful result in Redis."""
|
|
||||||
params = context['params']
|
params = context['params']
|
||||||
url = initial_data['url_to_process']
|
url = initial_data['url_to_process']
|
||||||
result_data = {
|
|
||||||
'status': 'success', 'end_time': time.time(), 'url': url,
|
# The download inbox queue is derived from the auth queue name.
|
||||||
'downloaded_file_path': downloaded_file_path, **token_data,
|
dl_inbox_queue = f"{params['queue_name'].replace('_auth', '_dl')}_inbox"
|
||||||
'dag_run_id': context['dag_run'].run_id,
|
auth_result_queue = f"{params['queue_name']}_result"
|
||||||
}
|
progress_queue = f"{params['queue_name']}_progress"
|
||||||
|
|
||||||
client = _get_redis_client(params['redis_conn_id'])
|
client = _get_redis_client(params['redis_conn_id'])
|
||||||
client.hset(f"{params['queue_name']}_result", url, json.dumps(result_data))
|
|
||||||
logger.info(f"Stored success result for URL '{url}'.")
|
payload = {
|
||||||
|
'timestamp': time.time(),
|
||||||
|
'dag_run_id': context['dag_run'].run_id,
|
||||||
|
**token_data
|
||||||
|
}
|
||||||
|
|
||||||
|
result_data = {
|
||||||
|
'status': 'success',
|
||||||
|
'end_time': time.time(),
|
||||||
|
'url': url,
|
||||||
|
'dag_run_id': context['dag_run'].run_id,
|
||||||
|
'token_data': token_data
|
||||||
|
}
|
||||||
|
|
||||||
|
with client.pipeline() as pipe:
|
||||||
|
pipe.lpush(dl_inbox_queue, json.dumps(payload))
|
||||||
|
pipe.hset(auth_result_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hdel(progress_queue, url)
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
logger.info(f"Pushed successful auth data for URL '{url}' to '{dl_inbox_queue}'.")
|
||||||
|
logger.info(f"Stored success result for auth on URL '{url}' in '{auth_result_queue}'.")
|
||||||
|
|
||||||
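To make the queue-name derivation concrete: with the default queue_name of 'queue2_auth', push_auth_success_to_redis touches the keys sketched below. The snippet only restates the string manipulation from the task above.

# Worked example of the Redis key names used by push_auth_success_to_redis
# for the default queue_name 'queue2_auth'.
queue_name = "queue2_auth"

dl_inbox_queue = f"{queue_name.replace('_auth', '_dl')}_inbox"   # -> "queue2_dl_inbox"
auth_result_queue = f"{queue_name}_result"                       # -> "queue2_auth_result"
progress_queue = f"{queue_name}_progress"                        # -> "queue2_auth_progress"

print(dl_inbox_queue, auth_result_queue, progress_queue)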
@task(trigger_rule='one_failed')
|
@task(trigger_rule='one_failed')
|
||||||
def report_failure_and_continue(**context):
|
def report_failure_and_continue(**context):
|
||||||
@ -606,15 +794,26 @@ def report_failure_and_continue(**context):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
client = _get_redis_client(params['redis_conn_id'])
|
client = _get_redis_client(params['redis_conn_id'])
|
||||||
|
|
||||||
|
# Update client-specific stats
|
||||||
|
try:
|
||||||
|
machine_id = params.get('machine_id') or socket.gethostname()
|
||||||
|
_update_client_stats(client, params.get('clients', ''), 'failure', url, machine_id, context['dag_run'].run_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not update client stats on failure: {e}", exc_info=True)
|
||||||
|
|
||||||
result_queue = f"{params['queue_name']}_result"
|
result_queue = f"{params['queue_name']}_result"
|
||||||
fail_queue = f"{params['queue_name']}_fail"
|
fail_queue = f"{params['queue_name']}_fail"
|
||||||
|
|
||||||
|
progress_queue = f"{params['queue_name']}_progress"
|
||||||
|
|
||||||
with client.pipeline() as pipe:
|
with client.pipeline() as pipe:
|
||||||
pipe.hset(result_queue, url, json.dumps(result_data))
|
pipe.hset(result_queue, url, json.dumps(result_data))
|
||||||
pipe.hset(fail_queue, url, json.dumps(result_data))
|
pipe.hset(fail_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hdel(progress_queue, url)
|
||||||
pipe.execute()
|
pipe.execute()
|
||||||
|
|
||||||
logger.info(f"Stored failure result for URL '{url}' in '{result_queue}' and '{fail_queue}'.")
|
logger.info(f"Stored failure result for URL '{url}' in '{result_queue}' and '{fail_queue}' and removed from progress queue.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Could not report failure to Redis: {e}", exc_info=True)
|
logger.error(f"Could not report failure to Redis: {e}", exc_info=True)
|
||||||
|
|
||||||
@ -648,6 +847,15 @@ def handle_fatal_error(**context):
|
|||||||
|
|
||||||
# Report failure to Redis so the URL can be reprocessed later
|
# Report failure to Redis so the URL can be reprocessed later
|
||||||
try:
|
try:
|
||||||
|
client = _get_redis_client(params['redis_conn_id'])
|
||||||
|
|
||||||
|
# Update client-specific stats
|
||||||
|
try:
|
||||||
|
machine_id = params.get('machine_id') or socket.gethostname()
|
||||||
|
_update_client_stats(client, params.get('clients', ''), 'failure', url, machine_id, context['dag_run'].run_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not update client stats on fatal error: {e}", exc_info=True)
|
||||||
|
|
||||||
result_data = {
|
result_data = {
|
||||||
'status': 'failed',
|
'status': 'failed',
|
||||||
'end_time': time.time(),
|
'end_time': time.time(),
|
||||||
@ -657,13 +865,15 @@ def handle_fatal_error(**context):
|
|||||||
'error_message': 'Fatal non-retryable error occurred',
|
'error_message': 'Fatal non-retryable error occurred',
|
||||||
'error_details': error_details
|
'error_details': error_details
|
||||||
}
|
}
|
||||||
client = _get_redis_client(params['redis_conn_id'])
|
|
||||||
result_queue = f"{params['queue_name']}_result"
|
result_queue = f"{params['queue_name']}_result"
|
||||||
fail_queue = f"{params['queue_name']}_fail"
|
fail_queue = f"{params['queue_name']}_fail"
|
||||||
|
|
||||||
|
progress_queue = f"{params['queue_name']}_progress"
|
||||||
|
|
||||||
with client.pipeline() as pipe:
|
with client.pipeline() as pipe:
|
||||||
pipe.hset(result_queue, url, json.dumps(result_data))
|
pipe.hset(result_queue, url, json.dumps(result_data))
|
||||||
pipe.hset(fail_queue, url, json.dumps(result_data))
|
pipe.hset(fail_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hdel(progress_queue, url)
|
||||||
pipe.execute()
|
pipe.execute()
|
||||||
|
|
||||||
logger.info(f"Stored fatal error result for URL '{url}' in '{result_queue}' and '{fail_queue}' for later reprocessing.")
|
logger.info(f"Stored fatal error result for URL '{url}' in '{result_queue}' and '{fail_queue}' for later reprocessing.")
|
||||||
@ -683,6 +893,12 @@ def continue_processing_loop(**context):
|
|||||||
params = context['params']
|
params = context['params']
|
||||||
dag_run = context['dag_run']
|
dag_run = context['dag_run']
|
||||||
|
|
||||||
|
# Do not continue the loop for manual runs of the worker DAG.
|
||||||
|
# A worker DAG triggered by the dispatcher will have a run_id starting with 'worker_run_'.
|
||||||
|
if not dag_run.run_id.startswith('worker_run_'):
|
||||||
|
logger.info(f"DAG run '{dag_run.run_id}' does not appear to be triggered by the dispatcher. Stopping processing loop.")
|
||||||
|
return
|
||||||
|
|
||||||
# Create a new unique run_id for the dispatcher.
|
# Create a new unique run_id for the dispatcher.
|
||||||
# Using a timestamp and UUID ensures the ID is unique and does not grow in length over time,
|
# Using a timestamp and UUID ensures the ID is unique and does not grow in length over time,
|
||||||
# preventing database errors.
|
# preventing database errors.
|
||||||
@ -697,7 +913,7 @@ def continue_processing_loop(**context):
|
|||||||
|
|
||||||
logger.info(f"Worker finished successfully. Triggering a new dispatcher ('{new_dispatcher_run_id}') to continue the loop.")
|
logger.info(f"Worker finished successfully. Triggering a new dispatcher ('{new_dispatcher_run_id}') to continue the loop.")
|
||||||
trigger_dag(
|
trigger_dag(
|
||||||
dag_id='ytdlp_ops_dispatcher',
|
dag_id='ytdlp_ops_v02_dispatcher_auth',
|
||||||
run_id=new_dispatcher_run_id,
|
run_id=new_dispatcher_run_id,
|
||||||
conf=conf_to_pass,
|
conf=conf_to_pass,
|
||||||
replace_microseconds=False
|
replace_microseconds=False
|
||||||
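As a quick illustration of the run_id guard added to continue_processing_loop above (the first run_id is a made-up example of a dispatcher-triggered run; the other two use Airflow's standard manual/scheduled prefixes):

# Which run_ids keep the self-sustaining loop going? (illustrative values only)
for run_id in (
    "worker_run_queue-auth_20240101T000000_abcd1234",   # dispatcher-triggered -> loop continues
    "manual__2024-01-01T00:00:00+00:00",                 # manual run -> loop stops
    "scheduled__2024-01-01T00:00:00+00:00",              # scheduled run -> loop stops
):
    print(run_id, "continues loop:", run_id.startswith("worker_run_"))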
@ -711,6 +927,7 @@ def handle_retry_failure_branch(task_id_to_check: str, **context):
|
|||||||
On retry, most errors are considered fatal for the URL, but not for the system.
|
On retry, most errors are considered fatal for the URL, but not for the system.
|
||||||
"""
|
"""
|
||||||
ti = context['task_instance']
|
ti = context['task_instance']
|
||||||
|
params = context['params']
|
||||||
error_details = ti.xcom_pull(task_ids=task_id_to_check, key='error_details')
|
error_details = ti.xcom_pull(task_ids=task_id_to_check, key='error_details')
|
||||||
if not error_details:
|
if not error_details:
|
||||||
return 'handle_fatal_error'
|
return 'handle_fatal_error'
|
||||||
@ -720,8 +937,8 @@ def handle_retry_failure_branch(task_id_to_check: str, **context):
|
|||||||
|
|
||||||
# Check if this is an age confirmation error - should not stop the loop
|
# Check if this is an age confirmation error - should not stop the loop
|
||||||
if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower():
|
if "Sign in to confirm your age" in error_message or "confirm your age" in error_message.lower():
|
||||||
logger.info(f"Age confirmation error detected on retry from '{task_id_to_check}'. Reporting failure and continuing loop.")
|
logger.info(f"Age confirmation error detected on retry from '{task_id_to_check}'. This is a content restriction, not a bot detection issue.")
|
||||||
return 'report_failure_and_continue'
|
return 'handle_age_restriction_error'
|
||||||
|
|
||||||
if error_code == 'TRANSPORT_ERROR':
|
if error_code == 'TRANSPORT_ERROR':
|
||||||
logger.error(f"Fatal Thrift connection error on retry from '{task_id_to_check}'.")
|
logger.error(f"Fatal Thrift connection error on retry from '{task_id_to_check}'.")
|
||||||
@ -729,6 +946,11 @@ def handle_retry_failure_branch(task_id_to_check: str, **context):
|
|||||||
|
|
||||||
is_bannable = error_code in ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"]
|
is_bannable = error_code in ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"]
|
||||||
if is_bannable:
|
if is_bannable:
|
||||||
|
policy = params.get('on_bannable_failure', 'retry_with_new_account')
|
||||||
|
if policy == 'proceed_loop_under_manual_inspection':
|
||||||
|
logger.warning(f"Bannable error '{error_code}' on retry with 'proceed_loop_under_manual_inspection' policy. Reporting failure and continuing loop. MANUAL INTERVENTION IS LIKELY REQUIRED.")
|
||||||
|
return 'report_bannable_and_continue'
|
||||||
|
|
||||||
logger.warning(f"Bannable error '{error_code}' on retry. Banning account and reporting failure.")
|
logger.warning(f"Bannable error '{error_code}' on retry. Banning account and reporting failure.")
|
||||||
return 'ban_and_report_after_retry'
|
return 'ban_and_report_after_retry'
|
||||||
|
|
||||||
@ -745,11 +967,6 @@ def ban_and_report_after_retry(retry_data: dict, reason: str, **context):
|
|||||||
return retry_data
|
return retry_data
|
||||||
|
|
||||||
|
|
||||||
@task.branch(trigger_rule='one_failed')
|
|
||||||
def handle_download_failure_branch(**context):
|
|
||||||
"""If download or probe fails, routes to the standard failure reporting."""
|
|
||||||
logger.warning("Download or probe failed. Reporting failure and continuing loop.")
|
|
||||||
return 'report_failure_and_continue'
|
|
||||||
|
|
||||||
|
|
||||||
@task(trigger_rule='one_success')
|
@task(trigger_rule='one_success')
|
||||||
@ -768,7 +985,69 @@ def coalesce_token_data(get_token_result=None, retry_get_token_result=None):
|
|||||||
raise AirflowException("Could not find a successful token result from any attempt.")
|
raise AirflowException("Could not find a successful token result from any attempt.")
|
||||||
|
|
||||||
|
|
||||||
@task(trigger_rule='one_failed')
|
@task
|
||||||
|
def report_bannable_and_continue(**context):
|
||||||
|
"""
|
||||||
|
Handles a bannable error by reporting it, but continues the loop
|
||||||
|
as per the 'proceed_loop_under_manual_inspection' policy.
|
||||||
|
"""
|
||||||
|
params = context['params']
|
||||||
|
ti = context['task_instance']
|
||||||
|
url = params.get('url_to_process', 'unknown')
|
||||||
|
|
||||||
|
# Collect error details
|
||||||
|
error_details = {}
|
||||||
|
first_token_task_id = 'get_token'
|
||||||
|
retry_token_task_id = 'retry_get_token'
|
||||||
|
|
||||||
|
first_token_error = ti.xcom_pull(task_ids=first_token_task_id, key='error_details')
|
||||||
|
retry_token_error = ti.xcom_pull(task_ids=retry_token_task_id, key='error_details')
|
||||||
|
|
||||||
|
# Use the most recent error details
|
||||||
|
if retry_token_error:
|
||||||
|
error_details = retry_token_error
|
||||||
|
elif first_token_error:
|
||||||
|
error_details = first_token_error
|
||||||
|
|
||||||
|
logger.error(f"Bannable error for URL '{url}'. Policy is to continue loop under manual supervision.")
|
||||||
|
|
||||||
|
# Report failure to Redis
|
||||||
|
try:
|
||||||
|
client = _get_redis_client(params['redis_conn_id'])
|
||||||
|
|
||||||
|
# Update client-specific stats
|
||||||
|
try:
|
||||||
|
machine_id = params.get('machine_id') or socket.gethostname()
|
||||||
|
_update_client_stats(client, params.get('clients', ''), 'failure', url, machine_id, context['dag_run'].run_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not update client stats on bannable error: {e}", exc_info=True)
|
||||||
|
|
||||||
|
result_data = {
|
||||||
|
'status': 'failed',
|
||||||
|
'end_time': time.time(),
|
||||||
|
'url': url,
|
||||||
|
'dag_run_id': context['dag_run'].run_id,
|
||||||
|
'error': 'bannable_error_manual_override',
|
||||||
|
'error_message': 'Bannable error occurred, but policy is set to continue loop under manual supervision.',
|
||||||
|
'error_details': error_details
|
||||||
|
}
|
||||||
|
result_queue = f"{params['queue_name']}_result"
|
||||||
|
fail_queue = f"{params['queue_name']}_fail"
|
||||||
|
|
||||||
|
progress_queue = f"{params['queue_name']}_progress"
|
||||||
|
|
||||||
|
with client.pipeline() as pipe:
|
||||||
|
pipe.hset(result_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hset(fail_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hdel(progress_queue, url)
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
logger.info(f"Stored bannable error for URL '{url}' in '{result_queue}' and '{fail_queue}'.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not report bannable error to Redis: {e}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
@task
|
||||||
def handle_age_restriction_error(**context):
|
def handle_age_restriction_error(**context):
|
||||||
"""
|
"""
|
||||||
Handles age restriction errors specifically. These are content restrictions
|
Handles age restriction errors specifically. These are content restrictions
|
||||||
@ -797,6 +1076,15 @@ def handle_age_restriction_error(**context):
|
|||||||
|
|
||||||
# Report failure to Redis so the URL can be marked as failed
|
# Report failure to Redis so the URL can be marked as failed
|
||||||
try:
|
try:
|
||||||
|
client = _get_redis_client(params['redis_conn_id'])
|
||||||
|
|
||||||
|
# Update client-specific stats
|
||||||
|
try:
|
||||||
|
machine_id = params.get('machine_id') or socket.gethostname()
|
||||||
|
_update_client_stats(client, params.get('clients', ''), 'failure', url, machine_id, context['dag_run'].run_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not update client stats on age restriction error: {e}", exc_info=True)
|
||||||
|
|
||||||
result_data = {
|
result_data = {
|
||||||
'status': 'failed',
|
'status': 'failed',
|
||||||
'end_time': time.time(),
|
'end_time': time.time(),
|
||||||
@ -806,13 +1094,15 @@ def handle_age_restriction_error(**context):
|
|||||||
'error_message': 'Content requires age confirmation',
|
'error_message': 'Content requires age confirmation',
|
||||||
'error_details': error_details
|
'error_details': error_details
|
||||||
}
|
}
|
||||||
client = _get_redis_client(params['redis_conn_id'])
|
|
||||||
result_queue = f"{params['queue_name']}_result"
|
result_queue = f"{params['queue_name']}_result"
|
||||||
fail_queue = f"{params['queue_name']}_fail"
|
fail_queue = f"{params['queue_name']}_fail"
|
||||||
|
|
||||||
|
        progress_queue = f"{params['queue_name']}_progress"

        with client.pipeline() as pipe:
            pipe.hset(result_queue, url, json.dumps(result_data))
            pipe.hset(fail_queue, url, json.dumps(result_data))
+           pipe.hdel(progress_queue, url)
            pipe.execute()

        logger.info(f"Stored age restriction error for URL '{url}' in '{result_queue}' and '{fail_queue}'.")

@@ -826,7 +1116,7 @@ def handle_age_restriction_error(**context):
# DAG Definition with TaskGroups
# =============================================================================
with DAG(
-   dag_id='ytdlp_ops_worker_per_url',
+   dag_id='ytdlp_ops_v02_worker_per_url_auth',
    default_args=DEFAULT_ARGS,
    schedule=None,
    start_date=days_ago(1),
@@ -834,6 +1124,7 @@ with DAG(
    tags=['ytdlp', 'worker'],
    doc_md=__doc__,
    render_template_as_native_obj=True,
+   is_paused_upon_creation=True,
    params={
        'queue_name': Param(DEFAULT_QUEUE_NAME, type="string"),
        'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string"),
@@ -841,17 +1132,18 @@ with DAG(
        'service_port': Param(DEFAULT_YT_AUTH_SERVICE_PORT, type="integer"),
        'account_pool': Param('default_account', type="string"),
        'account_pool_size': Param(None, type=["integer", "null"]),
+       'prepend_client_to_account': Param(True, type="boolean", title="[Worker Param] Prepend Client to Account", description="If True, prepends client and timestamp to account names in prefix mode."),
        'machine_id': Param(None, type=["string", "null"]),
-       'clients': Param('web', type="string"),
+       'assigned_proxy_url': Param(None, type=["string", "null"], title="[Worker Param] Assigned Proxy URL", description="If provided, forces the token service to use this specific proxy for the request."),
+       'clients': Param('mweb', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
        'timeout': Param(DEFAULT_TIMEOUT, type="integer"),
-       'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string"),
-       'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string"),
-       'on_bannable_failure': Param('retry_with_new_account', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error']),
-       'retry_on_probe_failure': Param(False, type="boolean"),
+       'on_bannable_failure': Param('stop_loop_on_auth_proceed_on_download_error', type="string", enum=['stop_loop', 'retry_with_new_account', 'retry_without_ban', 'retry_and_ban_account_only', 'retry_on_connection_error', 'proceed_loop_under_manual_inspection', 'stop_loop_on_auth_proceed_on_download_error']),
+       'request_params_json': Param(json.dumps(DEFAULT_REQUEST_PARAMS), type="string", title="[Worker Param] Request Params JSON", description="JSON string with request parameters for the token service."),
        'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean"),
-       # Internal params passed from dispatcher
-       'url_to_process': Param(None, type=["string", "null"]),
-       'worker_queue': Param(None, type=["string", "null"]),
+       # --- Manual Run / Internal Parameters ---
+       'manual_url_to_process': Param('iPwdia3gAnk', type=["string", "null"], title="[Manual Run] URL to Process", description="For manual runs, provide a single YouTube URL to process. This is ignored if triggered by the dispatcher."),
+       'url_to_process': Param(None, type=["string", "null"], title="[Internal] URL from Dispatcher", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
+       'worker_queue': Param(None, type=["string", "null"], title="[Internal] Worker Queue", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
    }
) as dag:
    initial_data = get_url_and_assign_account()
@@ -863,6 +1155,7 @@ with DAG(
    report_failure_task = report_failure_and_continue()
    continue_loop_task = continue_processing_loop()
    age_restriction_task = handle_age_restriction_error()
+   report_bannable_and_continue_task = report_bannable_and_continue()

    # --- Task Group 1: Initial Attempt ---
    with TaskGroup("initial_attempt", tooltip="Initial token acquisition attempt") as initial_attempt_group:
@@ -878,7 +1171,7 @@ with DAG(
        )

        first_token_attempt >> initial_branch_task
-       initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, age_restriction_task]
+       initial_branch_task >> [fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task]

    # --- Task Group 2: Retry Logic ---
    with TaskGroup("retry_logic", tooltip="Retry logic with account management") as retry_logic_group:
@@ -928,42 +1221,40 @@ with DAG(
        direct_retry_account_task >> coalesced_retry_data
        coalesced_retry_data >> retry_token_task
        retry_token_task >> retry_branch_task
-       retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, age_restriction_task]
+       retry_branch_task >> [fatal_error_task, report_failure_task, ban_after_retry_report_task, age_restriction_task, report_bannable_and_continue_task]
        ban_after_retry_report_task >> report_failure_task

-   # --- Task Group 3: Download and Processing ---
-   with TaskGroup("download_processing", tooltip="Download and media processing") as download_processing_group:
-       # Coalesce, download, and success tasks
+   # --- Task Group 3: Success/Continuation Logic ---
+   with TaskGroup("success_and_continuation", tooltip="Push to DL queue and continue loop") as success_group:
        token_data = coalesce_token_data(
            get_token_result=first_token_attempt,
            retry_get_token_result=retry_token_task
        )
-       download_task = download_and_probe(token_data=token_data)
-       download_branch_task = handle_download_failure_branch.override(trigger_rule='one_failed')()
-       success_task = mark_url_as_success(
+       list_formats_task = list_available_formats(token_data=token_data)
+       success_task = push_auth_success_to_redis(
            initial_data=initial_data,
-           downloaded_file_path=download_task,
            token_data=token_data
        )

-       # Internal dependencies within download group
        first_token_attempt >> token_data
        retry_token_task >> token_data
-       token_data >> download_task
-       download_task >> download_branch_task
-       download_branch_task >> report_failure_task
-       download_task >> success_task
+       token_data >> list_formats_task >> success_task
        success_task >> continue_loop_task

    # --- DAG Dependencies between TaskGroups ---
    # Initial attempt can lead to retry logic or direct failure
-   initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, age_restriction_task]
+   initial_branch_task >> [retry_logic_group, fatal_error_task, ban_and_report_immediately_task, age_restriction_task, report_bannable_and_continue_task]

-   # Retry logic leads to download processing on success or failure reporting on failure
-   retry_branch_task >> [download_processing_group, report_failure_task]
+   # A successful initial attempt bypasses retry and goes straight to the success group
+   initial_attempt_group >> success_group

+   # Retry logic leads to success/continuation on success or failure reporting on failure
+   retry_branch_task >> [report_failure_task] # Handled within the group
+   retry_logic_group >> success_group

    # Ban and report immediately leads to failure reporting
    ban_and_report_immediately_task >> report_failure_task

    # Age restriction error leads to failure reporting and continues the loop
    age_restriction_task >> continue_loop_task
+   report_bannable_and_continue_task >> continue_loop_task
895  airflow/dags/ytdlp_ops_v02_worker_per_url_dl.py  Normal file
@@ -0,0 +1,895 @@
# -*- coding: utf-8 -*-
|
||||||
|
# vim:fenc=utf-8
|
||||||
|
#
|
||||||
|
# Copyright © 2024 rl <rl@rlmbp>
|
||||||
|
#
|
||||||
|
# Distributed under terms of the MIT license.
|
||||||
|
|
||||||
|
"""
|
||||||
|
DAG for downloading a single YouTube URL based on pre-fetched token data.
|
||||||
|
This is the "Download Worker" part of a separated Auth/Download pattern.
|
||||||
|
It receives a job payload with all necessary token info and handles only the
|
||||||
|
downloading and probing of media files.
|
||||||
|
"""
|
||||||
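# Illustrative shape of the job payload this DAG expects in its run conf
# (a sketch only; field names are taken from how job_data/token_data are used
# below, and the dispatcher may include additional keys):
#
#   {
#       "original_url": "https://www.youtube.com/watch?v=...",
#       "info_json_path": "/path/to/<video>.info.json",
#       "socks_proxy": "socks5://...",
#       "successful_account_id": "...",
#       "clients": "mweb,web_camoufox,tv"
#   }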
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from airflow.decorators import task, task_group
|
||||||
|
from airflow.exceptions import AirflowException, AirflowSkipException
|
||||||
|
from airflow.models import Variable
|
||||||
|
from airflow.models.dag import DAG
|
||||||
|
from airflow.models.param import Param
|
||||||
|
from airflow.models.xcom_arg import XComArg
|
||||||
|
from airflow.operators.dummy import DummyOperator
|
||||||
|
from airflow.utils.dates import days_ago
|
||||||
|
from airflow.utils.task_group import TaskGroup
|
||||||
|
from airflow.api.common.trigger_dag import trigger_dag
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import concurrent.futures
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import redis
|
||||||
|
import socket
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
# Import utility functions and Thrift modules
|
||||||
|
from utils.redis_utils import _get_redis_client
|
||||||
|
from pangramia.yt.common.ttypes import TokenUpdateMode, AirflowLogContext
|
||||||
|
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
|
||||||
|
from pangramia.yt.tokens_ops import YTTokenOpService
|
||||||
|
from thrift.protocol import TBinaryProtocol
|
||||||
|
from thrift.transport import TSocket, TTransport
|
||||||
|
from thrift.transport.TTransport import TTransportException
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Client Stats Helper ---
|
||||||
|
|
||||||
|
def _update_client_stats(redis_client, clients_str: str, status: str, url: str, machine_id: str, dag_run_id: str):
|
||||||
|
"""Updates success/failure statistics for a client type in Redis."""
|
||||||
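    # Resulting Redis layout (sketch derived from the writes below): the hash
    # "client_stats" is keyed by the primary client (e.g. "mweb"), and each value
    # is a JSON blob like:
    #   {"success_count": 12, "failure_count": 3,
    #    "latest_success": {"timestamp": ..., "url": ..., "machine_id": ..., "dag_run_id": ...},
    #    "latest_failure": {...}}
    # (counts shown are illustrative)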
|
if not clients_str:
|
||||||
|
logger.warning("Cannot update client stats: 'clients' string is empty.")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Assumption: The service tries clients in the order provided.
|
||||||
|
# We attribute the result to the first client in the list.
|
||||||
|
primary_client = clients_str.split(',')[0].strip()
|
||||||
|
if not primary_client:
|
||||||
|
logger.warning("Cannot update client stats: could not determine primary client.")
|
||||||
|
return
|
||||||
|
|
||||||
|
stats_key = "client_stats"
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Using a pipeline with WATCH for safe concurrent updates.
|
||||||
|
with redis_client.pipeline() as pipe:
|
||||||
|
pipe.watch(stats_key)
|
||||||
|
|
||||||
|
current_stats_json = redis_client.hget(stats_key, primary_client)
|
||||||
|
stats = {}
|
||||||
|
if current_stats_json:
|
||||||
|
try:
|
||||||
|
stats = json.loads(current_stats_json)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logger.warning(f"Could not parse existing stats for client '{primary_client}'. Resetting stats.")
|
||||||
|
stats = {}
|
||||||
|
|
||||||
|
stats.setdefault('success_count', 0)
|
||||||
|
stats.setdefault('failure_count', 0)
|
||||||
|
|
||||||
|
details = {
|
||||||
|
'timestamp': time.time(), 'url': url,
|
||||||
|
'machine_id': machine_id, 'dag_run_id': dag_run_id,
|
||||||
|
}
|
||||||
|
|
||||||
|
if status == 'success':
|
||||||
|
stats['success_count'] += 1
|
||||||
|
stats['latest_success'] = details
|
||||||
|
elif status == 'failure':
|
||||||
|
stats['failure_count'] += 1
|
||||||
|
stats['latest_failure'] = details
|
||||||
|
|
||||||
|
pipe.multi()
|
||||||
|
pipe.hset(stats_key, primary_client, json.dumps(stats))
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
logger.info(f"Successfully updated '{status}' stats for client '{primary_client}'.")
|
||||||
|
|
||||||
|
except redis.exceptions.WatchError:
|
||||||
|
logger.warning(f"WatchError updating stats for client '{primary_client}'. Another process updated it. Skipping this update.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to update client stats for '{primary_client}': {e}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Default settings from Airflow Variables or hardcoded fallbacks
|
||||||
|
DEFAULT_QUEUE_NAME = 'queue2_dl'
|
||||||
|
DEFAULT_REDIS_CONN_ID = 'redis_default'
|
||||||
|
DEFAULT_TIMEOUT = 3600
|
||||||
|
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
|
||||||
|
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
|
||||||
|
|
||||||
|
# The queue is set to a fallback here. The actual worker-specific queue is
|
||||||
|
# assigned just-in-time by the task_instance_mutation_hook (see: airflow/config/custom_task_hooks.py),
|
||||||
|
# which parses the target queue from the DAG run_id.
|
||||||
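# Example (illustrative run_id; the real value is generated by the dispatcher):
#   run_id "worker_run_20240101T000000_q_queue-dl-dl001" -> queue "queue-dl-dl001",
# i.e. everything after the last "_q_" marker, mirroring the check in
# get_download_job_from_conf() below.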
|
DEFAULT_ARGS = {
|
||||||
|
'owner': 'airflow',
|
||||||
|
'retries': 0,
|
||||||
|
'queue': 'queue-dl', # Fallback queue. Will be overridden by the policy hook.
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
|
||||||
|
def _extract_video_id(url):
|
||||||
|
"""Extracts YouTube video ID from URL."""
|
||||||
|
if not url or not isinstance(url, str):
|
||||||
|
return None
|
||||||
|
patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
|
||||||
|
for pattern in patterns:
|
||||||
|
match = re.search(pattern, url)
|
||||||
|
if match:
|
||||||
|
return match.group(1)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# TASK DEFINITIONS (TaskFlow API)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@task
|
||||||
|
def get_download_job_from_conf(**context):
|
||||||
|
"""
|
||||||
|
Gets the download job details (which includes token data) from the DAG run conf.
|
||||||
|
This is the first task in the download worker DAG.
|
||||||
|
"""
|
||||||
|
params = context['params']
|
||||||
|
ti = context['task_instance']
|
||||||
|
|
||||||
|
# --- Worker Pinning Verification ---
|
||||||
|
# This is a safeguard against a known Airflow issue where clearing a task
|
||||||
|
# can cause the task_instance_mutation_hook to be skipped, breaking pinning.
|
||||||
|
# See: https://github.com/apache/airflow/issues/20143
|
||||||
|
expected_queue = None
|
||||||
|
if ti.run_id and '_q_' in ti.run_id:
|
||||||
|
expected_queue = ti.run_id.split('_q_')[-1]
|
||||||
|
|
||||||
|
if not expected_queue:
|
||||||
|
# Fallback to conf if run_id parsing fails for some reason
|
||||||
|
expected_queue = params.get('worker_queue')
|
||||||
|
|
||||||
|
if expected_queue and ti.queue != expected_queue:
|
||||||
|
error_msg = (
|
||||||
|
f"WORKER PINNING FAILURE: Task is running on queue '{ti.queue}' but was expected on '{expected_queue}'. "
|
||||||
|
"This usually happens after manually clearing a task, which is not the recommended recovery method for this DAG. "
|
||||||
|
"To recover a failed URL, let the DAG run fail, use the 'ytdlp_mgmt_queues' DAG to requeue the URL, "
|
||||||
|
"and use the 'ytdlp_ops_orchestrator' to start a new worker loop if needed."
|
||||||
|
)
|
||||||
|
logger.error(error_msg)
|
||||||
|
raise AirflowException(error_msg)
|
||||||
|
elif expected_queue:
|
||||||
|
logger.info(f"Worker pinning verified. Task is correctly running on queue '{ti.queue}'.")
|
||||||
|
# --- End Verification ---
|
||||||
|
|
||||||
|
# The job data is passed by the dispatcher DAG via 'job_data'.
|
||||||
|
job_data = params.get('job_data')
|
||||||
|
if not job_data:
|
||||||
|
raise AirflowException("No job_data provided in DAG run configuration.")
|
||||||
|
|
||||||
|
# If job_data is a string, parse it as JSON
|
||||||
|
if isinstance(job_data, str):
|
||||||
|
try:
|
||||||
|
job_data = json.loads(job_data)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
raise AirflowException(f"Could not decode job_data JSON: {job_data}")
|
||||||
|
|
||||||
|
url_to_process = job_data.get('original_url')
|
||||||
|
if not url_to_process:
|
||||||
|
raise AirflowException("'original_url' not found in job_data.")
|
||||||
|
|
||||||
|
logger.info(f"Received job for URL '{url_to_process}'.")
|
||||||
|
|
||||||
|
# Mark the URL as in-progress in Redis
|
||||||
|
try:
|
||||||
|
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
|
||||||
|
queue_name = params.get('queue_name', DEFAULT_QUEUE_NAME)
|
||||||
|
progress_queue = f"{queue_name}_progress"
|
||||||
|
client = _get_redis_client(redis_conn_id)
|
||||||
|
|
||||||
|
progress_data = {
|
||||||
|
'status': 'in_progress',
|
||||||
|
'start_time': time.time(),
|
||||||
|
'dag_run_id': context['dag_run'].run_id,
|
||||||
|
'hostname': socket.gethostname(),
|
||||||
|
}
|
||||||
|
client.hset(progress_queue, url_to_process, json.dumps(progress_data))
|
||||||
|
logger.info(f"Marked URL '{url_to_process}' as in-progress.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not mark URL as in-progress in Redis: {e}", exc_info=True)
|
||||||
|
|
||||||
|
return job_data
|
||||||
|
|
||||||
|
@task
|
||||||
|
def list_available_formats(token_data: dict, **context):
|
||||||
|
"""
|
||||||
|
Lists available formats for the given video using the info.json.
|
||||||
|
This is for debugging and informational purposes.
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import shlex
|
||||||
|
|
||||||
|
info_json_path = token_data.get('info_json_path')
|
||||||
|
if not (info_json_path and os.path.exists(info_json_path)):
|
||||||
|
logger.warning(f"Cannot list formats: info.json path is missing or file does not exist ({info_json_path}).")
|
||||||
|
return []
|
||||||
|
|
||||||
|
try:
|
||||||
|
cmd = [
|
||||||
|
'yt-dlp',
|
||||||
|
'--verbose',
|
||||||
|
'--list-formats',
|
||||||
|
'--load-info-json', info_json_path,
|
||||||
|
]
|
||||||
|
|
||||||
|
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
|
||||||
|
logger.info(f"Executing yt-dlp command to list formats: {copy_paste_cmd}")
|
||||||
|
|
||||||
|
process = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
|
||||||
|
|
||||||
|
if process.stderr:
|
||||||
|
logger.info(f"yt-dlp --list-formats STDERR:\n{process.stderr}")
|
||||||
|
|
||||||
|
if process.returncode != 0:
|
||||||
|
logger.error(f"yt-dlp --list-formats failed with exit code {process.returncode}")
|
||||||
|
|
||||||
|
available_formats = []
|
||||||
|
if process.stdout:
|
||||||
|
logger.info(f"--- Available Formats ---\n{process.stdout}\n--- End of Formats ---")
|
||||||
|
# Parse the output to get format IDs
|
||||||
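        # The format table printed by yt-dlp looks roughly like (illustrative):
        #   ID  EXT  RESOLUTION ...
        #   140 m4a  audio only ...
        # The loop below skips ahead to the "ID" header row and then collects the
        # leading format ID of each subsequent row that starts with a digit.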
|
lines = process.stdout.split('\n')
|
||||||
|
header_found = False
|
||||||
|
for line in lines:
|
||||||
|
if line.startswith('ID '):
|
||||||
|
header_found = True
|
||||||
|
continue
|
||||||
|
if header_found and line.strip() and line.strip()[0].isdigit():
|
||||||
|
format_id = line.split()[0]
|
||||||
|
available_formats.append(format_id)
|
||||||
|
logger.info(f"Parsed available format IDs: {available_formats}")
|
||||||
|
|
||||||
|
return available_formats
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"An error occurred while trying to list formats: {e}", exc_info=True)
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
@task
|
||||||
|
def download_and_probe(token_data: dict, available_formats: list[str], **context):
|
||||||
|
"""
|
||||||
|
Uses retrieved token data to download and probe media files.
|
||||||
|
Supports parallel downloading of specific, comma-separated format IDs.
|
||||||
|
If probing fails, retries downloading only the failed files.
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import shlex
|
||||||
|
import concurrent.futures
|
||||||
|
|
||||||
|
params = context['params']
|
||||||
|
info_json_path = token_data.get('info_json_path')
|
||||||
|
proxy = token_data.get('socks_proxy')
|
||||||
|
original_url = token_data.get('original_url')
|
||||||
|
|
||||||
|
if not (info_json_path and os.path.exists(info_json_path)):
|
||||||
|
raise AirflowException(f"Error: info.json path is missing or file does not exist ({info_json_path}).")
|
||||||
|
|
||||||
|
download_dir = os.path.dirname(info_json_path)
|
||||||
|
|
||||||
|
format_preset = params.get('download_format_preset', 'best_audio')
|
||||||
|
if format_preset == 'custom':
|
||||||
|
download_format = params.get('download_format_custom')
|
||||||
|
if not download_format:
|
||||||
|
raise AirflowException("Format preset is 'custom' but no custom format string was provided.")
|
||||||
|
elif format_preset == 'best_audio':
|
||||||
|
download_format = 'ba[ext=m4a]/bestaudio/best'
|
||||||
|
elif format_preset == 'formats_0':
|
||||||
|
download_format = '18,140'
|
||||||
|
elif format_preset == 'formats_2':
|
||||||
|
download_format = '18,140,299/298/137/136/135/134/133'
|
||||||
|
elif format_preset == 'formats_3':
|
||||||
|
download_format = '18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318'
|
||||||
|
else:
|
||||||
|
download_format = 'ba[ext=m4a]/bestaudio/best'
|
||||||
|
|
||||||
|
output_template = params.get('output_path_template', "%(title)s [%(id)s].f%(format_id)s.%(ext)s")
|
||||||
|
full_output_path = os.path.join(download_dir, output_template)
|
||||||
|
retry_on_probe_failure = params.get('retry_on_probe_failure', False)
|
||||||
|
|
||||||
|
def run_yt_dlp_command(format_selector: str):
|
||||||
|
"""Constructs and runs a yt-dlp command, returning a list of final filenames."""
|
||||||
|
cmd = [
|
||||||
|
'yt-dlp', '--verbose', '--print-traffic', '--load-info-json', info_json_path,
|
||||||
|
'-f', format_selector, '-o', full_output_path,
|
||||||
|
'--print', 'filename', '--continue', '--no-progress', '--no-simulate',
|
||||||
|
'--no-write-info-json', '--ignore-errors', '--no-playlist',
|
||||||
|
]
|
||||||
|
|
||||||
|
if params.get('fragment_retries'):
|
||||||
|
cmd.extend(['--fragment-retries', str(params['fragment_retries'])])
|
||||||
|
if params.get('limit_rate'):
|
||||||
|
cmd.extend(['--limit-rate', params['limit_rate']])
|
||||||
|
if params.get('socket_timeout'):
|
||||||
|
cmd.extend(['--socket-timeout', str(params['socket_timeout'])])
|
||||||
|
if params.get('min_sleep_interval'):
|
||||||
|
cmd.extend(['--min-sleep-interval', str(params['min_sleep_interval'])])
|
||||||
|
if params.get('max_sleep_interval'):
|
||||||
|
cmd.extend(['--max-sleep-interval', str(params['max_sleep_interval'])])
|
||||||
|
if params.get('yt_dlp_test_mode'):
|
||||||
|
cmd.append('--test')
|
||||||
|
|
||||||
|
downloader = params.get('downloader', 'default')
|
||||||
|
if proxy and not (downloader == 'aria2c' and proxy.startswith('socks5://')):
|
||||||
|
cmd.extend(['--proxy', proxy])
|
||||||
|
|
||||||
|
gost_process = None
|
||||||
|
try:
|
||||||
|
if downloader == 'aria2c':
|
||||||
|
cmd.extend(['--downloader', 'aria2c'])
|
||||||
|
downloader_args = params.get('downloader_args_aria2c')
|
||||||
|
if proxy and proxy.startswith('socks5://'):
|
||||||
|
import socket
|
||||||
|
from contextlib import closing
|
||||||
|
def find_free_port():
|
||||||
|
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
|
||||||
|
s.bind(('', 0))
|
||||||
|
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||||
|
return s.getsockname()[1]
|
||||||
|
local_port = find_free_port()
|
||||||
|
http_proxy = f"http://127.0.0.1:{local_port}"
|
||||||
|
logger.info(f"Starting gost for format '{format_selector}' to forward {proxy} to {http_proxy}")
|
||||||
|
gost_cmd = ['gost', '-L', f'http://127.0.0.1:{local_port}', '-F', proxy]
|
||||||
|
gost_process = subprocess.Popen(gost_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
|
time.sleep(1)
|
||||||
|
if gost_process.poll() is not None:
|
||||||
|
stdout, stderr = gost_process.communicate()
|
||||||
|
logger.error(f"gost failed to start. Exit: {gost_process.returncode}. Stdout: {stdout.decode()}. Stderr: {stderr.decode()}")
|
||||||
|
raise AirflowException("gost proxy tunnel failed to start.")
|
||||||
|
user_args = downloader_args[len('aria2c:'):] if downloader_args and downloader_args.startswith('aria2c:') else (downloader_args or "")
|
||||||
|
final_args_str = f'aria2c:{user_args.strip()} --http-proxy={http_proxy}'
|
||||||
|
cmd.extend(['--downloader-args', final_args_str])
|
||||||
|
elif downloader_args:
|
||||||
|
cmd.extend(['--downloader-args', downloader_args])
|
||||||
|
|
||||||
|
extra_args = params.get('yt_dlp_extra_args')
|
||||||
|
if extra_args:
|
||||||
|
cmd.extend(shlex.split(extra_args))
|
||||||
|
if original_url:
|
||||||
|
cmd.append(original_url)
|
||||||
|
|
||||||
|
copy_paste_cmd = ' '.join(shlex.quote(arg) for arg in cmd)
|
||||||
|
logger.info(f"Executing yt-dlp command for format '{format_selector}': {copy_paste_cmd}")
|
||||||
|
process = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
|
||||||
|
|
||||||
|
if process.stdout:
|
||||||
|
logger.info(f"yt-dlp STDOUT for format '{format_selector}':\n{process.stdout}")
|
||||||
|
if process.stderr:
|
||||||
|
# yt-dlp often prints progress and informational messages to stderr
|
||||||
|
logger.info(f"yt-dlp STDERR for format '{format_selector}':\n{process.stderr}")
|
||||||
|
|
||||||
|
if process.returncode != 0:
|
||||||
|
logger.error(f"yt-dlp failed for format '{format_selector}' with exit code {process.returncode}")
|
||||||
|
# STDOUT and STDERR are already logged above.
|
||||||
|
raise AirflowException(f"yt-dlp command failed for format '{format_selector}'.")
|
||||||
|
|
||||||
|
# In test mode, files are not created, so we only check that yt-dlp returned filenames.
|
||||||
|
# Otherwise, we verify that the files actually exist on disk.
|
||||||
|
output_files = [f for f in process.stdout.strip().split('\n') if f]
|
||||||
|
if not params.get('yt_dlp_test_mode'):
|
||||||
|
output_files = [f for f in output_files if os.path.exists(f)]
|
||||||
|
|
||||||
|
if not output_files:
|
||||||
|
log_msg = (f"Test run for format '{format_selector}' did not produce any filenames."
|
||||||
|
if params.get('yt_dlp_test_mode') else
|
||||||
|
f"Download for format '{format_selector}' finished but no output files exist.")
|
||||||
|
exc_msg = (f"Test run for format '{format_selector}' did not produce any filenames."
|
||||||
|
if params.get('yt_dlp_test_mode') else
|
||||||
|
f"Download for format '{format_selector}' did not produce a file.")
|
||||||
|
|
||||||
|
logger.error(log_msg)
|
||||||
|
logger.error(f"Full STDOUT:\n{process.stdout}")
|
||||||
|
logger.error(f"Full STDERR:\n{process.stderr}")
|
||||||
|
raise AirflowException(exc_msg)
|
||||||
|
|
||||||
|
log_prefix = "SUCCESS (Test Mode):" if params.get('yt_dlp_test_mode') else "SUCCESS:"
|
||||||
|
logger.info(f"{log_prefix} Command for format '{format_selector}' complete. Files: {output_files}")
|
||||||
|
return output_files
|
||||||
|
finally:
|
||||||
|
if gost_process:
|
||||||
|
logger.info(f"Terminating gost process (PID: {gost_process.pid}) for format '{format_selector}'.")
|
||||||
|
gost_process.terminate()
|
||||||
|
try:
|
||||||
|
gost_process.wait(timeout=5)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
gost_process.kill()
|
||||||
|
gost_process.wait()
|
||||||
|
|
||||||
|
def run_ffmpeg_probe(filename):
|
||||||
|
"""Probes a file with ffmpeg to check for corruption."""
|
||||||
|
logger.info(f"Probing downloaded file: {filename}")
|
||||||
|
try:
|
||||||
|
subprocess.run(['ffmpeg', '-v', 'error', '-i', filename, '-f', 'null', '-'], check=True, capture_output=True, text=True)
|
||||||
|
logger.info(f"SUCCESS: Probe confirmed valid media file: {filename}")
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
logger.error(f"ffmpeg probe failed for '{filename}'. File may be corrupt.")
|
||||||
|
logger.error(f"ffmpeg STDERR: {e.stderr}")
|
||||||
|
raise AirflowException(f"ffmpeg probe failed for {filename}.")
|
||||||
|
|
||||||
|
def _download_and_probe_formats(formats_to_process: list[str] | str):
|
||||||
|
"""
|
||||||
|
Helper to download a list of format IDs (or a single complex selector) and probe the results.
|
||||||
|
Returns a tuple of (successful_files, failed_probe_files).
|
||||||
|
"""
|
||||||
|
all_downloaded_files = []
|
||||||
|
delay_between_formats = params.get('delay_between_formats_s', 0)
|
||||||
|
|
||||||
|
if isinstance(formats_to_process, list) and formats_to_process:
|
||||||
|
logger.info(f"Downloading {len(formats_to_process)} format(s) sequentially: {formats_to_process}")
|
||||||
|
for i, fid in enumerate(formats_to_process):
|
||||||
|
all_downloaded_files.extend(run_yt_dlp_command(fid))
|
||||||
|
if delay_between_formats > 0 and i < len(formats_to_process) - 1:
|
||||||
|
logger.info(f"Waiting {delay_between_formats}s before next format download...")
|
||||||
|
time.sleep(delay_between_formats)
|
||||||
|
|
||||||
|
elif isinstance(formats_to_process, str):
|
||||||
|
logger.info(f"Using complex format selector '{formats_to_process}'. Running as a single command.")
|
||||||
|
all_downloaded_files = run_yt_dlp_command(formats_to_process)
|
||||||
|
|
||||||
|
if not all_downloaded_files:
|
||||||
|
logger.warning("Download process completed but produced no files.")
|
||||||
|
return [], []
|
||||||
|
|
||||||
|
if params.get('yt_dlp_test_mode'):
|
||||||
|
logger.info("Test mode is enabled. Skipping probe of output files.")
|
||||||
|
return all_downloaded_files, []
|
||||||
|
|
||||||
|
if params.get('skip_probe'):
|
||||||
|
logger.info("Skipping probe of output files as per configuration.")
|
||||||
|
return all_downloaded_files, []
|
||||||
|
|
||||||
|
successful_probes, failed_probes = [], []
|
||||||
|
logger.info(f"Probing {len(all_downloaded_files)} downloaded file(s) sequentially...")
|
||||||
|
for filename in all_downloaded_files:
|
||||||
|
try:
|
||||||
|
run_ffmpeg_probe(filename)
|
||||||
|
successful_probes.append(filename)
|
||||||
|
except Exception:
|
||||||
|
failed_probes.append(filename)
|
||||||
|
|
||||||
|
return successful_probes, failed_probes
|
||||||
|
|
||||||
|
# --- Main Execution Logic ---
|
||||||
|
with open(info_json_path, 'r', encoding='utf-8') as f:
|
||||||
|
info = json.load(f)
|
||||||
|
|
||||||
|
# Split the format string by commas to get a list of individual format selectors.
|
||||||
|
# This enables parallel downloads of different formats or format groups.
|
||||||
|
# For example, '18,140,299/298' becomes ['18', '140', '299/298'],
|
||||||
|
# and each item will be downloaded in a separate yt-dlp process.
|
||||||
|
if download_format and isinstance(download_format, str):
|
||||||
|
formats_to_download_initial = [selector.strip() for selector in download_format.split(',') if selector.strip()]
|
||||||
|
else:
|
||||||
|
# Fallback for safety, though download_format should always be a string.
|
||||||
|
formats_to_download_initial = []
|
||||||
|
|
||||||
|
if not formats_to_download_initial:
|
||||||
|
raise AirflowException("No valid download format selectors were found after parsing.")
|
||||||
|
|
||||||
|
# --- Filter requested formats against available formats ---
|
||||||
|
final_formats_to_download = []
|
||||||
|
if not available_formats:
|
||||||
|
logger.warning("List of available formats is empty. Will attempt to download all requested formats without validation.")
|
||||||
|
final_formats_to_download = formats_to_download_initial
|
||||||
|
else:
|
||||||
|
for selector in formats_to_download_initial:
|
||||||
|
# A selector can be '140' or '299/298/137'
|
||||||
|
individual_ids = re.split(r'[/+]', selector)
|
||||||
|
if any(fid in available_formats for fid in individual_ids):
|
||||||
|
final_formats_to_download.append(selector)
|
||||||
|
else:
|
||||||
|
logger.warning(f"Requested format selector '{selector}' contains no available formats. Skipping.")
|
||||||
|
|
||||||
|
if not final_formats_to_download:
|
||||||
|
raise AirflowException("None of the requested formats are available for this video.")
|
||||||
|
|
||||||
|
# --- Initial Download and Probe ---
|
||||||
|
successful_files, failed_files = _download_and_probe_formats(final_formats_to_download)
|
||||||
|
|
||||||
|
if params.get('yt_dlp_test_mode'):
|
||||||
|
logger.info(f"Test mode: yt-dlp returned {len(successful_files)} filenames. Skipping probe failure checks.")
|
||||||
|
if not successful_files:
|
||||||
|
raise AirflowException("Test run did not produce any filenames.")
|
||||||
|
return successful_files
|
||||||
|
|
||||||
|
if not failed_files:
|
||||||
|
if not successful_files:
|
||||||
|
raise AirflowException("Download and probe process completed but produced no valid files.")
|
||||||
|
return successful_files
|
||||||
|
|
||||||
|
# --- Handle Probe Failures and Retry ---
|
||||||
|
if not retry_on_probe_failure:
|
||||||
|
raise AirflowException(f"Probe failed for {len(failed_files)} file(s) and retry is disabled: {failed_files}")
|
||||||
|
|
||||||
|
logger.warning(f"Probe failed for {len(failed_files)} file(s). Attempting one re-download for failed files...")
|
||||||
|
|
||||||
|
format_ids_to_retry = []
|
||||||
|
# Since each download is now for a specific selector and the output template
|
||||||
|
# includes the format_id, we can always attempt to extract the format_id
|
||||||
|
# from the failed filename for a targeted retry.
|
||||||
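    # Example with the default output template "%(title)s [%(id)s].f%(format_id)s.%(ext)s":
    # a corrupt "Some Title [abc123].f140.m4a" yields format_id "140" for the retry
    # (filename is illustrative; only the ".f<id>." marker is significant here).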
|
for f in failed_files:
|
||||||
|
match = re.search(r'\.f([\d]+)\.', f)
|
||||||
|
if match:
|
||||||
|
format_ids_to_retry.append(match.group(1))
|
||||||
|
else:
|
||||||
|
logger.error(f"Could not extract format_id from failed file '{f}'. Cannot retry this specific file.")
|
||||||
|
formats_to_download_retry = format_ids_to_retry
|
||||||
|
|
||||||
|
if not formats_to_download_retry:
|
||||||
|
raise AirflowException("Probe failed, but could not determine which formats to retry.")
|
||||||
|
|
||||||
|
# Rename failed files to allow for a fresh download attempt
|
||||||
|
for f in failed_files:
|
||||||
|
try:
|
||||||
|
failed_path = f"{f}.probe_failed_{int(time.time())}"
|
||||||
|
os.rename(f, failed_path)
|
||||||
|
logger.info(f"Renamed corrupted file to {failed_path}")
|
||||||
|
except OSError as rename_err:
|
||||||
|
logger.error(f"Could not rename corrupted file '{f}': {rename_err}")
|
||||||
|
|
||||||
|
# --- Retry Download and Probe ---
|
||||||
|
retried_successful_files, retried_failed_files = _download_and_probe_formats(formats_to_download_retry)
|
||||||
|
|
||||||
|
if retried_failed_files:
|
||||||
|
logger.error(f"Probe failed again for {len(retried_failed_files)} file(s) after retry: {retried_failed_files}")
|
||||||
|
|
||||||
|
final_success_list = successful_files + retried_successful_files
|
||||||
|
if not final_success_list:
|
||||||
|
raise AirflowException("All files failed to download or probe correctly, even after retry.")
|
||||||
|
|
||||||
|
logger.info(f"Retry complete. Final success count: {len(final_success_list)} file(s).")
|
||||||
|
|
||||||
|
if params.get('yt_dlp_cleanup_mode', True):
|
||||||
|
logger.info(f"Cleanup mode is enabled. Creating .empty files and deleting originals for {len(final_success_list)} files.")
|
||||||
|
for f in final_success_list:
|
||||||
|
try:
|
||||||
|
empty_file_path = f"{f}.empty"
|
||||||
|
with open(empty_file_path, 'w') as fp:
|
||||||
|
pass # create empty file
|
||||||
|
logger.info(f"Created empty file: {empty_file_path}")
|
||||||
|
os.remove(f)
|
||||||
|
logger.info(f"Deleted original file: {f}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error during cleanup for file {f}: {e}", exc_info=True)
|
||||||
|
# Do not fail the task for a cleanup error, just log it.
|
||||||
|
|
||||||
|
return final_success_list
|
||||||
|
|
||||||
|
@task
|
||||||
|
def mark_url_as_success(job_data: dict, downloaded_file_paths: list, **context):
|
||||||
|
"""Records the successful download result in Redis."""
|
||||||
|
params = context['params']
|
||||||
|
url = job_data['original_url']
|
||||||
|
result_data = {
|
||||||
|
'status': 'success', 'end_time': time.time(), 'url': url,
|
||||||
|
'downloaded_file_paths': downloaded_file_paths, **job_data,
|
||||||
|
'dag_run_id': context['dag_run'].run_id,
|
||||||
|
}
|
||||||
|
client = _get_redis_client(params['redis_conn_id'])
|
||||||
|
|
||||||
|
# Update activity counters
|
||||||
|
try:
|
||||||
|
proxy_url = job_data.get('socks_proxy')
|
||||||
|
account_id = job_data.get('successful_account_id')
|
||||||
|
now = time.time()
|
||||||
|
# Use a unique member to prevent collisions, e.g., dag_run_id
|
||||||
|
member = context['dag_run'].run_id
|
||||||
|
|
||||||
|
if proxy_url:
|
||||||
|
proxy_key = f"activity:per_proxy:{proxy_url}"
|
||||||
|
client.zadd(proxy_key, {member: now})
|
||||||
|
client.expire(proxy_key, 3600 * 2) # Expire after 2 hours
|
||||||
|
if account_id:
|
||||||
|
account_key = f"activity:per_account:{account_id}"
|
||||||
|
client.zadd(account_key, {member: now})
|
||||||
|
client.expire(account_key, 3600 * 2) # Expire after 2 hours
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not update activity counters: {e}", exc_info=True)
|
||||||
|
|
||||||
|
# Update client-specific stats
|
||||||
|
try:
|
||||||
|
machine_id = params.get('machine_id') or socket.gethostname()
|
||||||
|
clients_str = job_data.get('clients', params.get('clients', '')) # Prefer clients from job, fallback to params
|
||||||
|
_update_client_stats(client, clients_str, 'success', url, machine_id, context['dag_run'].run_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not update client stats on success: {e}", exc_info=True)
|
||||||
|
|
||||||
|
progress_queue = f"{params['queue_name']}_progress"
|
||||||
|
result_queue = f"{params['queue_name']}_result"
|
||||||
|
|
||||||
|
with client.pipeline() as pipe:
|
||||||
|
pipe.hset(result_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hdel(progress_queue, url)
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
logger.info(f"Stored success result for URL '{url}' and removed from progress queue.")
|
||||||
|
|
||||||
|
@task(trigger_rule='one_failed')
|
||||||
|
def report_failure_and_continue(**context):
|
||||||
|
"""
|
||||||
|
Handles a failed download attempt by recording an error report to Redis.
|
||||||
|
"""
|
||||||
|
params = context['params']
|
||||||
|
ti = context['task_instance']
|
||||||
|
|
||||||
|
job_data = params.get('job_data', {})
|
||||||
|
if isinstance(job_data, str):
|
||||||
|
try:
|
||||||
|
job_data = json.loads(job_data)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
job_data = {}
|
||||||
|
url = job_data.get('original_url', 'unknown')
|
||||||
|
|
||||||
|
# No token errors to collect, just report a generic download failure.
|
||||||
|
error_details = {'error_message': 'Download or probe stage failed.'}
|
||||||
|
|
||||||
|
logger.error(f"A failure occurred while processing URL '{url}'. Reporting to Redis.")
|
||||||
|
|
||||||
|
result_data = {
|
||||||
|
'status': 'failed',
|
||||||
|
'end_time': time.time(),
|
||||||
|
'url': url,
|
||||||
|
'dag_run_id': context['dag_run'].run_id,
|
||||||
|
'error_details': error_details
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
client = _get_redis_client(params['redis_conn_id'])
|
||||||
|
|
||||||
|
# Update client-specific stats
|
||||||
|
try:
|
||||||
|
machine_id = params.get('machine_id') or socket.gethostname()
|
||||||
|
clients_str = job_data.get('clients', params.get('clients', '')) # Prefer clients from job, fallback to params
|
||||||
|
_update_client_stats(client, clients_str, 'failure', url, machine_id, context['dag_run'].run_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not update client stats on failure: {e}", exc_info=True)
|
||||||
|
|
||||||
|
result_queue = f"{params['queue_name']}_result"
|
||||||
|
fail_queue = f"{params['queue_name']}_fail"
|
||||||
|
|
||||||
|
progress_queue = f"{params['queue_name']}_progress"
|
||||||
|
|
||||||
|
with client.pipeline() as pipe:
|
||||||
|
pipe.hset(result_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hset(fail_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hdel(progress_queue, url)
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
logger.info(f"Stored failure result for URL '{url}' in '{result_queue}' and '{fail_queue}' and removed from progress queue.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not report failure to Redis: {e}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
@task(trigger_rule='one_failed')
|
||||||
|
def handle_fatal_error(**context):
|
||||||
|
"""
|
||||||
|
Handles fatal, non-retryable errors (e.g., infrastructure issues).
|
||||||
|
This task reports the failure to Redis to ensure failed URLs are queued
|
||||||
|
for later reprocessing, but allows the processing loop to continue.
|
||||||
|
"""
|
||||||
|
params = context['params']
|
||||||
|
ti = context['task_instance']
|
||||||
|
|
||||||
|
job_data = params.get('job_data', {})
|
||||||
|
if isinstance(job_data, str):
|
||||||
|
try:
|
||||||
|
job_data = json.loads(job_data)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
job_data = {}
|
||||||
|
url = job_data.get('original_url', 'unknown')
|
||||||
|
|
||||||
|
error_details = {'error_message': 'Fatal error during download stage.'}
|
||||||
|
|
||||||
|
logger.error(f"A fatal, non-retryable error occurred for URL '{url}'. See previous task logs for details.")
|
||||||
|
|
||||||
|
# Report failure to Redis so the URL can be reprocessed later
|
||||||
|
try:
|
||||||
|
client = _get_redis_client(params['redis_conn_id'])
|
||||||
|
|
||||||
|
# Update client-specific stats
|
||||||
|
try:
|
||||||
|
machine_id = params.get('machine_id') or socket.gethostname()
|
||||||
|
clients_str = job_data.get('clients', params.get('clients', '')) # Prefer clients from job, fallback to params
|
||||||
|
_update_client_stats(client, clients_str, 'failure', url, machine_id, context['dag_run'].run_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not update client stats on fatal error: {e}", exc_info=True)
|
||||||
|
|
||||||
|
result_data = {
|
||||||
|
'status': 'failed',
|
||||||
|
'end_time': time.time(),
|
||||||
|
'url': url,
|
||||||
|
'dag_run_id': context['dag_run'].run_id,
|
||||||
|
'error': 'fatal_error',
|
||||||
|
'error_message': 'Fatal non-retryable error occurred',
|
||||||
|
'error_details': error_details
|
||||||
|
}
|
||||||
|
result_queue = f"{params['queue_name']}_result"
|
||||||
|
fail_queue = f"{params['queue_name']}_fail"
|
||||||
|
|
||||||
|
progress_queue = f"{params['queue_name']}_progress"
|
||||||
|
|
||||||
|
with client.pipeline() as pipe:
|
||||||
|
pipe.hset(result_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hset(fail_queue, url, json.dumps(result_data))
|
||||||
|
pipe.hdel(progress_queue, url)
|
||||||
|
pipe.execute()
|
||||||
|
|
||||||
|
logger.info(f"Stored fatal error result for URL '{url}' in '{result_queue}' and '{fail_queue}' for later reprocessing.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Could not report fatal error to Redis: {e}", exc_info=True)
|
||||||
|
|
||||||
|
# Do not fail the DAG run. Allow the processing loop to continue.
|
||||||
|
logger.warning("A fatal error was handled, but the DAG is configured to continue the processing loop.")
|
||||||
|
|
||||||
|
|
||||||
|
@task(trigger_rule='one_success')
|
||||||
|
def continue_processing_loop(**context):
|
||||||
|
"""
|
||||||
|
After a successful run, triggers a new dispatcher to continue the processing loop,
|
||||||
|
effectively asking for the next URL to be processed.
|
||||||
|
"""
|
||||||
|
params = context['params']
|
||||||
|
dag_run = context['dag_run']
|
||||||
|
|
||||||
|
# Do not continue the loop for manual runs of the worker DAG.
|
||||||
|
# A worker DAG triggered by the dispatcher will have a run_id starting with 'worker_run_'.
|
||||||
|
if not dag_run.run_id.startswith('worker_run_'):
|
||||||
|
logger.info(f"DAG run '{dag_run.run_id}' does not appear to be triggered by the dispatcher. Stopping processing loop.")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Create a new unique run_id for the dispatcher.
|
||||||
|
# Using a timestamp and UUID ensures the ID is unique and does not grow in length over time,
|
||||||
|
# preventing database errors.
|
||||||
|
new_dispatcher_run_id = f"retriggered_by_worker_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
|
||||||
|
|
||||||
|
# Pass all original parameters from the orchestrator through to the new dispatcher run.
|
||||||
|
conf_to_pass = {k: v for k, v in params.items() if v is not None}
|
||||||
|
|
||||||
|
# The new dispatcher will pull its own job data and determine its own queue, so we don't pass these.
|
||||||
|
conf_to_pass.pop('job_data', None)
|
||||||
|
conf_to_pass.pop('worker_queue', None)
|
||||||
|
|
||||||
|
logger.info(f"Worker finished successfully. Triggering a new dispatcher ('{new_dispatcher_run_id}') to continue the loop.")
|
||||||
|
trigger_dag(
|
||||||
|
dag_id='ytdlp_ops_v02_dispatcher_dl',
|
||||||
|
run_id=new_dispatcher_run_id,
|
||||||
|
conf=conf_to_pass,
|
||||||
|
replace_microseconds=False
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@task.branch(trigger_rule='one_failed')
|
||||||
|
def handle_download_failure_branch(**context):
|
||||||
|
"""If download or probe fails, routes to the standard failure reporting."""
|
||||||
|
logger.warning("Download or probe failed. Reporting failure and continuing loop.")
|
||||||
|
return 'report_failure_and_continue'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# DAG Definition with TaskGroups
|
||||||
|
# =============================================================================
|
||||||
|
with DAG(
|
||||||
|
dag_id='ytdlp_ops_v02_worker_per_url_dl',
|
||||||
|
default_args=DEFAULT_ARGS,
|
||||||
|
schedule=None,
|
||||||
|
start_date=days_ago(1),
|
||||||
|
catchup=False,
|
||||||
|
tags=['ytdlp', 'worker'],
|
||||||
|
doc_md=__doc__,
|
||||||
|
render_template_as_native_obj=True,
|
||||||
|
is_paused_upon_creation=True,
|
||||||
|
params={
|
||||||
|
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string"),
|
||||||
|
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string"),
|
||||||
|
'machine_id': Param(None, type=["string", "null"]),
|
||||||
|
'clients': Param('mweb,web_camoufox,tv', type="string", description="Comma-separated list of clients for token generation. e.g. mweb,tv,web_camoufox"),
|
||||||
|
'output_path_template': Param("%(title)s [%(id)s].f%(format_id)s.%(ext)s", type="string", title="[Worker Param] Output Path Template", description="Output filename template for yt-dlp. It is highly recommended to include `%(format_id)s` to prevent filename collisions when downloading multiple formats."),
|
||||||
|
'retry_on_probe_failure': Param(False, type="boolean"),
|
||||||
|
'skip_probe': Param(False, type="boolean", title="[Worker Param] Skip Probe", description="If True, skips the ffmpeg probe of downloaded files."),
|
||||||
|
'yt_dlp_cleanup_mode': Param(True, type="boolean", title="[Worker Param] yt-dlp Cleanup Mode", description="If True, creates a .empty file and deletes the original media file after successful download and probe."),
|
||||||
|
'delay_between_formats_s': Param(15, type="integer", title="[Worker Param] Delay Between Formats (s)", description="Delay in seconds between downloading each format when multiple formats are specified. A 22s wait may be effective for batch downloads, while 6-12s may suffice if cookies are refreshed regularly."),
|
||||||
|
'yt_dlp_test_mode': Param(False, type="boolean", title="[Worker Param] yt-dlp Test Mode", description="If True, runs yt-dlp with --test flag (dry run without downloading)."),
|
||||||
|
'fragment_retries': Param(10, type="integer", title="[Worker Param] Fragment Retries", description="Number of retries for a fragment before giving up."),
|
||||||
|
'limit_rate': Param('5M', type=["string", "null"], title="[Worker Param] Limit Rate", description="Download speed limit (e.g., 50K, 4.2M)."),
|
||||||
|
'socket_timeout': Param(15, type="integer", title="[Worker Param] Socket Timeout", description="Timeout in seconds for socket operations."),
|
||||||
|
'min_sleep_interval': Param(5, type="integer", title="[Worker Param] Min Sleep Interval", description="Minimum time to sleep between downloads (seconds)."),
|
||||||
|
'max_sleep_interval': Param(10, type="integer", title="[Worker Param] Max Sleep Interval", description="Maximum time to sleep between downloads (seconds)."),
|
||||||
|
'download_format_preset': Param(
|
||||||
|
'formats_2',
|
||||||
|
type="string",
|
||||||
|
enum=['best_audio', 'formats_0', 'formats_2', 'formats_3', 'custom'],
|
||||||
|
title="Download Format Preset",
|
||||||
|
description="Select a predefined format string or choose 'custom'. To download multiple formats, this should be a comma-separated list of format IDs (e.g., '137,140').\nformats_0: 18,140\nformats_2: 18,140,299/298/137/136/135/134/133\nformats_3: 18,599,139,140,141,160/269,133/229,134/230,135/231,136/232,137/270,298/311,299/318"
|
||||||
|
),
|
||||||
|
'download_format_custom': Param(
|
||||||
|
'ba[ext=m4a]/bestaudio/best',
|
||||||
|
type="string",
|
||||||
|
title="Custom Download Format",
|
||||||
|
description="Custom yt-dlp format string. Used when preset is 'custom'. To download multiple formats, provide a comma-separated list of format IDs (e.g., '137,140')."
|
||||||
|
),
|
||||||
|
'downloader': Param(
|
||||||
|
'default',
|
||||||
|
type="string",
|
||||||
|
enum=['default', 'aria2c'],
|
||||||
|
title="Downloader",
|
||||||
|
description="Choose the downloader for yt-dlp."
|
||||||
|
),
|
||||||
|
'downloader_args_aria2c': Param(
|
||||||
|
'aria2c:-x 4 -k 2M --max-download-limit=3M',
|
||||||
|
type="string",
|
||||||
|
title="Aria2c Downloader Arguments",
|
||||||
|
description="Arguments to pass to yt-dlp's --downloader-args. Used when downloader is 'aria2c'."
|
||||||
|
),
|
||||||
|
'yt_dlp_extra_args': Param(
|
||||||
|
'--no-part --restrict-filenames',
|
||||||
|
type=["string", "null"],
|
||||||
|
title="Extra yt-dlp arguments",
|
||||||
|
description="Extra command-line arguments for yt-dlp during download."
|
||||||
|
),
|
||||||
|
# --- Manual Run / Internal Parameters ---
|
||||||
|
'job_data': Param(None, type=["object", "string", "null"], title="[Internal] Job Data from Dispatcher", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
||||||
|
'worker_queue': Param(None, type=["string", "null"], title="[Internal] Worker Queue", description="This parameter is set by the dispatcher DAG and should not be used for manual runs."),
|
||||||
|
}
|
||||||
|
) as dag:
|
||||||
|
job_data = get_download_job_from_conf()
|
||||||
|
|
||||||
|
# --- Task Instantiation ---
|
||||||
|
|
||||||
|
# Main success/failure handlers
|
||||||
|
fatal_error_task = handle_fatal_error()
|
||||||
|
report_failure_task = report_failure_and_continue()
|
||||||
|
continue_loop_task = continue_processing_loop()
|
||||||
|
|
||||||
|
# --- Download and Processing Group ---
|
||||||
|
with TaskGroup("download_processing", tooltip="Download and media processing") as download_processing_group:
|
||||||
|
list_formats_task = list_available_formats(token_data=job_data)
|
||||||
|
download_task = download_and_probe(
|
||||||
|
token_data=job_data,
|
||||||
|
available_formats=list_formats_task,
|
||||||
|
)
|
||||||
|
download_branch_task = handle_download_failure_branch.override(trigger_rule='one_failed')()
|
||||||
|
success_task = mark_url_as_success(
|
||||||
|
job_data=job_data,
|
||||||
|
downloaded_file_paths=download_task,
|
||||||
|
)
|
||||||
|
|
||||||
|
list_formats_task >> download_task
|
||||||
|
download_task >> download_branch_task
|
||||||
|
download_branch_task >> report_failure_task
|
||||||
|
download_task >> success_task
|
||||||
|
success_task >> continue_loop_task
|
||||||
|
|
||||||
|
# If the initial job setup succeeds, proceed to the download group.
|
||||||
|
# If it fails, trigger the fatal error handler. This prevents fatal_error_task
|
||||||
|
# from being an "island" task that gets triggered by any other failure in the DAG.
|
||||||
|
job_data.operator >> download_processing_group
|
||||||
|
job_data.operator >> fatal_error_task
|
||||||
|
|
||||||
|
# Any failure path should continue the loop to process the next URL.
|
||||||
|
report_failure_task >> continue_loop_task
|
||||||
|
fatal_error_task >> continue_loop_task
|
||||||
9  ansible/MIGRATION.md  Normal file
@@ -0,0 +1,9 @@
# Migration Notes

This document tracks the process of migrating the Ansible deployment.

## Guiding Principles

- No changes to business logic or core functionality are permitted during this phase.
- The focus is solely on resolving file path issues, dependency errors, and structural inconsistencies resulting from the migration of a subset of files.
- All changes should be aimed at making the existing playbooks runnable in the new environment.
120  ansible/README-yt.md  Normal file
@@ -0,0 +1,120 @@
# Ansible-driven YT-DLP / Airflow Cluster – Quick-Start & Cheat-Sheet

> One playbook = one command to **deploy**, **update**, **restart**, or **re-configure** the entire cluster.

---

## 0. Prerequisites (run once on the **tower** server)

```
```

---

## 1. Ansible Vault Setup (run once on your **local machine**)

This project uses Ansible Vault to encrypt sensitive data like passwords and API keys. To run the playbooks, you need to provide the vault password. The recommended way is to create a file named `.vault_pass` in the root of the project directory.

1. **Create the Vault Password File:**
   From the project's root directory (e.g., `/opt/yt-ops-services`), create the file. The file should contain only your vault password on a single line.

   ```bash
   # Replace 'your_secret_password_here' with your actual vault password
   echo "your_secret_password_here" > .vault_pass
   ```

2. **Secure the File:**
   It's good practice to restrict permissions on this file so only you can read it.

   ```bash
   chmod 600 .vault_pass
   ```

The `ansible.cfg` file is configured to automatically look for this `.vault_pass` file in the project root.
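
For reference, the relevant `ansible.cfg` setting is assumed to be a `vault_password_file` entry pointing one level up (the file itself is not shown in this commit, so verify the exact value in your checkout):

```bash
# Run from inside ansible/ to confirm the assumed setting, e.g.
#   vault_password_file = ../.vault_pass
grep -n "vault_password_file" ansible.cfg
```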

---

## 1.5. Cluster & Inventory Management

The Ansible inventory (`ansible/inventory.ini`), host-specific variables (`ansible/host_vars/`), and the master `docker-compose.yaml` are dynamically generated from a central cluster definition file (e.g., `cluster.yml`).

**Whenever you add, remove, or change the IP of a node in your `cluster.yml`, you must re-run the generator script.**

1. **Install Script Dependencies (run once):**
   The generator script requires `PyYAML` and `Jinja2`. Install them using pip:

   ```bash
   pip3 install PyYAML Jinja2
   ```

2. **Edit Your Cluster Definition:**
   Modify your `cluster.yml` file (located in the project root) to define your master and worker nodes (see the hypothetical sketch at the end of this section).

3. **Run the Generator Script:**
   From the project's root directory, run the following command to update all generated files:

   ```bash
   # Make sure the script is executable first: chmod +x tools/generate-inventory.py
   ./tools/generate-inventory.py cluster.yml
   ```

This ensures that Ansible has the correct host information and that the master node's Docker Compose configuration includes the correct `extra_hosts` for log fetching from workers.
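
The exact `cluster.yml` schema is defined by `tools/generate-inventory.py` and is not shown in this commit; the sketch below is a hypothetical minimal layout meant only to illustrate the workflow, so check the generator script for the real field names.

```bash
# Hypothetical cluster.yml (field names are assumptions, not taken from this repo)
cat > cluster.yml <<'EOF'
master:
  host: 10.0.0.10
workers:
  - name: dl001
    host: 10.0.0.11
EOF
./tools/generate-inventory.py cluster.yml   # regenerates inventory.ini, host_vars/, docker-compose.yaml
```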
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Setup and Basic Usage

### Running Ansible Commands

**IMPORTANT:** All `ansible-playbook` commands should be run from within the `ansible/` directory. This allows Ansible to automatically find the `ansible.cfg` and `inventory.ini` files.

```bash
cd ansible
ansible-playbook <playbook_name>.yml
```

The `ansible.cfg` file is configured to automatically use the `.vault_pass` file located in the project root (one level above `ansible/`), so you **do not** need to pass `--vault-password-file ../.vault_pass` on the command line. Just make sure the `.vault_pass` file exists in the project root.

If you run `ansible-playbook` from the project root instead of the `ansible/` directory, you will see warnings about the inventory not being parsed, because Ansible does not automatically find `ansible/ansible.cfg`.
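
The behaviour described above corresponds to Ansible's standard `vault_password_file` setting. The exact line in `ansible/ansible.cfg` is assumed here rather than quoted from this diff, so verify it in your checkout:

```bash
# Confirm where ansible.cfg points for the vault password (run from the project root)
grep -n "vault_password_file" ansible/ansible.cfg
# Expected to print something along the lines of:
#   vault_password_file = ../.vault_pass
```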

---

## 3. Deployment Scenarios

### Full Cluster Deployment

To deploy or update the entire cluster (master and all workers), run the main playbook. This will build/pull images and restart all services.

```bash
# Run from inside the ansible/ directory
ansible-playbook playbook-full.yml
```

### Targeted & Fast Deployments

For faster development cycles, you can deploy changes to specific parts of the cluster without rebuilding or re-pulling Docker images.

#### Updating Only the Master Node (Fast Deploy)

To sync configuration, code, and restart services on the master node *without* rebuilding the Airflow image or pulling the `ytdlp-ops-server` image, use the `fast_deploy` flag with the master playbook. This is ideal for pushing changes to DAGs, Python code, or config files.

```bash
# Run from inside the ansible/ directory
ansible-playbook playbook-master.yml --extra-vars "fast_deploy=true"
```

#### Updating Only a Specific Worker Node (Fast Deploy)

Similarly, you can update a single worker node. Replace `dl001` with the hostname of the worker you want to target from your `inventory.ini`.

```bash
# Run from inside the ansible/ directory
ansible-playbook playbook-worker.yml --limit dl001 --extra-vars "fast_deploy=true"
```

#### Updating Only DAGs and Configs

If you have only changed DAGs or configuration files and don't need to restart any services, you can run a much faster playbook that only syncs the `dags/` and `config/` directories.

```bash
# Run from inside the ansible/ directory
ansible-playbook playbook-dags.yml
```

@@ -6,3 +6,5 @@ vault_vnc_password: "vnc_pwd_Z5xW8cV2bN4mP7lK"
 vault_ss_password_1: "UCUAR7vRO/u9Zo71nfA13c+/b1MCiJpfZJo+EmEBCfA="
 vault_ss_password_2: "tgtQcfjJp/A3F01g4woO0bEQoxij3CAOK/iR1OTPuF4="
 vault_dockerhub_password: "dckr_pat_DmFFqwFEdXFvZlgngGY9ooBaq6o"
+vault_s3_access_key_id: "admin"
+vault_s3_secret_access_key: "0153093693-0009"
@ -60,3 +60,4 @@
|
|||||||
loop:
|
loop:
|
||||||
- "airflow.cfg"
|
- "airflow.cfg"
|
||||||
- "custom_task_hooks.py"
|
- "custom_task_hooks.py"
|
||||||
|
|
||||||
|
|||||||
@ -111,6 +111,53 @@
|
|||||||
name: airflow_proxynet
|
name: airflow_proxynet
|
||||||
driver: bridge
|
driver: bridge
|
||||||
|
|
||||||
|
post_tasks:
|
||||||
|
- name: Sync custom_task_hooks.py to MASTER server
|
||||||
|
when: inventory_hostname in groups['airflow_master']
|
||||||
|
synchronize:
|
||||||
|
src: "../airflow/config/custom_task_hooks.py"
|
||||||
|
dest: "{{ airflow_master_dir }}/config/"
|
||||||
|
archive: yes
|
||||||
|
rsync_path: "sudo rsync"
|
||||||
|
|
||||||
|
- name: Sync airflow_local_settings.py to MASTER server
|
||||||
|
when: inventory_hostname in groups['airflow_master']
|
||||||
|
synchronize:
|
||||||
|
src: "../airflow/config/airflow_local_settings.py"
|
||||||
|
dest: "{{ airflow_master_dir }}/config/"
|
||||||
|
archive: yes
|
||||||
|
rsync_path: "sudo rsync"
|
||||||
|
|
||||||
|
- name: Sync custom_task_hooks.py to WORKER server
|
||||||
|
when: inventory_hostname in groups['airflow_workers']
|
||||||
|
synchronize:
|
||||||
|
src: "../airflow/config/custom_task_hooks.py"
|
||||||
|
dest: "{{ airflow_worker_dir }}/config/"
|
||||||
|
archive: yes
|
||||||
|
rsync_path: "sudo rsync"
|
||||||
|
|
||||||
|
- name: Sync airflow_local_settings.py to WORKER server
|
||||||
|
when: inventory_hostname in groups['airflow_workers']
|
||||||
|
synchronize:
|
||||||
|
src: "../airflow/config/airflow_local_settings.py"
|
||||||
|
dest: "{{ airflow_worker_dir }}/config/"
|
||||||
|
archive: yes
|
||||||
|
rsync_path: "sudo rsync"
|
||||||
|
|
||||||
|
- name: Restart Airflow services on MASTER to apply hook
|
||||||
|
when: inventory_hostname in groups['airflow_master']
|
||||||
|
ansible.builtin.command:
|
||||||
|
cmd: "docker compose restart airflow-scheduler airflow-webserver airflow-master-worker airflow-triggerer"
|
||||||
|
chdir: "{{ airflow_master_dir }}"
|
||||||
|
become: yes
|
||||||
|
|
||||||
|
- name: Restart Airflow worker on WORKER to apply hook
|
||||||
|
when: inventory_hostname in groups['airflow_workers']
|
||||||
|
ansible.builtin.command:
|
||||||
|
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth"
|
||||||
|
chdir: "{{ airflow_worker_dir }}"
|
||||||
|
become: yes
|
||||||
|
|
||||||
- name: Deploy master
|
- name: Deploy master
|
||||||
import_playbook: playbook-master.yml
|
import_playbook: playbook-master.yml
|
||||||
when: inventory_hostname in groups['airflow_master']
|
when: inventory_hostname in groups['airflow_master']
|
||||||
|
|||||||
@ -48,6 +48,6 @@
|
|||||||
- name: Restart Airflow worker on WORKER
|
- name: Restart Airflow worker on WORKER
|
||||||
when: inventory_hostname in groups['airflow_workers']
|
when: inventory_hostname in groups['airflow_workers']
|
||||||
ansible.builtin.command:
|
ansible.builtin.command:
|
||||||
cmd: "docker compose restart airflow-worker"
|
cmd: "docker compose restart airflow-worker-dl airflow-worker-auth"
|
||||||
chdir: "{{ airflow_worker_dir }}"
|
chdir: "{{ airflow_worker_dir }}"
|
||||||
become: yes
|
become: yes
|
||||||
|
|||||||
@ -144,6 +144,42 @@
|
|||||||
deploy_group_gid: "0"
|
deploy_group_gid: "0"
|
||||||
when: deploy_group_gid is not defined or deploy_group_gid == ""
|
when: deploy_group_gid is not defined or deploy_group_gid == ""
|
||||||
|
|
||||||
|
- name: Generate Docker Compose configurations
|
||||||
|
ansible.builtin.command: >
|
||||||
|
docker compose --project-directory . -f configs/docker-compose.config-generate.yaml run --rm config-generator
|
||||||
|
args:
|
||||||
|
chdir: "{{ airflow_master_dir }}"
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
register: config_generator_result
|
||||||
|
changed_when: "'Creating' in config_generator_result.stdout or 'Recreating' in config_generator_result.stdout"
|
||||||
|
|
||||||
|
- name: Show config generator output
|
||||||
|
ansible.builtin.debug:
|
||||||
|
var: config_generator_result.stdout_lines
|
||||||
|
when: config_generator_result.changed
|
||||||
|
|
||||||
|
- name: Ensure Airflow project directory is writable by the container user (UID 50000)
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "{{ airflow_master_dir }}"
|
||||||
|
owner: 50000
|
||||||
|
group: 50000
|
||||||
|
become: yes
|
||||||
|
|
||||||
|
- name: Ensure Airflow subdirectories are writable by the container user (UID 50000)
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "{{ item }}"
|
||||||
|
owner: 50000
|
||||||
|
group: 50000
|
||||||
|
recurse: yes
|
||||||
|
state: directory
|
||||||
|
loop:
|
||||||
|
- "{{ airflow_master_dir }}/dags"
|
||||||
|
- "{{ airflow_master_dir }}/logs"
|
||||||
|
- "{{ airflow_master_dir }}/plugins"
|
||||||
|
- "{{ airflow_master_dir }}/config"
|
||||||
|
become: yes
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Install pipx
|
- name: Install pipx
|
||||||
ansible.builtin.apt:
|
ansible.builtin.apt:
|
||||||
@ -170,3 +206,23 @@
|
|||||||
- name: Include camoufox verification tasks
|
- name: Include camoufox verification tasks
|
||||||
include_tasks: tasks/verify_camoufox.yml
|
include_tasks: tasks/verify_camoufox.yml
|
||||||
when: not fast_deploy | default(false)
|
when: not fast_deploy | default(false)
|
||||||
|
|
||||||
|
- name: Run regression test
|
||||||
|
command: >
|
||||||
|
docker exec -i airflow-regression-runner python3 /opt/airflow/dags/scripts/regression.py
|
||||||
|
--client "{{ regression_client | default('mweb') }}"
|
||||||
|
--workers {{ regression_workers | default(4) }}
|
||||||
|
--workers-per-bunch {{ regression_workers_per_bunch | default(4) }}
|
||||||
|
--run-time-min {{ regression_run_time_min | default(120) }}
|
||||||
|
--input-file "{{ regression_input_file | default('/opt/airflow/inputfiles/video_ids.csv') }}"
|
||||||
|
--progress-interval-min {{ regression_progress_interval_min | default(2) }}
|
||||||
|
--report-file "{{ regression_report_file | default('/opt/airflow/downloadfiles/regression_report.csv') }}"
|
||||||
|
{% if regression_cleanup | default(true) %}--cleanup{% endif %}
|
||||||
|
register: regression_test_result
|
||||||
|
changed_when: false
|
||||||
|
when: run_regression_test | default(false)
|
||||||
|
|
||||||
|
- name: Display regression test output
|
||||||
|
debug:
|
||||||
|
var: regression_test_result.stdout_lines
|
||||||
|
when: run_regression_test | default(false)
|
||||||
|
|||||||
108
ansible/playbook-sync-local.yml
Normal file
108
ansible/playbook-sync-local.yml
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
---
|
||||||
|
- name: Sync Local Development Files to Workers
|
||||||
|
hosts: airflow_workers
|
||||||
|
gather_facts: no
|
||||||
|
vars_files:
|
||||||
|
- "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
|
||||||
|
|
||||||
|
pre_tasks:
|
||||||
|
- name: Announce local sync
|
||||||
|
debug:
|
||||||
|
msg: "Syncing local dev files to {{ inventory_hostname }} at {{ airflow_worker_dir }}"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Check if yt-dlp is installed
|
||||||
|
ansible.builtin.command: which yt-dlp
|
||||||
|
register: ytdlp_check
|
||||||
|
changed_when: false
|
||||||
|
failed_when: false
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
|
||||||
|
- name: Install yt-dlp if not found
|
||||||
|
ansible.builtin.command: python3 -m pip install -U "yt-dlp[default]" --break-system-packages
|
||||||
|
when: ytdlp_check.rc != 0
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
|
||||||
|
- name: Sync thrift_model directory to workers
|
||||||
|
ansible.posix.synchronize:
|
||||||
|
src: ../thrift_model/
|
||||||
|
dest: "{{ airflow_worker_dir }}/thrift_model/"
|
||||||
|
rsync_opts:
|
||||||
|
- "--delete"
|
||||||
|
- "--exclude=.DS_Store"
|
||||||
|
- "--exclude=__pycache__"
|
||||||
|
- "--exclude='*.pyc'"
|
||||||
|
recursive: yes
|
||||||
|
perms: yes
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
|
||||||
|
- name: Sync pangramia package to workers
|
||||||
|
ansible.posix.synchronize:
|
||||||
|
src: ../pangramia/
|
||||||
|
dest: "{{ airflow_worker_dir }}/pangramia/"
|
||||||
|
rsync_opts:
|
||||||
|
- "--delete"
|
||||||
|
- "--exclude=.DS_Store"
|
||||||
|
- "--exclude=__pycache__"
|
||||||
|
- "--exclude='*.pyc'"
|
||||||
|
recursive: yes
|
||||||
|
perms: yes
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
|
||||||
|
- name: Sync ytops_client directory to workers
|
||||||
|
ansible.posix.synchronize:
|
||||||
|
src: ../ytops_client/
|
||||||
|
dest: "{{ airflow_worker_dir }}/ytops_client/"
|
||||||
|
rsync_opts:
|
||||||
|
- "--delete"
|
||||||
|
- "--exclude=.DS_Store"
|
||||||
|
- "--exclude=__pycache__"
|
||||||
|
- "--exclude='*.pyc'"
|
||||||
|
recursive: yes
|
||||||
|
perms: yes
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
|
||||||
|
- name: Sync policies directory to workers
|
||||||
|
ansible.posix.synchronize:
|
||||||
|
src: ../policies/
|
||||||
|
dest: "{{ airflow_worker_dir }}/policies/"
|
||||||
|
rsync_opts:
|
||||||
|
- "--delete"
|
||||||
|
- "--exclude=.DS_Store"
|
||||||
|
- "--exclude=__pycache__"
|
||||||
|
- "--exclude='*.pyc'"
|
||||||
|
recursive: yes
|
||||||
|
perms: yes
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
|
||||||
|
- name: Ensure bin directory exists on workers for client utilities
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "{{ airflow_worker_dir }}/bin"
|
||||||
|
state: directory
|
||||||
|
mode: '0755'
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
|
||||||
|
- name: Sync client utility scripts to workers
|
||||||
|
ansible.posix.synchronize:
|
||||||
|
src: "../{{ item }}"
|
||||||
|
dest: "{{ airflow_worker_dir }}/{{ item }}"
|
||||||
|
perms: yes
|
||||||
|
loop:
|
||||||
|
- "README.client.md"
|
||||||
|
- "cli.config"
|
||||||
|
- "format_download.py"
|
||||||
|
- "get_info_json_client.py"
|
||||||
|
- "list_formats.py"
|
||||||
|
- "stress_test_formats.py"
|
||||||
|
- "stress_enhanced.py"
|
||||||
|
- "package_client.py"
|
||||||
|
- "bin/ytops-client"
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
27
ansible/playbook-update-regression-script.yml
Normal file
27
ansible/playbook-update-regression-script.yml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
---
|
||||||
|
- name: Update Regression Test Script
|
||||||
|
hosts: airflow_master
|
||||||
|
gather_facts: no
|
||||||
|
vars:
|
||||||
|
# This should be the root directory of your project on the master host.
|
||||||
|
# It's set as a variable so you can override it if needed, e.g.,
|
||||||
|
# ansible-playbook ... -e "project_dir=/path/to/your/project"
|
||||||
|
project_dir: "/srv/airflow_master"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Copy latest regression.py script to the master host
|
||||||
|
copy:
|
||||||
|
src: ../airflow/dags/scripts/regression.py
|
||||||
|
dest: "{{ project_dir }}/dags/scripts/regression.py"
|
||||||
|
owner: "{{ ansible_user }}"
|
||||||
|
group: "ytdl" # Assuming the same deploy group as the main playbook
|
||||||
|
mode: '0644'
|
||||||
|
become: yes
|
||||||
|
notify:
|
||||||
|
- Announce completion
|
||||||
|
|
||||||
|
handlers:
|
||||||
|
- name: Announce completion
|
||||||
|
listen: "Announce completion"
|
||||||
|
debug:
|
||||||
|
msg: "Regression script has been updated on {{ inventory_hostname }}. You can now run it using 'docker exec'."
|
||||||
@ -8,7 +8,7 @@
|
|||||||
pre_tasks:
|
pre_tasks:
|
||||||
- name: Announce worker deployment
|
- name: Announce worker deployment
|
||||||
debug:
|
debug:
|
||||||
msg: "Starting deployment for Airflow Worker: {{ inventory_hostname }} ({{ ansible_host }})"
|
msg: "Starting deployment for Airflow Worker: {{ inventory_hostname }} ({{ ansible_user }}@{{ ansible_host }})"
|
||||||
|
|
||||||
- name: Configure system timezone
|
- name: Configure system timezone
|
||||||
# Ensures all services and logs on this node use a consistent timezone.
|
# Ensures all services and logs on this node use a consistent timezone.
|
||||||
@ -129,6 +129,96 @@
|
|||||||
become: yes
|
become: yes
|
||||||
when: limits_sysctl_config_copy.changed
|
when: limits_sysctl_config_copy.changed
|
||||||
|
|
||||||
|
- name: Create logs directory structure relative to deployment
|
||||||
|
file:
|
||||||
|
path: "./logs/yt-dlp-ops/communication_logs"
|
||||||
|
state: directory
|
||||||
|
mode: '0755'
|
||||||
|
owner: "{{ ansible_user }}"
|
||||||
|
group: "{{ deploy_group }}"
|
||||||
|
become: yes
|
||||||
|
|
||||||
|
- name: Build local Docker images (e.g., camoufox)
|
||||||
|
ansible.builtin.command: >
|
||||||
|
docker compose --project-directory . -f configs/docker-compose-ytdlp-ops.yaml build
|
||||||
|
args:
|
||||||
|
chdir: "{{ airflow_worker_dir }}"
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
register: docker_build_result
|
||||||
|
changed_when: "'Building' in docker_build_result.stdout or 'writing image' in docker_build_result.stdout"
|
||||||
|
|
||||||
|
- name: Pull pre-built Docker images for ytdlp-ops services
|
||||||
|
ansible.builtin.command: >
|
||||||
|
docker compose --project-directory . -f configs/docker-compose-ytdlp-ops.yaml pull --ignore-buildable
|
||||||
|
args:
|
||||||
|
chdir: "{{ airflow_worker_dir }}"
|
||||||
|
become: yes
|
||||||
|
become_user: "{{ ansible_user }}"
|
||||||
|
register: docker_pull_result
|
||||||
|
retries: 3
|
||||||
|
delay: 10
|
||||||
|
changed_when: "'Pulling' in docker_pull_result.stdout or 'Downloaded' in docker_pull_result.stdout"
|
||||||
|
|
||||||
|
- name: Show docker pull output
|
||||||
|
ansible.builtin.debug:
|
||||||
|
var: docker_pull_result.stdout_lines
|
||||||
|
when: docker_pull_result.changed
|
||||||
|
|
||||||
|
- name: Ensure Airflow project directory is writable by the container user (UID 50000)
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "{{ airflow_worker_dir }}"
|
||||||
|
owner: 50000
|
||||||
|
group: 50000
|
||||||
|
become: yes
|
||||||
|
|
||||||
|
- name: Ensure Airflow subdirectories are writable by the container user (UID 50000)
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "{{ item }}"
|
||||||
|
owner: 50000
|
||||||
|
group: 50000
|
||||||
|
recurse: yes
|
||||||
|
state: directory
|
||||||
|
loop:
|
||||||
|
- "{{ airflow_worker_dir }}/dags"
|
||||||
|
- "{{ airflow_worker_dir }}/logs"
|
||||||
|
- "{{ airflow_worker_dir }}/plugins"
|
||||||
|
- "{{ airflow_worker_dir }}/config"
|
||||||
|
become: yes
|
||||||
|
|
||||||
|
- name: Create .dockerignore on worker to exclude runtime data from build context
|
||||||
|
ansible.builtin.copy:
|
||||||
|
dest: "{{ airflow_worker_dir }}/.dockerignore"
|
||||||
|
content: |
|
||||||
|
# Exclude build artifacts and virtual environments
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
|
||||||
|
# Exclude sensitive information
|
||||||
|
.env
|
||||||
|
.vault_pass
|
||||||
|
|
||||||
|
# Exclude local development and OS-specific files
|
||||||
|
.DS_Store
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
|
||||||
|
# Exclude large directories with runtime data that should not be in the image
|
||||||
|
logs/
|
||||||
|
downloadfiles/
|
||||||
|
addfiles/
|
||||||
|
*downloads/
|
||||||
|
postgres-data/
|
||||||
|
redis-data/
|
||||||
|
minio-data/
|
||||||
|
owner: "{{ ansible_user }}"
|
||||||
|
group: "{{ deploy_group }}"
|
||||||
|
mode: '0644'
|
||||||
|
become: yes
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Install pipx
|
- name: Install pipx
|
||||||
ansible.builtin.apt:
|
ansible.builtin.apt:
|
||||||
|
|||||||
22
ansible/playbook-ytdlp-master-only.yml
Normal file
22
ansible/playbook-ytdlp-master-only.yml
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
---
|
||||||
|
- name: Deploy YTDLP Master Services (Management Role Only)
|
||||||
|
hosts: airflow_master
|
||||||
|
gather_facts: no
|
||||||
|
vars_files:
|
||||||
|
- "{{ inventory_dir }}/group_vars/all/generated_vars.yml"
|
||||||
|
- "{{ inventory_dir }}/group_vars/all/vault.yml"
|
||||||
|
tasks:
|
||||||
|
- name: Announce ytdlp-master-only deployment
|
||||||
|
debug:
|
||||||
|
msg: "Starting deployment for YTDLP Master services on: {{ inventory_hostname }}"
|
||||||
|
|
||||||
|
- name: Start/Redeploy ytdlp-ops services without camoufox
|
||||||
|
community.docker.docker_compose_v2:
|
||||||
|
project_src: "{{ airflow_master_dir }}"
|
||||||
|
files:
|
||||||
|
- configs/docker-compose-ytdlp-ops.yaml
|
||||||
|
state: present
|
||||||
|
remove_orphans: true
|
||||||
|
recreate: always
|
||||||
|
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"
|
||||||
|
become: yes
|
||||||
19
ansible/playbooks/playbook-bgutils-start.yml
Normal file
19
ansible/playbooks/playbook-bgutils-start.yml
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
---
|
||||||
|
- name: Start bgutil-provider service
|
||||||
|
hosts: all # Use --limit to target specific hosts, e.g., --limit management
|
||||||
|
become: true
|
||||||
|
gather_facts: false
|
||||||
|
vars:
|
||||||
|
container_name: "bgutil-provider"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: "Ensure {{ container_name }} container is started"
|
||||||
|
community.docker.docker_container:
|
||||||
|
name: "{{ container_name }}"
|
||||||
|
state: started
|
||||||
|
register: container_status
|
||||||
|
|
||||||
|
- name: "Display container status"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "{{ container_name }} was started."
|
||||||
|
when: container_status.changed
|
||||||
19
ansible/playbooks/playbook-bgutils-stop.yml
Normal file
19
ansible/playbooks/playbook-bgutils-stop.yml
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
---
|
||||||
|
- name: Stop bgutil-provider service
|
||||||
|
hosts: all # Use --limit to target specific hosts, e.g., --limit management
|
||||||
|
become: true
|
||||||
|
gather_facts: false
|
||||||
|
vars:
|
||||||
|
container_name: "bgutil-provider"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: "Ensure {{ container_name }} container is stopped"
|
||||||
|
community.docker.docker_container:
|
||||||
|
name: "{{ container_name }}"
|
||||||
|
state: stopped
|
||||||
|
register: container_status
|
||||||
|
|
||||||
|
- name: "Display container status"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "{{ container_name }} was stopped."
|
||||||
|
when: container_status.changed
|
||||||
53
ansible/playbooks/restart_worker.yml
Normal file
53
ansible/playbooks/restart_worker.yml
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
---
|
||||||
|
- name: Restart and Update ytdlp-ops Worker
|
||||||
|
hosts: all:!af-green
|
||||||
|
vars:
|
||||||
|
# This should be the root directory of your project on the target worker machine.
|
||||||
|
project_dir: "{{ '/srv/airflow_master' if inventory_hostname == 'af-green' else '/srv/airflow_dl_worker' }}"
|
||||||
|
# This is the path to your compose file, relative to the project_dir.
|
||||||
|
compose_file: "configs/docker-compose-ytdlp-ops.yaml"
|
||||||
|
# The specific image to pull for updates.
|
||||||
|
service_image: "pangramia/ytdlp-ops-server:4.0.1"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: "Ensure project directory exists"
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "{{ project_dir }}"
|
||||||
|
state: directory
|
||||||
|
mode: '0755'
|
||||||
|
become: yes
|
||||||
|
|
||||||
|
- name: "Copy get_info_json_client.py to worker"
|
||||||
|
ansible.builtin.copy:
|
||||||
|
src: ../../get_info_json_client.py
|
||||||
|
dest: "{{ project_dir }}/get_info_json_client.py"
|
||||||
|
mode: '0755'
|
||||||
|
become: yes
|
||||||
|
|
||||||
|
- name: "Pull the latest image for the ytdlp-ops service"
|
||||||
|
community.docker.docker_image:
|
||||||
|
name: "{{ service_image }}"
|
||||||
|
source: pull
|
||||||
|
tags:
|
||||||
|
- pull
|
||||||
|
|
||||||
|
- name: "Take down the ytdlp-ops services"
|
||||||
|
community.docker.docker_compose_v2:
|
||||||
|
project_src: "{{ project_dir }}"
|
||||||
|
files:
|
||||||
|
- "{{ compose_file }}"
|
||||||
|
state: absent
|
||||||
|
remove_volumes: true
|
||||||
|
tags:
|
||||||
|
- down
|
||||||
|
|
||||||
|
- name: "Bring up the ytdlp-ops services"
|
||||||
|
community.docker.docker_compose_v2:
|
||||||
|
project_src: "{{ project_dir }}"
|
||||||
|
files:
|
||||||
|
- "{{ compose_file }}"
|
||||||
|
state: present
|
||||||
|
recreate: always # Corresponds to --force-recreate
|
||||||
|
build: never
|
||||||
|
tags:
|
||||||
|
- up
|
||||||
3
ansible/roles/ytdlp-worker/defaults/main.yml
Normal file
3
ansible/roles/ytdlp-worker/defaults/main.yml
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
---
|
||||||
|
# defaults file for ytdlp-worker
|
||||||
|
camoufox_base_port: 10000
|
||||||
@ -101,6 +101,22 @@
|
|||||||
- "envoy.yaml.j2"
|
- "envoy.yaml.j2"
|
||||||
- "docker-compose.camoufox.yaml.j2"
|
- "docker-compose.camoufox.yaml.j2"
|
||||||
|
|
||||||
|
- name: Sync Airflow build context to worker
|
||||||
|
synchronize:
|
||||||
|
src: "../{{ item }}"
|
||||||
|
dest: "{{ airflow_worker_dir }}/"
|
||||||
|
archive: yes
|
||||||
|
recursive: yes
|
||||||
|
rsync_path: "sudo rsync"
|
||||||
|
rsync_opts: "{{ rsync_default_opts }}"
|
||||||
|
loop:
|
||||||
|
- "airflow/Dockerfile"
|
||||||
|
- "setup.py"
|
||||||
|
- "VERSION"
|
||||||
|
- "yt_ops_services"
|
||||||
|
- "thrift_model"
|
||||||
|
- "pangramia"
|
||||||
|
|
||||||
- name: Create .env file for YT-DLP worker service
|
- name: Create .env file for YT-DLP worker service
|
||||||
template:
|
template:
|
||||||
src: "../../templates/.env.j2"
|
src: "../../templates/.env.j2"
|
||||||
@ -179,6 +195,20 @@
|
|||||||
group: "{{ deploy_group }}"
|
group: "{{ deploy_group }}"
|
||||||
become: yes
|
become: yes
|
||||||
|
|
||||||
|
- name: "Log: Building Airflow image"
|
||||||
|
debug:
|
||||||
|
msg: "Building the Airflow image locally. This image contains all dependencies for running DAGs."
|
||||||
|
|
||||||
|
- name: Build Airflow image from local Dockerfile
|
||||||
|
community.docker.docker_image:
|
||||||
|
name: "pangramia/ytdlp-ops-airflow:latest"
|
||||||
|
build:
|
||||||
|
path: "{{ airflow_worker_dir }}"
|
||||||
|
dockerfile: "Dockerfile"
|
||||||
|
source: build
|
||||||
|
force_source: true
|
||||||
|
when: not fast_deploy | default(false)
|
||||||
|
|
||||||
- name: "Log: Building Camoufox (remote browser) image"
|
- name: "Log: Building Camoufox (remote browser) image"
|
||||||
debug:
|
debug:
|
||||||
msg: "Building the Camoufox image locally. This image provides remote-controlled Firefox browsers for token generation."
|
msg: "Building the Camoufox image locally. This image provides remote-controlled Firefox browsers for token generation."
|
||||||
@ -206,6 +236,27 @@
|
|||||||
path: "/srv/shadowsocks-rust/docker-compose.proxies.yaml"
|
path: "/srv/shadowsocks-rust/docker-compose.proxies.yaml"
|
||||||
register: proxy_compose_file
|
register: proxy_compose_file
|
||||||
|
|
||||||
|
- name: "Log: Stopping worker services before start"
|
||||||
|
debug:
|
||||||
|
msg: "Stopping all worker services to ensure a clean start."
|
||||||
|
|
||||||
|
- name: Stop all worker services
|
||||||
|
community.docker.docker_compose_v2:
|
||||||
|
project_src: "{{ airflow_worker_dir }}"
|
||||||
|
files:
|
||||||
|
- "configs/docker-compose-ytdlp-ops.yaml"
|
||||||
|
- "configs/docker-compose.camoufox.yaml"
|
||||||
|
- "configs/docker-compose.airflow.yml"
|
||||||
|
state: absent
|
||||||
|
remove_volumes: true # Corresponds to docker compose down -v
|
||||||
|
|
||||||
|
- name: Forcefully remove project-specific Docker volumes to fix corruption issues
|
||||||
|
ansible.builtin.shell: "docker volume ls -q --filter 'label=com.docker.compose.project=ytdlp-ops-worker' | xargs -r docker volume rm --force"
|
||||||
|
become: yes
|
||||||
|
register: removed_volumes
|
||||||
|
changed_when: removed_volumes.stdout | length > 0
|
||||||
|
failed_when: false
|
||||||
|
|
||||||
- name: "Log: Starting all worker services"
|
- name: "Log: Starting all worker services"
|
||||||
debug:
|
debug:
|
||||||
msg: "Starting all worker services: ytdlp-ops, camoufox, and airflow-worker."
|
msg: "Starting all worker services: ytdlp-ops, camoufox, and airflow-worker."
|
||||||
@ -220,6 +271,7 @@
|
|||||||
state: present
|
state: present
|
||||||
remove_orphans: true
|
remove_orphans: true
|
||||||
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"
|
pull: "{{ 'never' if fast_deploy | default(false) else 'missing' }}"
|
||||||
|
recreate: always # Corresponds to --force-recreate
|
||||||
|
|
||||||
- name: Include camoufox verification tasks
|
- name: Include camoufox verification tasks
|
||||||
include_tasks: ../../../tasks/verify_camoufox.yml
|
include_tasks: ../../../tasks/verify_camoufox.yml
|
||||||
|
|||||||
@ -29,6 +29,14 @@ FLOWER_PASSWORD="{{ vault_flower_password }}"
|
|||||||
AIRFLOW_UID={{ airflow_uid | default(1003) }}
|
AIRFLOW_UID={{ airflow_uid | default(1003) }}
|
||||||
AIRFLOW_GID={{ deploy_group_gid | default(1001) }}
|
AIRFLOW_GID={{ deploy_group_gid | default(1001) }}
|
||||||
|
|
||||||
|
# --- S3 Logging Configuration (for Airflow integration) ---
|
||||||
|
# Optional: for appending service logs to Airflow's S3 logs.
|
||||||
|
# These should match the 'minio_default' connection configured in Airflow.
|
||||||
|
S3_ENDPOINT_URL="{{ s3_endpoint_url | default('') }}"
|
||||||
|
S3_ACCESS_KEY_ID="{{ vault_s3_access_key_id | default('') }}"
|
||||||
|
S3_SECRET_ACCESS_KEY="{{ vault_s3_secret_access_key | default('') }}"
|
||||||
|
S3_REGION_NAME="{{ s3_region_name | default('us-east-1') }}"
|
||||||
|
|
||||||
# --- Master-specific settings ---
|
# --- Master-specific settings ---
|
||||||
{% if 'master' in service_role or 'management' in service_role %}
|
{% if 'master' in service_role or 'management' in service_role %}
|
||||||
MASTER_HOST_IP={{ hostvars[groups['airflow_master'][0]].ansible_host }}
|
MASTER_HOST_IP={{ hostvars[groups['airflow_master'][0]].ansible_host }}
|
||||||
|
|||||||
10	bin/ytops-client	Executable file
@@ -0,0 +1,10 @@
#!/bin/sh
set -e
# Find the directory where this script is located.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Go up one level to the project root.
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# Set PYTHONPATH to include the project root, so we can import 'ytops_client'
export PYTHONPATH="$PROJECT_ROOT${PYTHONPATH:+:$PYTHONPATH}"
# Execute the Python CLI script as a module to handle relative imports
exec python3 -m ytops_client.cli "$@"
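
The wrapper above simply re-execs `python3 -m ytops_client.cli` with the repository root on `PYTHONPATH`. The subcommands that CLI accepts are defined in `ytops_client/cli.py`, which is not part of this excerpt, so only a generic invocation is sketched here:

```bash
# Run from a checkout of the repository; the wrapper sets PYTHONPATH itself
./bin/ytops-client --help
```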
35	cli.config	Normal file
@@ -0,0 +1,35 @@
# yt-dlp configuration for format_download.py

# Continue on broken downloads
#--continue

# Do not simulate
--no-simulate

# Do not write info.json file (we already have it)
--no-write-info-json

# Continue on download errors
--ignore-errors

# Do not download playlist
--no-playlist

# Retry fragments 10 times
--fragment-retries 10

# Limit download rate to 5M
--limit-rate 5M

# Socket timeout
--socket-timeout 15

# Sleep interval
--min-sleep-interval 5
--max-sleep-interval 10

# Progress
--progress


--no-part
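
According to its header comment, `cli.config` holds yt-dlp options for `format_download.py`. How that script feeds the file to yt-dlp is not shown in this diff; as a rough illustration, the same option file can be reused with a stand-alone yt-dlp run via the standard `--config-locations` flag (the `info.json` path and format selector are placeholders):

```bash
# Illustrative only: reuse the packaged option file with plain yt-dlp
yt-dlp --config-locations ./cli.config --load-info-json ./info.json -f "bv*+ba/b"
```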
@ -1,150 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Client script to get info.json from the Thrift service.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
python get_info_json_client.py [URL] --host [HOST] --port [PORT] [options]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
--host HOST Thrift server host
|
|
||||||
--port PORT Thrift server port
|
|
||||||
--account-id ID Account ID to use
|
|
||||||
--output FILE Output file path
|
|
||||||
--verbose Enable verbose output
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import logging
|
|
||||||
from typing import Dict, Any, Optional
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logging.basicConfig(
|
|
||||||
level=logging.INFO,
|
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
||||||
)
|
|
||||||
logger = logging.getLogger('info_json_client')
|
|
||||||
|
|
||||||
# Import Thrift modules
|
|
||||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
||||||
from thrift.transport import TTransport
|
|
||||||
from pangramia.yt.common.ttypes import TokenUpdateMode
|
|
||||||
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
|
|
||||||
from yt_ops_services.client_utils import get_thrift_client
|
|
||||||
|
|
||||||
def parse_args():
|
|
||||||
"""Parse command line arguments"""
|
|
||||||
parser = argparse.ArgumentParser(description='Get info.json from Thrift service')
|
|
||||||
parser.add_argument('url', help='YouTube URL or video ID')
|
|
||||||
parser.add_argument('--host', default='127.0.0.1', help="Thrift server host. Using 127.0.0.1 avoids harmless connection errors when the local Envoy proxy only listens on IPv4.")
|
|
||||||
parser.add_argument('--port', type=int, default=9080, help='Thrift server port')
|
|
||||||
parser.add_argument('--profile', default='default_profile', help='The profile name (accountId) to use for the request.')
|
|
||||||
parser.add_argument('--client', help='Specific client to use (e.g., web, ios). Overrides server default. Append "_camoufox" to any client name (e.g., "web_camoufox") to force the browser-based generation strategy.')
|
|
||||||
parser.add_argument('--output', help='Output file path for the info.json. If not provided, prints to stdout.')
|
|
||||||
parser.add_argument('--machine-id', help='Identifier for the client machine. Defaults to hostname.')
|
|
||||||
parser.add_argument('--verbose', action='store_true', help='Enable verbose output')
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
def main():
|
|
||||||
"""Main entry point"""
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
# Set log level
|
|
||||||
if args.verbose:
|
|
||||||
logger.setLevel(logging.DEBUG)
|
|
||||||
|
|
||||||
transport = None
|
|
||||||
try:
|
|
||||||
# Create Thrift client
|
|
||||||
client, transport = get_thrift_client(args.host, args.port)
|
|
||||||
|
|
||||||
# Get token data, which includes the info.json
|
|
||||||
logger.info(f"Requesting info.json for URL '{args.url}' using profile '{args.profile}'")
|
|
||||||
|
|
||||||
# Prepare arguments for the Thrift call
|
|
||||||
machine_id = args.machine_id
|
|
||||||
if not machine_id:
|
|
||||||
import socket
|
|
||||||
machine_id = socket.gethostname()
|
|
||||||
logger.info(f"No machine ID provided, using hostname: {machine_id}")
|
|
||||||
|
|
||||||
thrift_args = {
|
|
||||||
'accountId': args.profile,
|
|
||||||
'updateType': TokenUpdateMode.AUTO,
|
|
||||||
'url': args.url,
|
|
||||||
'clients': args.client,
|
|
||||||
'machineId': machine_id
|
|
||||||
}
|
|
||||||
if args.client:
|
|
||||||
logger.info(f"Requesting to use specific client: {args.client}")
|
|
||||||
else:
|
|
||||||
logger.info("No specific client requested, server will use its default.")
|
|
||||||
|
|
||||||
token_data = client.getOrRefreshToken(**thrift_args)
|
|
||||||
|
|
||||||
if not token_data or not hasattr(token_data, 'infoJson') or not token_data.infoJson:
|
|
||||||
logger.error("Server did not return valid info.json data.")
|
|
||||||
print("Error: Server did not return valid info.json data.", file=sys.stderr)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
info_json_str = token_data.infoJson
|
|
||||||
|
|
||||||
# Check if the returned info.json is an error report
|
|
||||||
try:
|
|
||||||
info_data = json.loads(info_json_str)
|
|
||||||
if isinstance(info_data, dict) and 'error' in info_data:
|
|
||||||
error_code = info_data.get('errorCode', 'N/A')
|
|
||||||
error_message = info_data.get('message', info_data.get('error', 'Unknown error'))
|
|
||||||
logger.error(f"Server returned an error in info.json (Code: {error_code}): {error_message}")
|
|
||||||
print(f"Error from server (Code: {error_code}): {error_message}", file=sys.stderr)
|
|
||||||
# Optionally print the full error JSON
|
|
||||||
if args.verbose:
|
|
||||||
print(json.dumps(info_data, indent=2), file=sys.stderr)
|
|
||||||
return 1
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
logger.error(f"Failed to parse info.json from server: {info_json_str[:200]}...")
|
|
||||||
print("Error: Failed to parse the info.json response from the server.", file=sys.stderr)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
logger.info(f"Successfully retrieved info.json ({len(info_json_str)} bytes)")
|
|
||||||
|
|
||||||
# Write to output file if specified, otherwise print to stdout
|
|
||||||
if args.output:
|
|
||||||
try:
|
|
||||||
with open(args.output, 'w', encoding='utf-8') as f:
|
|
||||||
# Pretty-print the JSON to the file
|
|
||||||
json.dump(info_data, f, indent=2)
|
|
||||||
logger.info(f"Wrote info.json to {args.output}")
|
|
||||||
print(f"Successfully saved info.json to {args.output}")
|
|
||||||
except IOError as e:
|
|
||||||
logger.error(f"Failed to write to output file {args.output}: {e}")
|
|
||||||
print(f"Error: Failed to write to output file {args.output}: {e}", file=sys.stderr)
|
|
||||||
return 1
|
|
||||||
else:
|
|
||||||
# Pretty-print the JSON to stdout
|
|
||||||
print(json.dumps(info_data, indent=2))
|
|
||||||
|
|
||||||
return 0
|
|
||||||
except (PBServiceException, PBUserException) as e:
|
|
||||||
logger.error(f"A Thrift error occurred: {e.message}", exc_info=args.verbose)
|
|
||||||
print(f"Error: {e.message}", file=sys.stderr)
|
|
||||||
if hasattr(e, 'context') and e.context:
|
|
||||||
print(f"Context: {e.context}", file=sys.stderr)
|
|
||||||
return 1
|
|
||||||
except TTransport.TTransportException as e:
|
|
||||||
logger.error(f"Connection to server failed: {e}", exc_info=args.verbose)
|
|
||||||
print(f"Error: Connection to server at {args.host}:{args.port} failed.", file=sys.stderr)
|
|
||||||
return 1
|
|
||||||
except Exception as e:
|
|
||||||
logger.exception(f"An unexpected error occurred: {e}")
|
|
||||||
print(f"An unexpected error occurred: {e}", file=sys.stderr)
|
|
||||||
return 1
|
|
||||||
finally:
|
|
||||||
if transport and transport.isOpen():
|
|
||||||
transport.close()
|
|
||||||
logger.info("Thrift connection closed.")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
117
package_client.py
Executable file
117
package_client.py
Executable file
@ -0,0 +1,117 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Packages the client-side scripts and their dependencies into a distributable .tar.gz archive.
|
||||||
|
|
||||||
|
This script should be run from the root of the project repository.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
import tarfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Assumes yt_ops_services/version.py exists and is importable
|
||||||
|
from yt_ops_services.version import get_version as get_api_version
|
||||||
|
except ImportError:
|
||||||
|
print("Error: Could not import get_version from yt_ops_services.version.", file=sys.stderr)
|
||||||
|
print("Please ensure yt_ops_services/version.py exists and run this script from the project root.", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
def get_client_version():
|
||||||
|
"""Reads the client version from the VERSION.client file."""
|
||||||
|
try:
|
||||||
|
return Path('VERSION.client').read_text(encoding='utf-8').strip()
|
||||||
|
except FileNotFoundError:
|
||||||
|
print("Error: VERSION.client file not found in the project root.", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# --- Configuration ---
|
||||||
|
|
||||||
|
# Defines the content of the package.
|
||||||
|
# Keys are source paths relative to the project root.
|
||||||
|
# Values are destination paths inside the archive.
|
||||||
|
PACKAGE_CONTENT = {
|
||||||
|
'get_info_json_client.py': 'get_info_json_client.py',
|
||||||
|
'list_formats.py': 'list_formats.py',
|
||||||
|
'format_download.py': 'format_download.py',
|
||||||
|
'stress_test_formats.py': 'stress_test_formats.py',
|
||||||
|
'cli.config': 'cli.config',
|
||||||
|
'README.client.md': 'README.md', # Rename for convention
|
||||||
|
'formats.md': 'formats.md',
|
||||||
|
'VERSION.client': 'VERSION.client',
|
||||||
|
'yt_ops_services': 'yt_ops_services',
|
||||||
|
'thrift_model/gen_py': 'thrift_model/gen_py',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Client-side Python requirements
|
||||||
|
CLIENT_REQUIREMENTS = [
|
||||||
|
'thrift==0.16.0',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main entry point"""
|
||||||
|
parser = argparse.ArgumentParser(description="Package the yt-ops-services client tools.")
|
||||||
|
parser.add_argument('--output-dir', default='dist', help='Directory to save the package file (default: dist).')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
api_version = get_api_version()
|
||||||
|
client_version = get_client_version()
|
||||||
|
package_name = f"yt-ops-services-client-{api_version}-{client_version}"
|
||||||
|
archive_filename = f"{package_name}.tar.gz"
|
||||||
|
|
||||||
|
os.makedirs(args.output_dir, exist_ok=True)
|
||||||
|
archive_path = os.path.join(args.output_dir, archive_filename)
|
||||||
|
|
||||||
|
staging_dir = Path(args.output_dir) / f"{package_name}-staging"
|
||||||
|
|
||||||
|
print(f"Creating client package: {archive_filename}")
|
||||||
|
|
||||||
|
if staging_dir.exists():
|
||||||
|
shutil.rmtree(staging_dir)
|
||||||
|
staging_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
package_root = staging_dir / package_name
|
||||||
|
package_root.mkdir()
|
||||||
|
|
||||||
|
try:
|
||||||
|
print("Staging files...")
|
||||||
|
for src, dest in PACKAGE_CONTENT.items():
|
||||||
|
src_path = Path(src)
|
||||||
|
dest_path = package_root / dest
|
||||||
|
|
||||||
|
if not src_path.exists():
|
||||||
|
print(f"Warning: Source not found, skipping: {src_path}", file=sys.stderr)
|
||||||
|
continue
|
||||||
|
|
||||||
|
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
if src_path.is_dir():
|
||||||
|
shutil.copytree(src_path, dest_path)
|
||||||
|
else:
|
||||||
|
shutil.copy2(src_path, dest_path)
|
||||||
|
|
||||||
|
# Create __init__.py to ensure thrift_model is a package
|
||||||
|
(package_root / 'thrift_model/__init__.py').touch()
|
||||||
|
|
||||||
|
print("Creating requirements.txt...")
|
||||||
|
(package_root / 'requirements.txt').write_text('\n'.join(CLIENT_REQUIREMENTS) + '\n', encoding='utf-8')
|
||||||
|
|
||||||
|
print(f"Creating archive at {archive_path}...")
|
||||||
|
with tarfile.open(archive_path, "w:gz") as tar:
|
||||||
|
tar.add(package_root, arcname=package_name)
|
||||||
|
|
||||||
|
print("\nPackage created successfully!")
|
||||||
|
print(f" -> {archive_path}")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
if staging_dir.exists():
|
||||||
|
print("Cleaning up staging directory...")
|
||||||
|
shutil.rmtree(staging_dir)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
0	pangramia/__init__.py	Normal file
BIN	pangramia/__pycache__/__init__.cpython-39.pyc	Normal file
Binary file not shown.
131
pangramia/base_service/BaseService-remote
Executable file
131
pangramia/base_service/BaseService-remote
Executable file
@ -0,0 +1,131 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#
|
||||||
|
# Autogenerated by Thrift Compiler (0.20.0)
|
||||||
|
#
|
||||||
|
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
||||||
|
#
|
||||||
|
# options string: py
|
||||||
|
#
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import pprint
|
||||||
|
if sys.version_info[0] > 2:
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
else:
|
||||||
|
from urlparse import urlparse
|
||||||
|
from thrift.transport import TTransport, TSocket, TSSLSocket, THttpClient
|
||||||
|
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
|
||||||
|
|
||||||
|
from pangramia.base_service import BaseService
|
||||||
|
from pangramia.base_service.ttypes import *
|
||||||
|
|
||||||
|
if len(sys.argv) <= 1 or sys.argv[1] == '--help':
|
||||||
|
print('')
|
||||||
|
print('Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] [-s[sl]] [-novalidate] [-ca_certs certs] [-keyfile keyfile] [-certfile certfile] function [arg1 [arg2...]]')
|
||||||
|
print('')
|
||||||
|
print('Functions:')
|
||||||
|
print(' bool ping()')
|
||||||
|
print(' bool reportError(string message, details)')
|
||||||
|
print(' void shutdown()')
|
||||||
|
print('')
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
pp = pprint.PrettyPrinter(indent=2)
|
||||||
|
host = 'localhost'
|
||||||
|
port = 9090
|
||||||
|
uri = ''
|
||||||
|
framed = False
|
||||||
|
ssl = False
|
||||||
|
validate = True
|
||||||
|
ca_certs = None
|
||||||
|
keyfile = None
|
||||||
|
certfile = None
|
||||||
|
http = False
|
||||||
|
argi = 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-h':
|
||||||
|
parts = sys.argv[argi + 1].split(':')
|
||||||
|
host = parts[0]
|
||||||
|
if len(parts) > 1:
|
||||||
|
port = int(parts[1])
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-u':
|
||||||
|
url = urlparse(sys.argv[argi + 1])
|
||||||
|
parts = url[1].split(':')
|
||||||
|
host = parts[0]
|
||||||
|
if len(parts) > 1:
|
||||||
|
port = int(parts[1])
|
||||||
|
else:
|
||||||
|
port = 80
|
||||||
|
uri = url[2]
|
||||||
|
if url[4]:
|
||||||
|
uri += '?%s' % url[4]
|
||||||
|
http = True
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed':
|
||||||
|
framed = True
|
||||||
|
argi += 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-s' or sys.argv[argi] == '-ssl':
|
||||||
|
ssl = True
|
||||||
|
argi += 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-novalidate':
|
||||||
|
validate = False
|
||||||
|
argi += 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-ca_certs':
|
||||||
|
ca_certs = sys.argv[argi+1]
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-keyfile':
|
||||||
|
keyfile = sys.argv[argi+1]
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-certfile':
|
||||||
|
certfile = sys.argv[argi+1]
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
cmd = sys.argv[argi]
|
||||||
|
args = sys.argv[argi + 1:]
|
||||||
|
|
||||||
|
if http:
|
||||||
|
transport = THttpClient.THttpClient(host, port, uri)
|
||||||
|
else:
|
||||||
|
if ssl:
|
||||||
|
socket = TSSLSocket.TSSLSocket(host, port, validate=validate, ca_certs=ca_certs, keyfile=keyfile, certfile=certfile)
|
||||||
|
else:
|
||||||
|
socket = TSocket.TSocket(host, port)
|
||||||
|
if framed:
|
||||||
|
transport = TTransport.TFramedTransport(socket)
|
||||||
|
else:
|
||||||
|
transport = TTransport.TBufferedTransport(socket)
|
||||||
|
protocol = TBinaryProtocol(transport)
|
||||||
|
client = BaseService.Client(protocol)
|
||||||
|
transport.open()
|
||||||
|
|
||||||
|
if cmd == 'ping':
|
||||||
|
if len(args) != 0:
|
||||||
|
print('ping requires 0 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.ping())
|
||||||
|
|
||||||
|
elif cmd == 'reportError':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('reportError requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.reportError(args[0], eval(args[1]),))
|
||||||
|
|
||||||
|
elif cmd == 'shutdown':
|
||||||
|
if len(args) != 0:
|
||||||
|
print('shutdown requires 0 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.shutdown())
|
||||||
|
|
||||||
|
else:
|
||||||
|
print('Unrecognized method %s' % cmd)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
transport.close()
|
||||||
564
pangramia/base_service/BaseService.py
Normal file
564
pangramia/base_service/BaseService.py
Normal file
@ -0,0 +1,564 @@
|
|||||||
|
#
|
||||||
|
# Autogenerated by Thrift Compiler (0.20.0)
|
||||||
|
#
|
||||||
|
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
||||||
|
#
|
||||||
|
# options string: py
|
||||||
|
#
|
||||||
|
|
||||||
|
from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
|
||||||
|
from thrift.protocol.TProtocol import TProtocolException
|
||||||
|
from thrift.TRecursive import fix_spec
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
from .ttypes import *
|
||||||
|
from thrift.Thrift import TProcessor
|
||||||
|
from thrift.transport import TTransport
|
||||||
|
all_structs = []
|
||||||
|
|
||||||
|
|
||||||
|
class Iface(object):
|
||||||
|
def ping(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def reportError(self, message, details):
|
||||||
|
"""
|
||||||
|
Parameters:
|
||||||
|
- message
|
||||||
|
- details
|
||||||
|
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def shutdown(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class Client(Iface):
|
||||||
|
def __init__(self, iprot, oprot=None):
|
||||||
|
self._iprot = self._oprot = iprot
|
||||||
|
if oprot is not None:
|
||||||
|
self._oprot = oprot
|
||||||
|
self._seqid = 0
|
||||||
|
|
||||||
|
def ping(self):
|
||||||
|
self.send_ping()
|
||||||
|
return self.recv_ping()
|
||||||
|
|
||||||
|
def send_ping(self):
|
||||||
|
self._oprot.writeMessageBegin('ping', TMessageType.CALL, self._seqid)
|
||||||
|
args = ping_args()
|
||||||
|
args.write(self._oprot)
|
||||||
|
self._oprot.writeMessageEnd()
|
||||||
|
self._oprot.trans.flush()
|
||||||
|
|
||||||
|
def recv_ping(self):
|
||||||
|
iprot = self._iprot
|
||||||
|
(fname, mtype, rseqid) = iprot.readMessageBegin()
|
||||||
|
if mtype == TMessageType.EXCEPTION:
|
||||||
|
x = TApplicationException()
|
||||||
|
x.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
raise x
|
||||||
|
result = ping_result()
|
||||||
|
result.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
if result.success is not None:
|
||||||
|
return result.success
|
||||||
|
if result.serviceExp is not None:
|
||||||
|
raise result.serviceExp
|
||||||
|
if result.userExp is not None:
|
||||||
|
raise result.userExp
|
||||||
|
raise TApplicationException(TApplicationException.MISSING_RESULT, "ping failed: unknown result")
|
||||||
|
|
||||||
|
def reportError(self, message, details):
|
||||||
|
"""
|
||||||
|
Parameters:
|
||||||
|
- message
|
||||||
|
- details
|
||||||
|
|
||||||
|
"""
|
||||||
|
self.send_reportError(message, details)
|
||||||
|
return self.recv_reportError()
|
||||||
|
|
||||||
|
def send_reportError(self, message, details):
|
||||||
|
self._oprot.writeMessageBegin('reportError', TMessageType.CALL, self._seqid)
|
||||||
|
args = reportError_args()
|
||||||
|
args.message = message
|
||||||
|
args.details = details
|
||||||
|
args.write(self._oprot)
|
||||||
|
self._oprot.writeMessageEnd()
|
||||||
|
self._oprot.trans.flush()
|
||||||
|
|
||||||
|
def recv_reportError(self):
|
||||||
|
iprot = self._iprot
|
||||||
|
(fname, mtype, rseqid) = iprot.readMessageBegin()
|
||||||
|
if mtype == TMessageType.EXCEPTION:
|
||||||
|
x = TApplicationException()
|
||||||
|
x.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
raise x
|
||||||
|
result = reportError_result()
|
||||||
|
result.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
if result.success is not None:
|
||||||
|
return result.success
|
||||||
|
if result.serviceExp is not None:
|
||||||
|
raise result.serviceExp
|
||||||
|
if result.userExp is not None:
|
||||||
|
raise result.userExp
|
||||||
|
raise TApplicationException(TApplicationException.MISSING_RESULT, "reportError failed: unknown result")
|
||||||
|
|
||||||
|
def shutdown(self):
|
||||||
|
self.send_shutdown()
|
||||||
|
|
||||||
|
def send_shutdown(self):
|
||||||
|
self._oprot.writeMessageBegin('shutdown', TMessageType.ONEWAY, self._seqid)
|
||||||
|
args = shutdown_args()
|
||||||
|
args.write(self._oprot)
|
||||||
|
self._oprot.writeMessageEnd()
|
||||||
|
self._oprot.trans.flush()
|
||||||
|
|
||||||
|
|
||||||
|
class Processor(Iface, TProcessor):
|
||||||
|
def __init__(self, handler):
|
||||||
|
self._handler = handler
|
||||||
|
self._processMap = {}
|
||||||
|
self._processMap["ping"] = Processor.process_ping
|
||||||
|
self._processMap["reportError"] = Processor.process_reportError
|
||||||
|
self._processMap["shutdown"] = Processor.process_shutdown
|
||||||
|
self._on_message_begin = None
|
||||||
|
|
||||||
|
def on_message_begin(self, func):
|
||||||
|
self._on_message_begin = func
|
||||||
|
|
||||||
|
def process(self, iprot, oprot):
|
||||||
|
(name, type, seqid) = iprot.readMessageBegin()
|
||||||
|
if self._on_message_begin:
|
||||||
|
self._on_message_begin(name, type, seqid)
|
||||||
|
if name not in self._processMap:
|
||||||
|
iprot.skip(TType.STRUCT)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
x = TApplicationException(TApplicationException.UNKNOWN_METHOD, 'Unknown function %s' % (name))
|
||||||
|
oprot.writeMessageBegin(name, TMessageType.EXCEPTION, seqid)
|
||||||
|
x.write(oprot)
|
||||||
|
oprot.writeMessageEnd()
|
||||||
|
oprot.trans.flush()
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
self._processMap[name](self, seqid, iprot, oprot)
|
||||||
|
return True
|
||||||
|
|
||||||
|
def process_ping(self, seqid, iprot, oprot):
|
||||||
|
args = ping_args()
|
||||||
|
args.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
result = ping_result()
|
||||||
|
try:
|
||||||
|
result.success = self._handler.ping()
|
||||||
|
msg_type = TMessageType.REPLY
|
||||||
|
except TTransport.TTransportException:
|
||||||
|
raise
|
||||||
|
except pangramia.yt.exceptions.ttypes.PBServiceException as serviceExp:
|
||||||
|
msg_type = TMessageType.REPLY
|
||||||
|
result.serviceExp = serviceExp
|
||||||
|
except pangramia.yt.exceptions.ttypes.PBUserException as userExp:
|
||||||
|
msg_type = TMessageType.REPLY
|
||||||
|
result.userExp = userExp
|
||||||
|
except TApplicationException as ex:
|
||||||
|
logging.exception('TApplication exception in handler')
|
||||||
|
msg_type = TMessageType.EXCEPTION
|
||||||
|
result = ex
|
||||||
|
except Exception:
|
||||||
|
logging.exception('Unexpected exception in handler')
|
||||||
|
msg_type = TMessageType.EXCEPTION
|
||||||
|
result = TApplicationException(TApplicationException.INTERNAL_ERROR, 'Internal error')
|
||||||
|
oprot.writeMessageBegin("ping", msg_type, seqid)
|
||||||
|
result.write(oprot)
|
||||||
|
oprot.writeMessageEnd()
|
||||||
|
oprot.trans.flush()
|
||||||
|
|
||||||
|
def process_reportError(self, seqid, iprot, oprot):
|
||||||
|
args = reportError_args()
|
||||||
|
args.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
result = reportError_result()
|
||||||
|
try:
|
||||||
|
result.success = self._handler.reportError(args.message, args.details)
|
||||||
|
msg_type = TMessageType.REPLY
|
||||||
|
except TTransport.TTransportException:
|
||||||
|
raise
|
||||||
|
except pangramia.yt.exceptions.ttypes.PBServiceException as serviceExp:
|
||||||
|
msg_type = TMessageType.REPLY
|
||||||
|
result.serviceExp = serviceExp
|
||||||
|
except pangramia.yt.exceptions.ttypes.PBUserException as userExp:
|
||||||
|
msg_type = TMessageType.REPLY
|
||||||
|
result.userExp = userExp
|
||||||
|
except TApplicationException as ex:
|
||||||
|
logging.exception('TApplication exception in handler')
|
||||||
|
msg_type = TMessageType.EXCEPTION
|
||||||
|
result = ex
|
||||||
|
except Exception:
|
||||||
|
logging.exception('Unexpected exception in handler')
|
||||||
|
msg_type = TMessageType.EXCEPTION
|
||||||
|
result = TApplicationException(TApplicationException.INTERNAL_ERROR, 'Internal error')
|
||||||
|
oprot.writeMessageBegin("reportError", msg_type, seqid)
|
||||||
|
result.write(oprot)
|
||||||
|
oprot.writeMessageEnd()
|
||||||
|
oprot.trans.flush()
|
||||||
|
|
||||||
|
def process_shutdown(self, seqid, iprot, oprot):
|
||||||
|
args = shutdown_args()
|
||||||
|
args.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
try:
|
||||||
|
self._handler.shutdown()
|
||||||
|
except TTransport.TTransportException:
|
||||||
|
raise
|
||||||
|
except Exception:
|
||||||
|
logging.exception('Exception in oneway handler')
|
||||||
|
|
||||||
|
# HELPER FUNCTIONS AND STRUCTURES
|
||||||
|
|
||||||
|
|
||||||
|
class ping_args(object):
|
||||||
|
|
||||||
|
|
||||||
|
def read(self, iprot):
|
||||||
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
||||||
|
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
|
||||||
|
return
|
||||||
|
iprot.readStructBegin()
|
||||||
|
while True:
|
||||||
|
(fname, ftype, fid) = iprot.readFieldBegin()
|
||||||
|
if ftype == TType.STOP:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
iprot.readFieldEnd()
|
||||||
|
iprot.readStructEnd()
|
||||||
|
|
||||||
|
def write(self, oprot):
|
||||||
|
if oprot._fast_encode is not None and self.thrift_spec is not None:
|
||||||
|
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
|
||||||
|
return
|
||||||
|
oprot.writeStructBegin('ping_args')
|
||||||
|
oprot.writeFieldStop()
|
||||||
|
oprot.writeStructEnd()
|
||||||
|
|
||||||
|
def validate(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
L = ['%s=%r' % (key, value)
|
||||||
|
for key, value in self.__dict__.items()]
|
||||||
|
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
|
||||||
|
|
||||||
|
def __ne__(self, other):
|
||||||
|
return not (self == other)
|
||||||
|
all_structs.append(ping_args)
|
||||||
|
ping_args.thrift_spec = (
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ping_result(object):
|
||||||
|
"""
|
||||||
|
Attributes:
|
||||||
|
- success
|
||||||
|
- serviceExp
|
||||||
|
- userExp
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self, success=None, serviceExp=None, userExp=None,):
|
||||||
|
self.success = success
|
||||||
|
self.serviceExp = serviceExp
|
||||||
|
self.userExp = userExp
|
||||||
|
|
||||||
|
def read(self, iprot):
|
||||||
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
||||||
|
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
|
||||||
|
return
|
||||||
|
iprot.readStructBegin()
|
||||||
|
while True:
|
||||||
|
(fname, ftype, fid) = iprot.readFieldBegin()
|
||||||
|
if ftype == TType.STOP:
|
||||||
|
break
|
||||||
|
if fid == 0:
|
||||||
|
if ftype == TType.BOOL:
|
||||||
|
self.success = iprot.readBool()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 1:
|
||||||
|
if ftype == TType.STRUCT:
|
||||||
|
self.serviceExp = pangramia.yt.exceptions.ttypes.PBServiceException.read(iprot)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 2:
|
||||||
|
if ftype == TType.STRUCT:
|
||||||
|
self.userExp = pangramia.yt.exceptions.ttypes.PBUserException.read(iprot)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
iprot.readFieldEnd()
|
||||||
|
iprot.readStructEnd()
|
||||||
|
|
||||||
|
def write(self, oprot):
|
||||||
|
if oprot._fast_encode is not None and self.thrift_spec is not None:
|
||||||
|
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
|
||||||
|
return
|
||||||
|
oprot.writeStructBegin('ping_result')
|
||||||
|
if self.success is not None:
|
||||||
|
oprot.writeFieldBegin('success', TType.BOOL, 0)
|
||||||
|
oprot.writeBool(self.success)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.serviceExp is not None:
|
||||||
|
oprot.writeFieldBegin('serviceExp', TType.STRUCT, 1)
|
||||||
|
self.serviceExp.write(oprot)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.userExp is not None:
|
||||||
|
oprot.writeFieldBegin('userExp', TType.STRUCT, 2)
|
||||||
|
self.userExp.write(oprot)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
oprot.writeFieldStop()
|
||||||
|
oprot.writeStructEnd()
|
||||||
|
|
||||||
|
def validate(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
L = ['%s=%r' % (key, value)
|
||||||
|
for key, value in self.__dict__.items()]
|
||||||
|
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
|
||||||
|
|
||||||
|
def __ne__(self, other):
|
||||||
|
return not (self == other)
|
||||||
|
all_structs.append(ping_result)
|
||||||
|
ping_result.thrift_spec = (
|
||||||
|
(0, TType.BOOL, 'success', None, None, ), # 0
|
||||||
|
(1, TType.STRUCT, 'serviceExp', [pangramia.yt.exceptions.ttypes.PBServiceException, None], None, ), # 1
|
||||||
|
(2, TType.STRUCT, 'userExp', [pangramia.yt.exceptions.ttypes.PBUserException, None], None, ), # 2
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class reportError_args(object):
|
||||||
|
"""
|
||||||
|
Attributes:
|
||||||
|
- message
|
||||||
|
- details
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self, message=None, details=None,):
|
||||||
|
self.message = message
|
||||||
|
self.details = details
|
||||||
|
|
||||||
|
def read(self, iprot):
|
||||||
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
||||||
|
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
|
||||||
|
return
|
||||||
|
iprot.readStructBegin()
|
||||||
|
while True:
|
||||||
|
(fname, ftype, fid) = iprot.readFieldBegin()
|
||||||
|
if ftype == TType.STOP:
|
||||||
|
break
|
||||||
|
if fid == 1:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.message = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 2:
|
||||||
|
if ftype == TType.MAP:
|
||||||
|
self.details = {}
|
||||||
|
(_ktype1, _vtype2, _size0) = iprot.readMapBegin()
|
||||||
|
for _i4 in range(_size0):
|
||||||
|
_key5 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
_val6 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
self.details[_key5] = _val6
|
||||||
|
iprot.readMapEnd()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
iprot.readFieldEnd()
|
||||||
|
iprot.readStructEnd()
|
||||||
|
|
||||||
|
def write(self, oprot):
|
||||||
|
if oprot._fast_encode is not None and self.thrift_spec is not None:
|
||||||
|
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
|
||||||
|
return
|
||||||
|
oprot.writeStructBegin('reportError_args')
|
||||||
|
if self.message is not None:
|
||||||
|
oprot.writeFieldBegin('message', TType.STRING, 1)
|
||||||
|
oprot.writeString(self.message.encode('utf-8') if sys.version_info[0] == 2 else self.message)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.details is not None:
|
||||||
|
oprot.writeFieldBegin('details', TType.MAP, 2)
|
||||||
|
oprot.writeMapBegin(TType.STRING, TType.STRING, len(self.details))
|
||||||
|
for kiter7, viter8 in self.details.items():
|
||||||
|
oprot.writeString(kiter7.encode('utf-8') if sys.version_info[0] == 2 else kiter7)
|
||||||
|
oprot.writeString(viter8.encode('utf-8') if sys.version_info[0] == 2 else viter8)
|
||||||
|
oprot.writeMapEnd()
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
oprot.writeFieldStop()
|
||||||
|
oprot.writeStructEnd()
|
||||||
|
|
||||||
|
def validate(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
L = ['%s=%r' % (key, value)
|
||||||
|
for key, value in self.__dict__.items()]
|
||||||
|
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
|
||||||
|
|
||||||
|
def __ne__(self, other):
|
||||||
|
return not (self == other)
|
||||||
|
all_structs.append(reportError_args)
|
||||||
|
reportError_args.thrift_spec = (
|
||||||
|
None, # 0
|
||||||
|
(1, TType.STRING, 'message', 'UTF8', None, ), # 1
|
||||||
|
(2, TType.MAP, 'details', (TType.STRING, 'UTF8', TType.STRING, 'UTF8', False), None, ), # 2
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class reportError_result(object):
|
||||||
|
"""
|
||||||
|
Attributes:
|
||||||
|
- success
|
||||||
|
- serviceExp
|
||||||
|
- userExp
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self, success=None, serviceExp=None, userExp=None,):
|
||||||
|
self.success = success
|
||||||
|
self.serviceExp = serviceExp
|
||||||
|
self.userExp = userExp
|
||||||
|
|
||||||
|
def read(self, iprot):
|
||||||
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
||||||
|
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
|
||||||
|
return
|
||||||
|
iprot.readStructBegin()
|
||||||
|
while True:
|
||||||
|
(fname, ftype, fid) = iprot.readFieldBegin()
|
||||||
|
if ftype == TType.STOP:
|
||||||
|
break
|
||||||
|
if fid == 0:
|
||||||
|
if ftype == TType.BOOL:
|
||||||
|
self.success = iprot.readBool()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 1:
|
||||||
|
if ftype == TType.STRUCT:
|
||||||
|
self.serviceExp = pangramia.yt.exceptions.ttypes.PBServiceException.read(iprot)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 2:
|
||||||
|
if ftype == TType.STRUCT:
|
||||||
|
self.userExp = pangramia.yt.exceptions.ttypes.PBUserException.read(iprot)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
iprot.readFieldEnd()
|
||||||
|
iprot.readStructEnd()
|
||||||
|
|
||||||
|
def write(self, oprot):
|
||||||
|
if oprot._fast_encode is not None and self.thrift_spec is not None:
|
||||||
|
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
|
||||||
|
return
|
||||||
|
oprot.writeStructBegin('reportError_result')
|
||||||
|
if self.success is not None:
|
||||||
|
oprot.writeFieldBegin('success', TType.BOOL, 0)
|
||||||
|
oprot.writeBool(self.success)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.serviceExp is not None:
|
||||||
|
oprot.writeFieldBegin('serviceExp', TType.STRUCT, 1)
|
||||||
|
self.serviceExp.write(oprot)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.userExp is not None:
|
||||||
|
oprot.writeFieldBegin('userExp', TType.STRUCT, 2)
|
||||||
|
self.userExp.write(oprot)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
oprot.writeFieldStop()
|
||||||
|
oprot.writeStructEnd()
|
||||||
|
|
||||||
|
def validate(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
L = ['%s=%r' % (key, value)
|
||||||
|
for key, value in self.__dict__.items()]
|
||||||
|
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
|
||||||
|
|
||||||
|
def __ne__(self, other):
|
||||||
|
return not (self == other)
|
||||||
|
all_structs.append(reportError_result)
|
||||||
|
reportError_result.thrift_spec = (
|
||||||
|
(0, TType.BOOL, 'success', None, None, ), # 0
|
||||||
|
(1, TType.STRUCT, 'serviceExp', [pangramia.yt.exceptions.ttypes.PBServiceException, None], None, ), # 1
|
||||||
|
(2, TType.STRUCT, 'userExp', [pangramia.yt.exceptions.ttypes.PBUserException, None], None, ), # 2
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class shutdown_args(object):


    def read(self, iprot):
        if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
            iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
            return
        iprot.readStructBegin()
        while True:
            (fname, ftype, fid) = iprot.readFieldBegin()
            if ftype == TType.STOP:
                break
            else:
                iprot.skip(ftype)
            iprot.readFieldEnd()
        iprot.readStructEnd()

    def write(self, oprot):
        if oprot._fast_encode is not None and self.thrift_spec is not None:
            oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
            return
        oprot.writeStructBegin('shutdown_args')
        oprot.writeFieldStop()
        oprot.writeStructEnd()

    def validate(self):
        return

    def __repr__(self):
        L = ['%s=%r' % (key, value)
             for key, value in self.__dict__.items()]
        return '%s(%s)' % (self.__class__.__name__, ', '.join(L))

    def __eq__(self, other):
        return isinstance(other, self.__class__) and self.__dict__ == other.__dict__

    def __ne__(self, other):
        return not (self == other)
all_structs.append(shutdown_args)
shutdown_args.thrift_spec = (
)
fix_spec(all_structs)
del all_structs
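For reference, the generated Processor above only dispatches to a plain Python object that implements the service methods it names (ping, reportError, shutdown). The sketch below is illustrative and not part of the generated code; the class name and return values are assumptions.

# Minimal handler sketch for the generated Processor above (names assumed).
import logging

class ExampleHandler:
    def ping(self):
        # ping_result.success is a bool, so return one.
        return True

    def reportError(self, message, details):
        # `details` arrives as a map<string, string>, per reportError_args.
        logging.error("client reported error: %s %s", message, details)
        return True

    def shutdown(self):
        # Oneway call: process_shutdown never writes a reply.
        logging.info("shutdown requested")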
1
pangramia/base_service/__init__.py
Normal file
@ -0,0 +1 @@
__all__ = ['ttypes', 'constants', 'BaseService']
14
pangramia/base_service/constants.py
Normal file
@ -0,0 +1,14 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#

from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec

import sys
from .ttypes import *
20
pangramia/base_service/ttypes.py
Normal file
@ -0,0 +1,20 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#

from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec

import sys
import pangramia.yt.common.ttypes
import pangramia.yt.exceptions.ttypes

from thrift.transport import TTransport
all_structs = []
fix_spec(all_structs)
del all_structs
0
pangramia/yt/__init__.py
Normal file
1
pangramia/yt/common/__init__.py
Normal file
@ -0,0 +1 @@
__all__ = ['ttypes', 'constants']
14
pangramia/yt/common/constants.py
Normal file
@ -0,0 +1,14 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#

from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec

import sys
from .ttypes import *
1403
pangramia/yt/common/ttypes.py
Normal file
File diff suppressed because it is too large
1
pangramia/yt/exceptions/__init__.py
Normal file
@ -0,0 +1 @@
__all__ = ['ttypes', 'constants']
BIN
pangramia/yt/exceptions/__pycache__/__init__.cpython-39.pyc
Normal file
Binary file not shown.
BIN
pangramia/yt/exceptions/__pycache__/ttypes.cpython-39.pyc
Normal file
Binary file not shown.
14
pangramia/yt/exceptions/constants.py
Normal file
@ -0,0 +1,14 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#

from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec

import sys
from .ttypes import *
254
pangramia/yt/exceptions/ttypes.py
Normal file
@ -0,0 +1,254 @@
|
|||||||
|
#
|
||||||
|
# Autogenerated by Thrift Compiler (0.20.0)
|
||||||
|
#
|
||||||
|
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
||||||
|
#
|
||||||
|
# options string: py
|
||||||
|
#
|
||||||
|
|
||||||
|
from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
|
||||||
|
from thrift.protocol.TProtocol import TProtocolException
|
||||||
|
from thrift.TRecursive import fix_spec
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from thrift.transport import TTransport
|
||||||
|
all_structs = []
|
||||||
|
|
||||||
|
|
||||||
|
class PBServiceException(TException):
|
||||||
|
"""
|
||||||
|
Attributes:
|
||||||
|
- message
|
||||||
|
- errorCode
|
||||||
|
- context
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self, message=None, errorCode=None, context=None,):
|
||||||
|
super(PBServiceException, self).__setattr__('message', message)
|
||||||
|
super(PBServiceException, self).__setattr__('errorCode', errorCode)
|
||||||
|
super(PBServiceException, self).__setattr__('context', context)
|
||||||
|
|
||||||
|
def __setattr__(self, *args):
|
||||||
|
raise TypeError("can't modify immutable instance")
|
||||||
|
|
||||||
|
def __delattr__(self, *args):
|
||||||
|
raise TypeError("can't modify immutable instance")
|
||||||
|
|
||||||
|
def __hash__(self):
|
||||||
|
return hash(self.__class__) ^ hash((self.message, self.errorCode, self.context, ))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def read(cls, iprot):
|
||||||
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and cls.thrift_spec is not None:
|
||||||
|
return iprot._fast_decode(None, iprot, [cls, cls.thrift_spec])
|
||||||
|
iprot.readStructBegin()
|
||||||
|
message = None
|
||||||
|
errorCode = None
|
||||||
|
context = None
|
||||||
|
while True:
|
||||||
|
(fname, ftype, fid) = iprot.readFieldBegin()
|
||||||
|
if ftype == TType.STOP:
|
||||||
|
break
|
||||||
|
if fid == 1:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
message = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 2:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
errorCode = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 3:
|
||||||
|
if ftype == TType.MAP:
|
||||||
|
context = {}
|
||||||
|
(_ktype1, _vtype2, _size0) = iprot.readMapBegin()
|
||||||
|
for _i4 in range(_size0):
|
||||||
|
_key5 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
_val6 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
context[_key5] = _val6
|
||||||
|
iprot.readMapEnd()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
iprot.readFieldEnd()
|
||||||
|
iprot.readStructEnd()
|
||||||
|
return cls(
|
||||||
|
message=message,
|
||||||
|
errorCode=errorCode,
|
||||||
|
context=context,
|
||||||
|
)
|
||||||
|
|
||||||
|
def write(self, oprot):
|
||||||
|
if oprot._fast_encode is not None and self.thrift_spec is not None:
|
||||||
|
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
|
||||||
|
return
|
||||||
|
oprot.writeStructBegin('PBServiceException')
|
||||||
|
if self.message is not None:
|
||||||
|
oprot.writeFieldBegin('message', TType.STRING, 1)
|
||||||
|
oprot.writeString(self.message.encode('utf-8') if sys.version_info[0] == 2 else self.message)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.errorCode is not None:
|
||||||
|
oprot.writeFieldBegin('errorCode', TType.STRING, 2)
|
||||||
|
oprot.writeString(self.errorCode.encode('utf-8') if sys.version_info[0] == 2 else self.errorCode)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.context is not None:
|
||||||
|
oprot.writeFieldBegin('context', TType.MAP, 3)
|
||||||
|
oprot.writeMapBegin(TType.STRING, TType.STRING, len(self.context))
|
||||||
|
for kiter7, viter8 in self.context.items():
|
||||||
|
oprot.writeString(kiter7.encode('utf-8') if sys.version_info[0] == 2 else kiter7)
|
||||||
|
oprot.writeString(viter8.encode('utf-8') if sys.version_info[0] == 2 else viter8)
|
||||||
|
oprot.writeMapEnd()
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
oprot.writeFieldStop()
|
||||||
|
oprot.writeStructEnd()
|
||||||
|
|
||||||
|
def validate(self):
|
||||||
|
if self.message is None:
|
||||||
|
raise TProtocolException(message='Required field message is unset!')
|
||||||
|
return
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return repr(self)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
L = ['%s=%r' % (key, value)
|
||||||
|
for key, value in self.__dict__.items()]
|
||||||
|
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
|
||||||
|
|
||||||
|
def __ne__(self, other):
|
||||||
|
return not (self == other)
|
||||||
|
|
||||||
|
|
||||||
|
class PBUserException(TException):
|
||||||
|
"""
|
||||||
|
Attributes:
|
||||||
|
- message
|
||||||
|
- errorCode
|
||||||
|
- context
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self, message=None, errorCode=None, context=None,):
|
||||||
|
super(PBUserException, self).__setattr__('message', message)
|
||||||
|
super(PBUserException, self).__setattr__('errorCode', errorCode)
|
||||||
|
super(PBUserException, self).__setattr__('context', context)
|
||||||
|
|
||||||
|
def __setattr__(self, *args):
|
||||||
|
raise TypeError("can't modify immutable instance")
|
||||||
|
|
||||||
|
def __delattr__(self, *args):
|
||||||
|
raise TypeError("can't modify immutable instance")
|
||||||
|
|
||||||
|
def __hash__(self):
|
||||||
|
return hash(self.__class__) ^ hash((self.message, self.errorCode, self.context, ))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def read(cls, iprot):
|
||||||
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and cls.thrift_spec is not None:
|
||||||
|
return iprot._fast_decode(None, iprot, [cls, cls.thrift_spec])
|
||||||
|
iprot.readStructBegin()
|
||||||
|
message = None
|
||||||
|
errorCode = None
|
||||||
|
context = None
|
||||||
|
while True:
|
||||||
|
(fname, ftype, fid) = iprot.readFieldBegin()
|
||||||
|
if ftype == TType.STOP:
|
||||||
|
break
|
||||||
|
if fid == 1:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
message = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 2:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
errorCode = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 3:
|
||||||
|
if ftype == TType.MAP:
|
||||||
|
context = {}
|
||||||
|
(_ktype10, _vtype11, _size9) = iprot.readMapBegin()
|
||||||
|
for _i13 in range(_size9):
|
||||||
|
_key14 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
_val15 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
context[_key14] = _val15
|
||||||
|
iprot.readMapEnd()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
iprot.readFieldEnd()
|
||||||
|
iprot.readStructEnd()
|
||||||
|
return cls(
|
||||||
|
message=message,
|
||||||
|
errorCode=errorCode,
|
||||||
|
context=context,
|
||||||
|
)
|
||||||
|
|
||||||
|
def write(self, oprot):
|
||||||
|
if oprot._fast_encode is not None and self.thrift_spec is not None:
|
||||||
|
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
|
||||||
|
return
|
||||||
|
oprot.writeStructBegin('PBUserException')
|
||||||
|
if self.message is not None:
|
||||||
|
oprot.writeFieldBegin('message', TType.STRING, 1)
|
||||||
|
oprot.writeString(self.message.encode('utf-8') if sys.version_info[0] == 2 else self.message)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.errorCode is not None:
|
||||||
|
oprot.writeFieldBegin('errorCode', TType.STRING, 2)
|
||||||
|
oprot.writeString(self.errorCode.encode('utf-8') if sys.version_info[0] == 2 else self.errorCode)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.context is not None:
|
||||||
|
oprot.writeFieldBegin('context', TType.MAP, 3)
|
||||||
|
oprot.writeMapBegin(TType.STRING, TType.STRING, len(self.context))
|
||||||
|
for kiter16, viter17 in self.context.items():
|
||||||
|
oprot.writeString(kiter16.encode('utf-8') if sys.version_info[0] == 2 else kiter16)
|
||||||
|
oprot.writeString(viter17.encode('utf-8') if sys.version_info[0] == 2 else viter17)
|
||||||
|
oprot.writeMapEnd()
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
oprot.writeFieldStop()
|
||||||
|
oprot.writeStructEnd()
|
||||||
|
|
||||||
|
def validate(self):
|
||||||
|
if self.message is None:
|
||||||
|
raise TProtocolException(message='Required field message is unset!')
|
||||||
|
return
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return repr(self)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
L = ['%s=%r' % (key, value)
|
||||||
|
for key, value in self.__dict__.items()]
|
||||||
|
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
|
||||||
|
|
||||||
|
def __ne__(self, other):
|
||||||
|
return not (self == other)
|
||||||
|
all_structs.append(PBServiceException)
|
||||||
|
PBServiceException.thrift_spec = (
|
||||||
|
None, # 0
|
||||||
|
(1, TType.STRING, 'message', 'UTF8', None, ), # 1
|
||||||
|
(2, TType.STRING, 'errorCode', 'UTF8', None, ), # 2
|
||||||
|
(3, TType.MAP, 'context', (TType.STRING, 'UTF8', TType.STRING, 'UTF8', False), None, ), # 3
|
||||||
|
)
|
||||||
|
all_structs.append(PBUserException)
|
||||||
|
PBUserException.thrift_spec = (
|
||||||
|
None, # 0
|
||||||
|
(1, TType.STRING, 'message', 'UTF8', None, ), # 1
|
||||||
|
(2, TType.STRING, 'errorCode', 'UTF8', None, ), # 2
|
||||||
|
(3, TType.MAP, 'context', (TType.STRING, 'UTF8', TType.STRING, 'UTF8', False), None, ), # 3
|
||||||
|
)
|
||||||
|
fix_spec(all_structs)
|
||||||
|
del all_structs
|
||||||
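The two exception structs above are immutable (their __setattr__ raises), so every field must be supplied at construction time. A hedged usage sketch with illustrative field values:

# Illustrative only: field names come from the generated struct above.
from pangramia.yt.exceptions.ttypes import PBServiceException

exc = PBServiceException(
    message="token refresh failed",        # required; validate() enforces it
    errorCode="TOKEN_REFRESH",             # optional string code (value assumed)
    context={"accountId": "acct-001"},     # optional map<string, string>
)
try:
    raise exc
except PBServiceException as caught:
    print(caught.message, caught.errorCode, caught.context)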
215
pangramia/yt/management/YTManagementService-remote
Executable file
@ -0,0 +1,215 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#
|
||||||
|
# Autogenerated by Thrift Compiler (0.20.0)
|
||||||
|
#
|
||||||
|
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
||||||
|
#
|
||||||
|
# options string: py
|
||||||
|
#
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import pprint
|
||||||
|
if sys.version_info[0] > 2:
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
else:
|
||||||
|
from urlparse import urlparse
|
||||||
|
from thrift.transport import TTransport, TSocket, TSSLSocket, THttpClient
|
||||||
|
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
|
||||||
|
|
||||||
|
from pangramia.yt.management import YTManagementService
|
||||||
|
from pangramia.yt.management.ttypes import *
|
||||||
|
|
||||||
|
if len(sys.argv) <= 1 or sys.argv[1] == '--help':
|
||||||
|
print('')
|
||||||
|
print('Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] [-s[sl]] [-novalidate] [-ca_certs certs] [-keyfile keyfile] [-certfile certfile] function [arg1 [arg2...]]')
|
||||||
|
print('')
|
||||||
|
print('Functions:')
|
||||||
|
print(' getProxyStatus(string serverIdentity)')
|
||||||
|
print(' bool banProxy(string proxyUrl, string serverIdentity)')
|
||||||
|
print(' bool unbanProxy(string proxyUrl, string serverIdentity)')
|
||||||
|
print(' bool resetAllProxyStatuses(string serverIdentity)')
|
||||||
|
print(' bool banAllProxies(string serverIdentity)')
|
||||||
|
print(' bool deleteProxyFromRedis(string proxyUrl, string serverIdentity)')
|
||||||
|
print(' i32 deleteAllProxiesFromRedis(string serverIdentity)')
|
||||||
|
print(' getAccountStatus(string accountId, string accountPrefix)')
|
||||||
|
print(' bool banAccount(string accountId, string reason)')
|
||||||
|
print(' bool unbanAccount(string accountId, string reason)')
|
||||||
|
print(' bool deleteAccountFromRedis(string accountId)')
|
||||||
|
print(' i32 deleteAllAccountsFromRedis(string accountPrefix)')
|
||||||
|
print(' bool ping()')
|
||||||
|
print(' bool reportError(string message, details)')
|
||||||
|
print(' void shutdown()')
|
||||||
|
print('')
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
pp = pprint.PrettyPrinter(indent=2)
|
||||||
|
host = 'localhost'
|
||||||
|
port = 9090
|
||||||
|
uri = ''
|
||||||
|
framed = False
|
||||||
|
ssl = False
|
||||||
|
validate = True
|
||||||
|
ca_certs = None
|
||||||
|
keyfile = None
|
||||||
|
certfile = None
|
||||||
|
http = False
|
||||||
|
argi = 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-h':
|
||||||
|
parts = sys.argv[argi + 1].split(':')
|
||||||
|
host = parts[0]
|
||||||
|
if len(parts) > 1:
|
||||||
|
port = int(parts[1])
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-u':
|
||||||
|
url = urlparse(sys.argv[argi + 1])
|
||||||
|
parts = url[1].split(':')
|
||||||
|
host = parts[0]
|
||||||
|
if len(parts) > 1:
|
||||||
|
port = int(parts[1])
|
||||||
|
else:
|
||||||
|
port = 80
|
||||||
|
uri = url[2]
|
||||||
|
if url[4]:
|
||||||
|
uri += '?%s' % url[4]
|
||||||
|
http = True
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed':
|
||||||
|
framed = True
|
||||||
|
argi += 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-s' or sys.argv[argi] == '-ssl':
|
||||||
|
ssl = True
|
||||||
|
argi += 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-novalidate':
|
||||||
|
validate = False
|
||||||
|
argi += 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-ca_certs':
|
||||||
|
ca_certs = sys.argv[argi+1]
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-keyfile':
|
||||||
|
keyfile = sys.argv[argi+1]
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-certfile':
|
||||||
|
certfile = sys.argv[argi+1]
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
cmd = sys.argv[argi]
|
||||||
|
args = sys.argv[argi + 1:]
|
||||||
|
|
||||||
|
if http:
|
||||||
|
transport = THttpClient.THttpClient(host, port, uri)
|
||||||
|
else:
|
||||||
|
if ssl:
|
||||||
|
socket = TSSLSocket.TSSLSocket(host, port, validate=validate, ca_certs=ca_certs, keyfile=keyfile, certfile=certfile)
|
||||||
|
else:
|
||||||
|
socket = TSocket.TSocket(host, port)
|
||||||
|
if framed:
|
||||||
|
transport = TTransport.TFramedTransport(socket)
|
||||||
|
else:
|
||||||
|
transport = TTransport.TBufferedTransport(socket)
|
||||||
|
protocol = TBinaryProtocol(transport)
|
||||||
|
client = YTManagementService.Client(protocol)
|
||||||
|
transport.open()
|
||||||
|
|
||||||
|
if cmd == 'getProxyStatus':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('getProxyStatus requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.getProxyStatus(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'banProxy':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('banProxy requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.banProxy(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'unbanProxy':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('unbanProxy requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.unbanProxy(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'resetAllProxyStatuses':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('resetAllProxyStatuses requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.resetAllProxyStatuses(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'banAllProxies':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('banAllProxies requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.banAllProxies(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'deleteProxyFromRedis':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('deleteProxyFromRedis requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.deleteProxyFromRedis(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'deleteAllProxiesFromRedis':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('deleteAllProxiesFromRedis requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.deleteAllProxiesFromRedis(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'getAccountStatus':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('getAccountStatus requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.getAccountStatus(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'banAccount':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('banAccount requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.banAccount(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'unbanAccount':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('unbanAccount requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.unbanAccount(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'deleteAccountFromRedis':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('deleteAccountFromRedis requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.deleteAccountFromRedis(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'deleteAllAccountsFromRedis':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('deleteAllAccountsFromRedis requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.deleteAllAccountsFromRedis(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'ping':
|
||||||
|
if len(args) != 0:
|
||||||
|
print('ping requires 0 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.ping())
|
||||||
|
|
||||||
|
elif cmd == 'reportError':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('reportError requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.reportError(args[0], eval(args[1]),))
|
||||||
|
|
||||||
|
elif cmd == 'shutdown':
|
||||||
|
if len(args) != 0:
|
||||||
|
print('shutdown requires 0 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.shutdown())
|
||||||
|
|
||||||
|
else:
|
||||||
|
print('Unrecognized method %s' % cmd)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
transport.close()
|
||||||
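The -remote helper above also documents the minimal client wiring: socket, buffered transport, binary protocol, then YTManagementService.Client. A condensed sketch of the same setup (host, port, and the serverIdentity argument are illustrative):

from thrift.transport import TSocket, TTransport
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from pangramia.yt.management import YTManagementService

socket = TSocket.TSocket('localhost', 9090)
transport = TTransport.TBufferedTransport(socket)
client = YTManagementService.Client(TBinaryProtocol(transport))
transport.open()
print(client.ping())                              # -> bool
print(client.getProxyStatus('server-identity'))   # serverIdentity value is illustrative
transport.close()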
2816
pangramia/yt/management/YTManagementService.py
Normal file
File diff suppressed because it is too large
1
pangramia/yt/management/__init__.py
Normal file
@ -0,0 +1 @@
__all__ = ['ttypes', 'constants', 'YTManagementService']
14
pangramia/yt/management/constants.py
Normal file
@ -0,0 +1,14 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#

from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec

import sys
from .ttypes import *
21
pangramia/yt/management/ttypes.py
Normal file
@ -0,0 +1,21 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#

from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec

import sys
import pangramia.yt.common.ttypes
import pangramia.yt.exceptions.ttypes
import pangramia.base_service.ttypes

from thrift.transport import TTransport
all_structs = []
fix_spec(all_structs)
del all_structs
257
pangramia/yt/tokens_ops/YTTokenOpService-remote
Executable file
@ -0,0 +1,257 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#
|
||||||
|
# Autogenerated by Thrift Compiler (0.20.0)
|
||||||
|
#
|
||||||
|
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
||||||
|
#
|
||||||
|
# options string: py
|
||||||
|
#
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import pprint
|
||||||
|
if sys.version_info[0] > 2:
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
else:
|
||||||
|
from urlparse import urlparse
|
||||||
|
from thrift.transport import TTransport, TSocket, TSSLSocket, THttpClient
|
||||||
|
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
|
||||||
|
|
||||||
|
from pangramia.yt.tokens_ops import YTTokenOpService
|
||||||
|
from pangramia.yt.tokens_ops.ttypes import *
|
||||||
|
|
||||||
|
if len(sys.argv) <= 1 or sys.argv[1] == '--help':
|
||||||
|
print('')
|
||||||
|
print('Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] [-s[sl]] [-novalidate] [-ca_certs certs] [-keyfile keyfile] [-certfile certfile] function [arg1 [arg2...]]')
|
||||||
|
print('')
|
||||||
|
print('Functions:')
|
||||||
|
print(' JobTokenData getOrRefreshTokenWithReport(string accountId, string oldUrl, JobState status, string details, string jobId, TokenUpdateMode updateType, string url, string clients, AirflowLogContext airflowLogContext, string requestParamsJson)')
|
||||||
|
print(' JobTokenData getOrRefreshToken(string accountId, TokenUpdateMode updateType, string url, string clients, string machineId, AirflowLogContext airflowLogContext, string requestParamsJson, string assignedProxyUrl)')
|
||||||
|
print(' JobTokenData getLatestToken(string accountId)')
|
||||||
|
print(' JobTokenData refreshToken(string accountId, TokenUpdateMode updateType, string url)')
|
||||||
|
print(' bool reportState(string url, JobState status, string details, string jobId)')
|
||||||
|
print(' JobTokenData getInfoJsonDirect(string url, string clients)')
|
||||||
|
print(' getProxyStatus(string serverIdentity)')
|
||||||
|
print(' bool banProxy(string proxyUrl, string serverIdentity)')
|
||||||
|
print(' bool unbanProxy(string proxyUrl, string serverIdentity)')
|
||||||
|
print(' bool resetAllProxyStatuses(string serverIdentity)')
|
||||||
|
print(' bool banAllProxies(string serverIdentity)')
|
||||||
|
print(' bool deleteProxyFromRedis(string proxyUrl, string serverIdentity)')
|
||||||
|
print(' i32 deleteAllProxiesFromRedis(string serverIdentity)')
|
||||||
|
print(' getAccountStatus(string accountId, string accountPrefix)')
|
||||||
|
print(' bool banAccount(string accountId, string reason)')
|
||||||
|
print(' bool unbanAccount(string accountId, string reason)')
|
||||||
|
print(' bool deleteAccountFromRedis(string accountId)')
|
||||||
|
print(' i32 deleteAllAccountsFromRedis(string accountPrefix)')
|
||||||
|
print(' bool ping()')
|
||||||
|
print(' bool reportError(string message, details)')
|
||||||
|
print(' void shutdown()')
|
||||||
|
print('')
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
pp = pprint.PrettyPrinter(indent=2)
|
||||||
|
host = 'localhost'
|
||||||
|
port = 9090
|
||||||
|
uri = ''
|
||||||
|
framed = False
|
||||||
|
ssl = False
|
||||||
|
validate = True
|
||||||
|
ca_certs = None
|
||||||
|
keyfile = None
|
||||||
|
certfile = None
|
||||||
|
http = False
|
||||||
|
argi = 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-h':
|
||||||
|
parts = sys.argv[argi + 1].split(':')
|
||||||
|
host = parts[0]
|
||||||
|
if len(parts) > 1:
|
||||||
|
port = int(parts[1])
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-u':
|
||||||
|
url = urlparse(sys.argv[argi + 1])
|
||||||
|
parts = url[1].split(':')
|
||||||
|
host = parts[0]
|
||||||
|
if len(parts) > 1:
|
||||||
|
port = int(parts[1])
|
||||||
|
else:
|
||||||
|
port = 80
|
||||||
|
uri = url[2]
|
||||||
|
if url[4]:
|
||||||
|
uri += '?%s' % url[4]
|
||||||
|
http = True
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed':
|
||||||
|
framed = True
|
||||||
|
argi += 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-s' or sys.argv[argi] == '-ssl':
|
||||||
|
ssl = True
|
||||||
|
argi += 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-novalidate':
|
||||||
|
validate = False
|
||||||
|
argi += 1
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-ca_certs':
|
||||||
|
ca_certs = sys.argv[argi+1]
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-keyfile':
|
||||||
|
keyfile = sys.argv[argi+1]
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
if sys.argv[argi] == '-certfile':
|
||||||
|
certfile = sys.argv[argi+1]
|
||||||
|
argi += 2
|
||||||
|
|
||||||
|
cmd = sys.argv[argi]
|
||||||
|
args = sys.argv[argi + 1:]
|
||||||
|
|
||||||
|
if http:
|
||||||
|
transport = THttpClient.THttpClient(host, port, uri)
|
||||||
|
else:
|
||||||
|
if ssl:
|
||||||
|
socket = TSSLSocket.TSSLSocket(host, port, validate=validate, ca_certs=ca_certs, keyfile=keyfile, certfile=certfile)
|
||||||
|
else:
|
||||||
|
socket = TSocket.TSocket(host, port)
|
||||||
|
if framed:
|
||||||
|
transport = TTransport.TFramedTransport(socket)
|
||||||
|
else:
|
||||||
|
transport = TTransport.TBufferedTransport(socket)
|
||||||
|
protocol = TBinaryProtocol(transport)
|
||||||
|
client = YTTokenOpService.Client(protocol)
|
||||||
|
transport.open()
|
||||||
|
|
||||||
|
if cmd == 'getOrRefreshTokenWithReport':
|
||||||
|
if len(args) != 10:
|
||||||
|
print('getOrRefreshTokenWithReport requires 10 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.getOrRefreshTokenWithReport(args[0], args[1], eval(args[2]), args[3], args[4], eval(args[5]), args[6], args[7], eval(args[8]), args[9],))
|
||||||
|
|
||||||
|
elif cmd == 'getOrRefreshToken':
|
||||||
|
if len(args) != 8:
|
||||||
|
print('getOrRefreshToken requires 8 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.getOrRefreshToken(args[0], eval(args[1]), args[2], args[3], args[4], eval(args[5]), args[6], args[7],))
|
||||||
|
|
||||||
|
elif cmd == 'getLatestToken':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('getLatestToken requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.getLatestToken(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'refreshToken':
|
||||||
|
if len(args) != 3:
|
||||||
|
print('refreshToken requires 3 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.refreshToken(args[0], eval(args[1]), args[2],))
|
||||||
|
|
||||||
|
elif cmd == 'reportState':
|
||||||
|
if len(args) != 4:
|
||||||
|
print('reportState requires 4 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.reportState(args[0], eval(args[1]), args[2], args[3],))
|
||||||
|
|
||||||
|
elif cmd == 'getInfoJsonDirect':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('getInfoJsonDirect requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.getInfoJsonDirect(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'getProxyStatus':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('getProxyStatus requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.getProxyStatus(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'banProxy':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('banProxy requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.banProxy(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'unbanProxy':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('unbanProxy requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.unbanProxy(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'resetAllProxyStatuses':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('resetAllProxyStatuses requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.resetAllProxyStatuses(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'banAllProxies':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('banAllProxies requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.banAllProxies(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'deleteProxyFromRedis':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('deleteProxyFromRedis requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.deleteProxyFromRedis(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'deleteAllProxiesFromRedis':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('deleteAllProxiesFromRedis requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.deleteAllProxiesFromRedis(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'getAccountStatus':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('getAccountStatus requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.getAccountStatus(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'banAccount':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('banAccount requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.banAccount(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'unbanAccount':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('unbanAccount requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.unbanAccount(args[0], args[1],))
|
||||||
|
|
||||||
|
elif cmd == 'deleteAccountFromRedis':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('deleteAccountFromRedis requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.deleteAccountFromRedis(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'deleteAllAccountsFromRedis':
|
||||||
|
if len(args) != 1:
|
||||||
|
print('deleteAllAccountsFromRedis requires 1 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.deleteAllAccountsFromRedis(args[0],))
|
||||||
|
|
||||||
|
elif cmd == 'ping':
|
||||||
|
if len(args) != 0:
|
||||||
|
print('ping requires 0 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.ping())
|
||||||
|
|
||||||
|
elif cmd == 'reportError':
|
||||||
|
if len(args) != 2:
|
||||||
|
print('reportError requires 2 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.reportError(args[0], eval(args[1]),))
|
||||||
|
|
||||||
|
elif cmd == 'shutdown':
|
||||||
|
if len(args) != 0:
|
||||||
|
print('shutdown requires 0 args')
|
||||||
|
sys.exit(1)
|
||||||
|
pp.pprint(client.shutdown())
|
||||||
|
|
||||||
|
else:
|
||||||
|
print('Unrecognized method %s' % cmd)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
transport.close()
|
||||||
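The same wiring works for YTTokenOpService; the framed transport shown here corresponds to the -f/-framed option handled above, and the account id is illustrative:

from thrift.transport import TSocket, TTransport
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from pangramia.yt.tokens_ops import YTTokenOpService

transport = TTransport.TFramedTransport(TSocket.TSocket('localhost', 9090))
client = YTTokenOpService.Client(TBinaryProtocol(transport))
transport.open()
print(client.getLatestToken('acct-001'))   # returns a JobTokenData struct
transport.close()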
1719
pangramia/yt/tokens_ops/YTTokenOpService.py
Normal file
File diff suppressed because it is too large
1
pangramia/yt/tokens_ops/__init__.py
Normal file
@ -0,0 +1 @@
__all__ = ['ttypes', 'constants', 'YTTokenOpService']
14
pangramia/yt/tokens_ops/constants.py
Normal file
@ -0,0 +1,14 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#

from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec

import sys
from .ttypes import *
21
pangramia/yt/tokens_ops/ttypes.py
Normal file
@ -0,0 +1,21 @@
#
# Autogenerated by Thrift Compiler (0.20.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#

from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
from thrift.protocol.TProtocol import TProtocolException
from thrift.TRecursive import fix_spec

import sys
import pangramia.yt.common.ttypes
import pangramia.yt.exceptions.ttypes
import pangramia.yt.management.ttypes

from thrift.transport import TTransport
all_structs = []
fix_spec(all_structs)
del all_structs
0
playbooks/playbook-bgutils-start.yml
Normal file
0
playbooks/playbook-bgutils-stop.yml
Normal file
155
policies/1_fetch_only_policies.yaml
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
# This file contains policies for testing only the info.json generation step.
|
||||||
|
# No downloads are performed.
|
||||||
|
|
||||||
|
---
|
||||||
|
# Policy: Basic fetch-only test for a TV client.
|
||||||
|
# This policy uses a single, static profile and has a rate limit to avoid being
|
||||||
|
# too aggressive. It saves the generated info.json files to a directory.
|
||||||
|
name: tv_downgraded_single_profile
|
||||||
|
|
||||||
|
settings:
|
||||||
|
mode: fetch_only
|
||||||
|
urls_file: "urls.txt"
|
||||||
|
info_json_script: "bin/ytops-client get-info"
|
||||||
|
save_info_json_dir: "fetched_info_jsons/tv_downgraded"
|
||||||
|
# Use a single, static profile for all requests.
|
||||||
|
profile_prefix: "tv_downgraded_user"
|
||||||
|
profile_mode: per_worker # With 1 worker, this is effectively a single profile.
|
||||||
|
|
||||||
|
execution_control:
|
||||||
|
run_until: { cycles: 1 }
|
||||||
|
workers: 1
|
||||||
|
sleep_between_tasks: { min_seconds: 5, max_seconds: 10 }
|
||||||
|
|
||||||
|
info_json_generation_policy:
|
||||||
|
client: tv_downgraded
|
||||||
|
# Safety rate limit: 450 requests per hour (7.5 req/min)
|
||||||
|
rate_limits:
|
||||||
|
per_ip: { max_requests: 450, per_minutes: 60 }
|
||||||
|
|
||||||
|
---
|
||||||
|
# Policy: Fetch-only test for an Android client using a cookie file.
|
||||||
|
# This demonstrates how to pass a cookie file for authenticated requests.
|
||||||
|
# It uses a single profile and stops if it encounters too many errors.
|
||||||
|
name: android_sdkless_with_cookies
|
||||||
|
|
||||||
|
settings:
|
||||||
|
mode: fetch_only
|
||||||
|
urls_file: "urls.txt"
|
||||||
|
info_json_script: "bin/ytops-client get-info"
|
||||||
|
save_info_json_dir: "fetched_info_jsons/android_sdkless"
|
||||||
|
profile_prefix: "android_user_with_cookies"
|
||||||
|
profile_mode: per_worker
|
||||||
|
|
||||||
|
execution_control:
|
||||||
|
run_until: { cycles: 1 } # Run through the URL list once.
|
||||||
|
workers: 1
|
||||||
|
sleep_between_tasks: { min_seconds: 2, max_seconds: 4 }
|
||||||
|
|
||||||
|
info_json_generation_policy:
|
||||||
|
client: android_sdkless
|
||||||
|
# Pass per-request parameters. This is how you specify a cookie file.
|
||||||
|
request_params:
|
||||||
|
cookies_file_path: "/path/to/your/android_cookies.txt"
|
||||||
|
|
||||||
|
stop_conditions:
|
||||||
|
# Stop if we get more than 5 errors in any 10-minute window.
|
||||||
|
on_error_rate: { max_errors: 5, per_minutes: 10 }
|
||||||
|
|
||||||
|
---
|
||||||
|
# Policy: TV Fetch with Profile Cooldown (Pipeline Stage 1)
|
||||||
|
# Fetches info.json files using the 'tv' client. Each profile is limited
|
||||||
|
# to a certain number of requests before it is put into a cooldown period.
|
||||||
|
# The output of this policy is intended to be used by a 'download_only' policy.
|
||||||
|
name: tv_fetch_with_cooldown
|
||||||
|
|
||||||
|
settings:
|
||||||
|
mode: fetch_only
|
||||||
|
urls_file: "urls.txt"
|
||||||
|
info_json_script: "bin/ytops-client get-info"
|
||||||
|
# Save the generated files to this directory for the download task to find.
|
||||||
|
save_info_json_dir: "live_jsons_tv"
|
||||||
|
profile_management:
|
||||||
|
prefix: "tv_user"
|
||||||
|
initial_pool_size: 10
|
||||||
|
auto_expand_pool: true
|
||||||
|
max_requests_per_profile: 60
|
||||||
|
sleep_minutes_on_exhaustion: 60
|
||||||
|
|
||||||
|
execution_control:
|
||||||
|
run_until: { cycles: 1 }
|
||||||
|
workers: 1
|
||||||
|
sleep_between_tasks: { min_seconds: 2, max_seconds: 5 }
|
||||||
|
|
||||||
|
info_json_generation_policy:
|
||||||
|
client: "tv"
|
||||||
|
request_params:
|
||||||
|
context_reuse_policy: { enabled: true, max_age_seconds: 86400 }
|
||||||
|
|
||||||
|
---
|
||||||
|
# Policy: MWeb with client rotation and rate limits.
|
||||||
|
# This demonstrates a more complex scenario with multiple clients and strict
|
||||||
|
# rate limiting, useful for simulating sophisticated user behavior.
|
||||||
|
name: mweb_client_rotation_and_rate_limits
|
||||||
|
|
||||||
|
settings:
|
||||||
|
mode: fetch_only
|
||||||
|
urls_file: "urls.txt"
|
||||||
|
info_json_script: "bin/ytops-client get-info"
|
||||||
|
# Use the dynamic profile pool management system.
|
||||||
|
profile_management:
|
||||||
|
prefix: "mweb_user"
|
||||||
|
initial_pool_size: 10
|
||||||
|
max_requests_per_profile: 100
|
||||||
|
sleep_minutes_on_exhaustion: 15
|
||||||
|
|
||||||
|
execution_control:
|
||||||
|
run_until: { cycles: 1 }
|
||||||
|
workers: 10
|
||||||
|
sleep_between_tasks: { min_seconds: 2, max_seconds: 5 }
|
||||||
|
|
||||||
|
info_json_generation_policy:
|
||||||
|
# Enforce strict rate limits for both the entire IP and each individual profile.
|
||||||
|
rate_limits:
|
||||||
|
per_ip: { max_requests: 120, per_minutes: 10 }
|
||||||
|
per_profile: { max_requests: 10, per_minutes: 10 }
|
||||||
|
|
||||||
|
# Rotate between a primary client (mweb) and a refresh client (web_camoufox)
|
||||||
|
# to keep sessions fresh.
|
||||||
|
client_rotation_policy:
|
||||||
|
major_client: "mweb"
|
||||||
|
major_client_params:
|
||||||
|
context_reuse_policy: { enabled: true, max_age_seconds: 1800 }
|
||||||
|
refresh_client: "web_camoufox"
|
||||||
|
refresh_every: { requests: 20, minutes: 10 }
|
||||||
|
|
||||||
|
---
|
||||||
|
# Policy: TV Simply, fetch-only test with per-worker profile rotation.
|
||||||
|
# Fetches info.json using tv_simply with multiple workers. Each worker gets a
|
||||||
|
# unique profile that is retired and replaced with a new generation after a
|
||||||
|
# set number of requests.
|
||||||
|
name: tv_simply_fetch_rotation
|
||||||
|
|
||||||
|
settings:
|
||||||
|
mode: fetch_only
|
||||||
|
urls_file: "urls.txt"
|
||||||
|
info_json_script: "bin/ytops-client get-info"
|
||||||
|
save_info_json_dir: "fetched_info_jsons/tv_simply_rotation"
|
||||||
|
# Use the modern profile management system.
|
||||||
|
profile_mode: per_worker_with_rotation
|
||||||
|
profile_management:
|
||||||
|
prefix: "tv_simply_user"
|
||||||
|
# Rotate to a new profile generation after 250 requests.
|
||||||
|
max_requests_per_profile: 250
|
||||||
|
|
||||||
|
execution_control:
|
||||||
|
run_until: { cycles: 1 } # Run through the URL list once.
|
||||||
|
workers: 8 # Run with 8 parallel workers.
|
||||||
|
sleep_between_tasks: { min_seconds: 2, max_seconds: 5 }
|
||||||
|
# Optional: Override the assumed time for a fetch task to improve rate estimation.
|
||||||
|
# The default is 3 seconds for fetch_only mode.
|
||||||
|
# assumptions:
|
||||||
|
# fetch_task_duration: 2.5
|
||||||
|
|
||||||
|
info_json_generation_policy:
|
||||||
|
client: tv_simply
|
||||||
58
policies/2_download_only_policies.yaml
Normal file
58
policies/2_download_only_policies.yaml
Normal file
@ -0,0 +1,58 @@
# This file contains policies for testing only the download step from
# existing info.json files. No new info.json files are generated.

---
# Policy: Basic profile-aware download test.
# This policy reads info.json files from a directory, groups them by a profile
# name extracted from the filename, and downloads them using multiple workers.
# Each worker handles one or more profiles sequentially.
name: basic_profile_aware_download

settings:
  mode: download_only
  info_json_dir: "prefetched_info_jsons"
  # Regex to extract profile names from filenames like '...-VIDEOID-my_profile_name.json'.
  profile_extraction_regex: ".*-[a-zA-Z0-9_-]{11}-(.+)\\.json"

execution_control:
  run_until: { cycles: 1 }
  # 'auto' sets workers to the number of profiles, capped by auto_workers_max.
  workers: auto
  auto_workers_max: 8
  # This sleep applies between each file downloaded by a single profile.
  sleep_between_tasks: { min_seconds: 1, max_seconds: 2 }

download_policy:
  formats: "18,140,299/298/137/136/135/134/133"
  downloader: "aria2c"
  downloader_args: "aria2c:-x 4 -k 1M"
  extra_args: "--cleanup --output-dir /tmp/downloads"
  # This sleep applies between formats of a single video.
  sleep_between_formats: { min_seconds: 0, max_seconds: 0 }
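The `profile_extraction_regex` in `basic_profile_aware_download` groups info.json files by the profile encoded in their filenames. A small sketch of that grouping step; the directory name is just the example value from the policy:

```python
import re
from collections import defaultdict
from pathlib import Path

PROFILE_RE = re.compile(r".*-[a-zA-Z0-9_-]{11}-(.+)\.json")

def group_by_profile(info_json_dir):
    groups = defaultdict(list)
    for path in Path(info_json_dir).glob("*.json"):
        m = PROFILE_RE.match(path.name)
        if m:
            groups[m.group(1)].append(path)  # profile name -> its info.json files
    return groups

# Each worker can then be handed one or more profile groups to download sequentially.
profiles = group_by_profile("prefetched_info_jsons")
```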
---
# Policy: Continuous download from a folder (Pipeline Stage 2).
# This policy watches a directory for new info.json files and processes them
# as they appear. It is designed to work as the second stage of a pipeline,
# consuming files generated by a 'fetch_only' policy like 'tv_fetch_with_cooldown'.
name: continuous_watch_download

settings:
  mode: download_only
  info_json_dir: "live_info_jsons"
  directory_scan_mode: continuous
  mark_processed_files: true       # Rename files to *.processed to avoid re-downloading.
  max_files_per_cycle: 50          # Process up to 50 new files each time it checks.
  sleep_if_no_new_files_seconds: 15

execution_control:
  # Note: For 'continuous' mode, a time-based run_until (e.g., {minutes: 120})
  # is more typical. {cycles: 1} will cause it to scan the directory once
  # for new files, process them, and then exit.
  run_until: { cycles: 1 }
  workers: 4                       # Use a few workers to process files in parallel.
  sleep_between_tasks: { min_seconds: 0, max_seconds: 0 }

download_policy:
  formats: "18,140"
  extra_args: "--cleanup --output-dir /tmp/downloads"
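`continuous_watch_download` keeps scanning `info_json_dir` and renames handled files to `*.processed`. A simplified loop showing the intended behaviour; the `download` callable is a placeholder for the real download step, not an API of the tool:

```python
import time
from pathlib import Path

def watch(info_json_dir, download, max_files_per_cycle=50, idle_sleep=15):
    """Poll a directory for new info.json files; `download` is any callable."""
    while True:
        # Handled files are renamed to *.json.processed, so *.json only matches new work.
        new_files = sorted(Path(info_json_dir).glob("*.json"))[:max_files_per_cycle]
        if not new_files:
            time.sleep(idle_sleep)   # sleep_if_no_new_files_seconds: 15
            continue
        for path in new_files:
            download(path)           # hand off to the download workers
            path.rename(path.with_name(path.name + ".processed"))  # mark_processed_files: true
```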
policies/3_full_stack_policies.yaml (new file, +158)
@@ -0,0 +1,158 @@
# This file contains policies for full-stack tests, which include both
# info.json generation and the subsequent download step.

---
# Policy: TV client with profile rotation.
# This test uses multiple parallel workers. Each worker gets its own profile
# that is automatically rotated (e.g., from tv_user_0_0 to tv_user_0_1) after
# a certain number of requests to simulate user churn.
name: tv_simply_profile_rotation

settings:
  mode: full_stack
  urls_file: "urls.txt"
  info_json_script: "bin/ytops-client get-info"
  save_info_json_dir: "fetched_info_jsons/tv_simply_rotation"
  # Use the modern profile management system.
  profile_mode: per_worker_with_rotation
  profile_management:
    prefix: "tv_simply"
    # Rotate to a new profile generation after 250 requests.
    max_requests_per_profile: 250

execution_control:
  run_until: { cycles: 1 }
  workers: 8   # Run with 8 parallel workers.
  sleep_between_tasks: { min_seconds: 2, max_seconds: 5 }
  # Optional: Override assumptions to improve rate estimation.
  # assumptions:
  #   fetch_task_duration: 10     # Est. seconds to get info.json
  #   download_task_duration: 20  # Est. seconds to download all formats for one video

info_json_generation_policy:
  client: tv_simply

download_policy:
  formats: "18,140"
  extra_args: "--cleanup --output-dir downloads/tv_simply_rotation"
  proxy: "socks5://127.0.0.1:1087"
  downloader: "aria2c"
  downloader_args: "aria2c:-x 8 -k 1M"
  sleep_between_formats: { min_seconds: 2, max_seconds: 2 }

stop_conditions:
  on_cumulative_403: { max_errors: 5, per_minutes: 2 }
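The `on_cumulative_403` stop condition aborts the run once too many HTTP 403 responses accumulate within a short window. A minimal sketch of that guard; the class name is illustrative, not the orchestrator's actual implementation:

```python
import time
from collections import deque

class CumulativeErrorGuard:
    """Signal a stop when more than max_errors 403s occur within per_minutes."""

    def __init__(self, max_errors=5, per_minutes=2):
        self.max_errors = max_errors
        self.window = per_minutes * 60
        self.errors = deque()

    def record_403(self):
        now = time.time()
        self.errors.append(now)
        while self.errors and now - self.errors[0] > self.window:
            self.errors.popleft()
        return len(self.errors) > self.max_errors   # True -> stop the test run
```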
---
# Policy: TV Simply, full-stack test with per-worker profile rotation.
# Generates info.json using tv_simply and immediately attempts to download.
# This combines the fetch and download steps into a single workflow.
name: tv_simply_full_stack_rotation

settings:
  mode: full_stack
  urls_file: "urls.txt"
  info_json_script: "bin/ytops-client get-info"
  profile_mode: per_worker_with_rotation
  profile_management:
    prefix: "tv_simply_worker"
    max_requests_per_profile: 240

execution_control:
  workers: 10
  run_until: { cycles: 1 }
  sleep_between_tasks: { min_seconds: 5, max_seconds: 5 }

info_json_generation_policy:
  client: "tv_simply"
  request_params:
    context_reuse_policy: { enabled: false }

download_policy:
  formats: "18,140"
  extra_args: "--output-dir downloads/tv_simply_downloads"

---
# Policy: MWeb client with multiple profiles, each with its own cookie file.
# This demonstrates how to run an authenticated test with a pool of accounts.
# The orchestrator will cycle through the cookie files, assigning one to each profile.
name: mweb_multi_profile_with_cookies

settings:
  mode: full_stack
  urls_file: "urls.txt"
  info_json_script: "bin/ytops-client get-info"
  # Use the dynamic profile pool management system.
  profile_management:
    prefix: "mweb_user"
    initial_pool_size: 3             # Start with 3 profiles.
    auto_expand_pool: true           # Create new profiles if the initial 3 are all rate-limited.
    max_requests_per_profile: 100    # Let each profile make 100 requests...
    sleep_minutes_on_exhaustion: 15  # ...then put it to sleep for 15 minutes.
    # Assign a different cookie file to each profile in the pool.
    # The tool will cycle through this list.
    cookie_files:
      - "/path/to/your/mweb_cookies_0.txt"
      - "/path/to/your/mweb_cookies_1.txt"
      - "/path/to/your/mweb_cookies_2.txt"

execution_control:
  run_until: { cycles: 1 }
  workers: 3   # Match workers to the number of initial profiles.
  sleep_between_tasks: { min_seconds: 1, max_seconds: 3 }

info_json_generation_policy:
  client: mweb
  # This client uses youtubei.js, which generates PO tokens.

download_policy:
  formats: "18,140"
  extra_args: "--cleanup --output-dir /tmp/downloads"
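`mweb_multi_profile_with_cookies` pairs each profile in the pool with one entry from `cookie_files`, cycling when there are more profiles than cookies. A sketch of that assignment; the `mweb_user_<n>` naming is an assumption based on the `prefix` above:

```python
from itertools import cycle

cookie_files = [
    "/path/to/your/mweb_cookies_0.txt",
    "/path/to/your/mweb_cookies_1.txt",
    "/path/to/your/mweb_cookies_2.txt",
]

# initial_pool_size: 3 -> mweb_user_0 .. mweb_user_2, each paired with one cookie file.
assignments = {
    f"mweb_user_{i}": cookie_file
    for i, cookie_file in zip(range(3), cycle(cookie_files))
}
```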
---
# Policy: TV client with profile rotation and aria2c RPC download.
# This test uses multiple parallel workers. Each worker gets its own profile
# that is automatically rotated. Downloads are submitted to an aria2c daemon
# via its RPC interface.
name: tv_simply_profile_rotation_aria2c_rpc

settings:
  mode: full_stack
  urls_file: "urls.txt"
  info_json_script: "bin/ytops-client get-info"
  save_info_json_dir: "fetched_info_jsons/tv_simply_rotation_aria"
  profile_mode: per_worker_with_rotation
  profile_management:
    prefix: "tv_simply_aria"
    max_requests_per_profile: 250

execution_control:
  run_until: { cycles: 1 }
  workers: 8
  sleep_between_tasks: { min_seconds: 2, max_seconds: 5 }

info_json_generation_policy:
  client: tv_simply

download_policy:
  formats: "18,140"
  # Use the aria2c RPC downloader
  downloader: "aria2c_rpc"
  # RPC server connection details
  aria_host: "localhost"
  aria_port: 6800
  # aria_secret: "your_secret"   # Uncomment and set if needed
  # Set to true to wait for each download and get a success/fail result.
  # This is the default and recommended for monitoring success/failure.
  # Set to false for maximum submission throughput ("fire-and-forget"),
  # but you will lose per-download status reporting.
  aria_wait: true
  # The output directory is on the aria2c host machine
  output_dir: "/downloads/tv_simply_rotation_aria"
  # Pass custom arguments to aria2c in yt-dlp format for better performance.
  # -x: max connections per server, -k: min split size.
  downloader_args: "aria2c:[-x 8, -k 1M]"
  sleep_between_formats: { min_seconds: 1, max_seconds: 2 }

stop_conditions:
  on_cumulative_403: { max_errors: 5, per_minutes: 2 }
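With `downloader: "aria2c_rpc"`, each format URL is handed to a running aria2c daemon over its JSON-RPC interface rather than downloaded in-process. The sketch below uses aria2's standard `aria2.addUri` and `aria2.tellStatus` methods; the `token:` prefix is only needed when `aria_secret` is set, and the polling loop corresponds to `aria_wait: true`:

```python
import time
import requests

def submit_to_aria2(url, out_name, host="localhost", port=6800, secret=None,
                    output_dir="/downloads/tv_simply_rotation_aria", wait=True):
    endpoint = f"http://{host}:{port}/jsonrpc"
    params = [[url], {"dir": output_dir, "out": out_name}]
    if secret:
        params.insert(0, f"token:{secret}")
    gid = requests.post(endpoint, json={
        "jsonrpc": "2.0", "id": "ytops", "method": "aria2.addUri", "params": params,
    }).json()["result"]
    if not wait:
        return gid                      # fire-and-forget: no per-download status
    while True:                         # aria_wait: true -> poll until the transfer finishes
        status_params = ([f"token:{secret}"] if secret else []) + [gid, ["status"]]
        status = requests.post(endpoint, json={
            "jsonrpc": "2.0", "id": "ytops", "method": "aria2.tellStatus", "params": status_params,
        }).json()["result"]["status"]
        if status in ("complete", "error", "removed"):
            return status
        time.sleep(2)
```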
policies/README.md (new file, +28)
@@ -0,0 +1,28 @@
# Stress Test Policies

This directory contains example policy files for the `stress_enhanced.py` orchestrator. Each file defines a specific testing strategy, organized by task type.

## Authentication & Info.json Policies (`fetch_only` mode)

These policies focus on testing the info.json generation service.

- `info_json_rate_limit.yaml`: Tests the service with a focus on rate limits and client rotation.
- `auth_scenarios.yaml`: Contains specific scenarios for fetching info.json files, such as using a low-level command template for full control.

## Download Policies (`download_only` mode)

These policies focus on testing the download infrastructure using pre-existing info.json files.

- `download_throughput.yaml`: Tests download/CDN infrastructure, focusing on throughput and error handling.
- `download_scenarios.yaml`: Contains specific scenarios for downloading, such as testing random formats from a directory of info.json files.

## Full-Stack Policies (`full_stack` mode)

These policies test the entire workflow from info.json generation through to downloading.

- `regular_testing_scenarios.yaml`: Contains a collection of common, end-to-end testing scenarios, including:
  - `mweb_per_request_profile`: A high-volume test that uses a new profile for every request.
  - `mixed_client_profile_pool`: A complex test that alternates clients and reuses profiles from a pool.
- `tv_pipeline_scenarios.yaml`: A two-stage pipeline for fetching with the TV client and then continuously downloading.

These files can be used as templates for creating custom test scenarios.
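Each policy file holds several YAML documents separated by `---`, and every document carries a `name` key. A small sketch of selecting one policy out of a file; the file and policy names are just the examples used in this directory:

```python
import yaml

def load_policy(path, policy_name):
    with open(path) as fh:
        for doc in yaml.safe_load_all(fh):
            if doc and doc.get("name") == policy_name:
                return doc
    raise KeyError(f"policy {policy_name!r} not found in {path}")

policy = load_policy("policies/3_full_stack_policies.yaml", "tv_simply_profile_rotation")
print(policy["execution_control"]["workers"])
```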
setup.py (1 changed line)
@@ -32,7 +32,6 @@ setup(
         'psutil',
         'flask',
         'waitress',
-        'yt_dlp>=2025.3.27',
         'yt-dlp-get-pot==0.3.0',
         'requests>=2.31.0',
         'ffprobe3',
thrift_model/.gitignore (vendored, +1)
@@ -1 +1,2 @@
+__py_cache__
 target/
thrift_model/data/common.thrift (new file, +145)
@@ -0,0 +1,145 @@
namespace py pangramia.yt.common
namespace java com.pangramia.yt.common

typedef string JobID
typedef string Timestamp

/**
 * Standard error codes for service exceptions.
 */
enum ErrorCode {
  UNKNOWN = 0,
  NOT_IMPLEMENTED = 1,
  INTERNAL_ERROR = 2,
  INVALID_REQUEST = 3,
  PROXY_UNAVAILABLE = 4,
  ACCOUNT_UNAVAILABLE = 5,
  BOT_DETECTED = 6,
  BOT_DETECTION_SIGN_IN_REQUIRED = 7,
  SABR_STREAMING_DETECTED = 8
}

enum JobState {
  SUCCESS,
  FAIL,
  BOT_FORBIDDEN_ON_URL_ACCESS,
  BOT_FORBIDDEN_ON_FILE_DOWNLOAD,
  BOT_CAPTCHA,
  BOT_AUTH_RELOGIN_REQUIRED,
  BOT_AUTH_SMS_REQUIRED,
  BOT_AUTH_DEVICE_QR_REQUIRED,
  BOT_ACCOUNT_BANNED,
  BOT_IP_BANNED
}

struct JobTokenData {
  1: optional string infoJson,
  2: optional string ytdlpCommand,
  3: optional string socks,
  4: optional JobID jobId,
  5: optional string url,
  6: optional string cookiesBlob,
  7: optional string requestSummary,
  8: optional list<string> communicationLogPaths,
  9: optional string serverVersionInfo,
}

enum TokenUpdateMode {
  AUTOREFRESH_AND_REMAIN_ANONYMOUS,
  AUTOREFRESH_AND_ALLOW_AUTH,
  AUTOREFRESH_AND_ONLY_AUTH,
  CLEANUP_THEN_AUTOREFRESH_AND_ONLY_AUTH,
  CLEANUP_THEN_AUTOREFRESH_AND_REMAIN_ANONYMOUS,
  CLEANUP_THEN_AUTOREFRESH_AND_ALLOW_AUTH,
  AUTO, // AUTOREFRESH_AND_ONLY_AUTH,
}

struct AccountData {
  1: required string username,
  2: required string password,
  3: optional string countryCode
}

struct ProxyData {
  1: required string proxyUrl,
  2: optional string countryCode
}

enum AccountPairState {
  ACTIVE,
  PAUSED,
  REMOVED,
  IN_PROGRESS,
  ALL
}

struct AccountPairWithState {
  1: required string accountId,
  2: required string proxyId,
  3: optional AccountPairState accountPairState
  4: optional string machineId,
}

struct JobData {
  1: required string jobId,
  2: required string url,
  3: required string cookiesBlob,
  4: required string potoken,
  5: required string visitorId,
  6: required string ytdlpCommand,
  7: required string createdTime,
  8: required map<string,string> telemetry,
  9: required JobState state,
  10: optional string errorMessage,
  11: optional string socks5Id
}

struct RichCollectionPagination {
  1: required bool hasNext,
  2: required i32 totalCount,
  3: required i32 page,
  4: required i32 pageSize
}

struct RichCollectionJobData {
  1: required list<JobData> items,
  2: required RichCollectionPagination pagination
}

struct ProxyStatus {
  1: string proxyUrl,
  2: string status,
  3: i64 successCount,
  4: i64 failureCount,
  5: optional string lastFailureTimestamp,
  6: optional string lastSuccessTimestamp,
  7: optional string serverIdentity
}

struct AccountStatus {
  1: string accountId,
  2: string status,
  3: i64 successCount,
  4: i64 failureCount,
  5: optional string lastFailureTimestamp,
  6: optional string lastSuccessTimestamp,
  7: optional string lastUsedProxy,
  8: optional string lastUsedMachine
}

struct AirflowLogContext {
  1: optional string logS3Path,
  2: optional string dagId,
  3: optional string runId,
  4: optional string taskId,
  5: optional i32 tryNumber,
  6: optional string workerHostname,
  7: optional string queue
}
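On the Python side these structs are exposed through the generated `pangramia.yt.common.ttypes` module (regenerated later in this commit). A brief usage sketch; the module path follows standard Thrift code generation, and all field values are placeholders:

```python
from pangramia.yt.common.ttypes import AirflowLogContext, JobState, JobTokenData

log_ctx = AirflowLogContext(
    dagId="example_dag",                 # placeholder values, not real run metadata
    runId="manual__2024-01-01T00:00:00",
    taskId="fetch_info_json",
    tryNumber=1,
    workerHostname="worker-01",
    queue="default",
)

token = JobTokenData(
    url="https://www.youtube.com/watch?v=VIDEO_ID",
    requestSummary="tv_simply fetch",
    serverVersionInfo="token-op-service/dev",
)
print(JobState.SUCCESS, log_ctx, token.url)   # enum members mirror the .thrift definition
```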
thrift_model/data/exceptions.thrift (new file, +14)
@@ -0,0 +1,14 @@
namespace py pangramia.yt.exceptions
namespace java com.pangramia.yt.exceptions

exception PBServiceException {
  1: required string message,
  2: optional string errorCode,
  3: optional map<string, string> context
}

exception PBUserException {
  1: required string message,
  2: optional string errorCode,
  3: optional map<string, string> context
}
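Service methods raise these two exception types, as the generated client code later in this commit shows. A typical client-side pattern, sketched with placeholder logging:

```python
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException

def safe_call(client, url, clients="tv_simply"):
    try:
        return client.getInfoJsonDirect(url, clients)
    except PBUserException as exc:       # caller error: bad URL, bad client list, ...
        print(f"rejected ({exc.errorCode}): {exc.message}")
    except PBServiceException as exc:    # server-side failure: proxy, account, bot detection, ...
        print(f"service error ({exc.errorCode}): {exc.message}")
    return None
```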
@ -29,6 +29,7 @@ class ErrorCode(object):
|
|||||||
ACCOUNT_UNAVAILABLE = 5
|
ACCOUNT_UNAVAILABLE = 5
|
||||||
BOT_DETECTED = 6
|
BOT_DETECTED = 6
|
||||||
BOT_DETECTION_SIGN_IN_REQUIRED = 7
|
BOT_DETECTION_SIGN_IN_REQUIRED = 7
|
||||||
|
SABR_STREAMING_DETECTED = 8
|
||||||
|
|
||||||
_VALUES_TO_NAMES = {
|
_VALUES_TO_NAMES = {
|
||||||
0: "UNKNOWN",
|
0: "UNKNOWN",
|
||||||
@ -39,6 +40,7 @@ class ErrorCode(object):
|
|||||||
5: "ACCOUNT_UNAVAILABLE",
|
5: "ACCOUNT_UNAVAILABLE",
|
||||||
6: "BOT_DETECTED",
|
6: "BOT_DETECTED",
|
||||||
7: "BOT_DETECTION_SIGN_IN_REQUIRED",
|
7: "BOT_DETECTION_SIGN_IN_REQUIRED",
|
||||||
|
8: "SABR_STREAMING_DETECTED",
|
||||||
}
|
}
|
||||||
|
|
||||||
_NAMES_TO_VALUES = {
|
_NAMES_TO_VALUES = {
|
||||||
@ -50,6 +52,7 @@ class ErrorCode(object):
|
|||||||
"ACCOUNT_UNAVAILABLE": 5,
|
"ACCOUNT_UNAVAILABLE": 5,
|
||||||
"BOT_DETECTED": 6,
|
"BOT_DETECTED": 6,
|
||||||
"BOT_DETECTION_SIGN_IN_REQUIRED": 7,
|
"BOT_DETECTION_SIGN_IN_REQUIRED": 7,
|
||||||
|
"SABR_STREAMING_DETECTED": 8,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -155,17 +158,23 @@ class JobTokenData(object):
|
|||||||
- jobId
|
- jobId
|
||||||
- url
|
- url
|
||||||
- cookiesBlob
|
- cookiesBlob
|
||||||
|
- requestSummary
|
||||||
|
- communicationLogPaths
|
||||||
|
- serverVersionInfo
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, infoJson=None, ytdlpCommand=None, socks=None, jobId=None, url=None, cookiesBlob=None,):
|
def __init__(self, infoJson=None, ytdlpCommand=None, socks=None, jobId=None, url=None, cookiesBlob=None, requestSummary=None, communicationLogPaths=None, serverVersionInfo=None,):
|
||||||
self.infoJson = infoJson
|
self.infoJson = infoJson
|
||||||
self.ytdlpCommand = ytdlpCommand
|
self.ytdlpCommand = ytdlpCommand
|
||||||
self.socks = socks
|
self.socks = socks
|
||||||
self.jobId = jobId
|
self.jobId = jobId
|
||||||
self.url = url
|
self.url = url
|
||||||
self.cookiesBlob = cookiesBlob
|
self.cookiesBlob = cookiesBlob
|
||||||
|
self.requestSummary = requestSummary
|
||||||
|
self.communicationLogPaths = communicationLogPaths
|
||||||
|
self.serverVersionInfo = serverVersionInfo
|
||||||
|
|
||||||
def read(self, iprot):
|
def read(self, iprot):
|
||||||
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
||||||
@ -206,6 +215,26 @@ class JobTokenData(object):
|
|||||||
self.cookiesBlob = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
self.cookiesBlob = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
else:
|
else:
|
||||||
iprot.skip(ftype)
|
iprot.skip(ftype)
|
||||||
|
elif fid == 7:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.requestSummary = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 8:
|
||||||
|
if ftype == TType.LIST:
|
||||||
|
self.communicationLogPaths = []
|
||||||
|
(_etype3, _size0) = iprot.readListBegin()
|
||||||
|
for _i4 in range(_size0):
|
||||||
|
_elem5 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
self.communicationLogPaths.append(_elem5)
|
||||||
|
iprot.readListEnd()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 9:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.serverVersionInfo = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
else:
|
else:
|
||||||
iprot.skip(ftype)
|
iprot.skip(ftype)
|
||||||
iprot.readFieldEnd()
|
iprot.readFieldEnd()
|
||||||
@ -240,6 +269,21 @@ class JobTokenData(object):
|
|||||||
oprot.writeFieldBegin('cookiesBlob', TType.STRING, 6)
|
oprot.writeFieldBegin('cookiesBlob', TType.STRING, 6)
|
||||||
oprot.writeString(self.cookiesBlob.encode('utf-8') if sys.version_info[0] == 2 else self.cookiesBlob)
|
oprot.writeString(self.cookiesBlob.encode('utf-8') if sys.version_info[0] == 2 else self.cookiesBlob)
|
||||||
oprot.writeFieldEnd()
|
oprot.writeFieldEnd()
|
||||||
|
if self.requestSummary is not None:
|
||||||
|
oprot.writeFieldBegin('requestSummary', TType.STRING, 7)
|
||||||
|
oprot.writeString(self.requestSummary.encode('utf-8') if sys.version_info[0] == 2 else self.requestSummary)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.communicationLogPaths is not None:
|
||||||
|
oprot.writeFieldBegin('communicationLogPaths', TType.LIST, 8)
|
||||||
|
oprot.writeListBegin(TType.STRING, len(self.communicationLogPaths))
|
||||||
|
for iter6 in self.communicationLogPaths:
|
||||||
|
oprot.writeString(iter6.encode('utf-8') if sys.version_info[0] == 2 else iter6)
|
||||||
|
oprot.writeListEnd()
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.serverVersionInfo is not None:
|
||||||
|
oprot.writeFieldBegin('serverVersionInfo', TType.STRING, 9)
|
||||||
|
oprot.writeString(self.serverVersionInfo.encode('utf-8') if sys.version_info[0] == 2 else self.serverVersionInfo)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
oprot.writeFieldStop()
|
oprot.writeFieldStop()
|
||||||
oprot.writeStructEnd()
|
oprot.writeStructEnd()
|
||||||
|
|
||||||
@ -583,11 +627,11 @@ class JobData(object):
|
|||||||
elif fid == 8:
|
elif fid == 8:
|
||||||
if ftype == TType.MAP:
|
if ftype == TType.MAP:
|
||||||
self.telemetry = {}
|
self.telemetry = {}
|
||||||
(_ktype1, _vtype2, _size0) = iprot.readMapBegin()
|
(_ktype8, _vtype9, _size7) = iprot.readMapBegin()
|
||||||
for _i4 in range(_size0):
|
for _i11 in range(_size7):
|
||||||
_key5 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
_key12 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
_val6 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
_val13 = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
self.telemetry[_key5] = _val6
|
self.telemetry[_key12] = _val13
|
||||||
iprot.readMapEnd()
|
iprot.readMapEnd()
|
||||||
else:
|
else:
|
||||||
iprot.skip(ftype)
|
iprot.skip(ftype)
|
||||||
@ -647,9 +691,9 @@ class JobData(object):
|
|||||||
if self.telemetry is not None:
|
if self.telemetry is not None:
|
||||||
oprot.writeFieldBegin('telemetry', TType.MAP, 8)
|
oprot.writeFieldBegin('telemetry', TType.MAP, 8)
|
||||||
oprot.writeMapBegin(TType.STRING, TType.STRING, len(self.telemetry))
|
oprot.writeMapBegin(TType.STRING, TType.STRING, len(self.telemetry))
|
||||||
for kiter7, viter8 in self.telemetry.items():
|
for kiter14, viter15 in self.telemetry.items():
|
||||||
oprot.writeString(kiter7.encode('utf-8') if sys.version_info[0] == 2 else kiter7)
|
oprot.writeString(kiter14.encode('utf-8') if sys.version_info[0] == 2 else kiter14)
|
||||||
oprot.writeString(viter8.encode('utf-8') if sys.version_info[0] == 2 else viter8)
|
oprot.writeString(viter15.encode('utf-8') if sys.version_info[0] == 2 else viter15)
|
||||||
oprot.writeMapEnd()
|
oprot.writeMapEnd()
|
||||||
oprot.writeFieldEnd()
|
oprot.writeFieldEnd()
|
||||||
if self.state is not None:
|
if self.state is not None:
|
||||||
@ -823,11 +867,11 @@ class RichCollectionJobData(object):
|
|||||||
if fid == 1:
|
if fid == 1:
|
||||||
if ftype == TType.LIST:
|
if ftype == TType.LIST:
|
||||||
self.items = []
|
self.items = []
|
||||||
(_etype12, _size9) = iprot.readListBegin()
|
(_etype19, _size16) = iprot.readListBegin()
|
||||||
for _i13 in range(_size9):
|
for _i20 in range(_size16):
|
||||||
_elem14 = JobData()
|
_elem21 = JobData()
|
||||||
_elem14.read(iprot)
|
_elem21.read(iprot)
|
||||||
self.items.append(_elem14)
|
self.items.append(_elem21)
|
||||||
iprot.readListEnd()
|
iprot.readListEnd()
|
||||||
else:
|
else:
|
||||||
iprot.skip(ftype)
|
iprot.skip(ftype)
|
||||||
@ -850,8 +894,8 @@ class RichCollectionJobData(object):
|
|||||||
if self.items is not None:
|
if self.items is not None:
|
||||||
oprot.writeFieldBegin('items', TType.LIST, 1)
|
oprot.writeFieldBegin('items', TType.LIST, 1)
|
||||||
oprot.writeListBegin(TType.STRUCT, len(self.items))
|
oprot.writeListBegin(TType.STRUCT, len(self.items))
|
||||||
for iter15 in self.items:
|
for iter22 in self.items:
|
||||||
iter15.write(oprot)
|
iter22.write(oprot)
|
||||||
oprot.writeListEnd()
|
oprot.writeListEnd()
|
||||||
oprot.writeFieldEnd()
|
oprot.writeFieldEnd()
|
||||||
if self.pagination is not None:
|
if self.pagination is not None:
|
||||||
@ -1135,6 +1179,129 @@ class AccountStatus(object):
|
|||||||
|
|
||||||
def __ne__(self, other):
|
def __ne__(self, other):
|
||||||
return not (self == other)
|
return not (self == other)
|
||||||
|
|
||||||
|
|
||||||
|
class AirflowLogContext(object):
|
||||||
|
"""
|
||||||
|
Attributes:
|
||||||
|
- logS3Path
|
||||||
|
- dagId
|
||||||
|
- runId
|
||||||
|
- taskId
|
||||||
|
- tryNumber
|
||||||
|
- workerHostname
|
||||||
|
- queue
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self, logS3Path=None, dagId=None, runId=None, taskId=None, tryNumber=None, workerHostname=None, queue=None,):
|
||||||
|
self.logS3Path = logS3Path
|
||||||
|
self.dagId = dagId
|
||||||
|
self.runId = runId
|
||||||
|
self.taskId = taskId
|
||||||
|
self.tryNumber = tryNumber
|
||||||
|
self.workerHostname = workerHostname
|
||||||
|
self.queue = queue
|
||||||
|
|
||||||
|
def read(self, iprot):
|
||||||
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
||||||
|
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
|
||||||
|
return
|
||||||
|
iprot.readStructBegin()
|
||||||
|
while True:
|
||||||
|
(fname, ftype, fid) = iprot.readFieldBegin()
|
||||||
|
if ftype == TType.STOP:
|
||||||
|
break
|
||||||
|
if fid == 1:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.logS3Path = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 2:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.dagId = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 3:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.runId = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 4:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.taskId = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 5:
|
||||||
|
if ftype == TType.I32:
|
||||||
|
self.tryNumber = iprot.readI32()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 6:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.workerHostname = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 7:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.queue = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
iprot.readFieldEnd()
|
||||||
|
iprot.readStructEnd()
|
||||||
|
|
||||||
|
def write(self, oprot):
|
||||||
|
if oprot._fast_encode is not None and self.thrift_spec is not None:
|
||||||
|
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
|
||||||
|
return
|
||||||
|
oprot.writeStructBegin('AirflowLogContext')
|
||||||
|
if self.logS3Path is not None:
|
||||||
|
oprot.writeFieldBegin('logS3Path', TType.STRING, 1)
|
||||||
|
oprot.writeString(self.logS3Path.encode('utf-8') if sys.version_info[0] == 2 else self.logS3Path)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.dagId is not None:
|
||||||
|
oprot.writeFieldBegin('dagId', TType.STRING, 2)
|
||||||
|
oprot.writeString(self.dagId.encode('utf-8') if sys.version_info[0] == 2 else self.dagId)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.runId is not None:
|
||||||
|
oprot.writeFieldBegin('runId', TType.STRING, 3)
|
||||||
|
oprot.writeString(self.runId.encode('utf-8') if sys.version_info[0] == 2 else self.runId)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.taskId is not None:
|
||||||
|
oprot.writeFieldBegin('taskId', TType.STRING, 4)
|
||||||
|
oprot.writeString(self.taskId.encode('utf-8') if sys.version_info[0] == 2 else self.taskId)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.tryNumber is not None:
|
||||||
|
oprot.writeFieldBegin('tryNumber', TType.I32, 5)
|
||||||
|
oprot.writeI32(self.tryNumber)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.workerHostname is not None:
|
||||||
|
oprot.writeFieldBegin('workerHostname', TType.STRING, 6)
|
||||||
|
oprot.writeString(self.workerHostname.encode('utf-8') if sys.version_info[0] == 2 else self.workerHostname)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.queue is not None:
|
||||||
|
oprot.writeFieldBegin('queue', TType.STRING, 7)
|
||||||
|
oprot.writeString(self.queue.encode('utf-8') if sys.version_info[0] == 2 else self.queue)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
oprot.writeFieldStop()
|
||||||
|
oprot.writeStructEnd()
|
||||||
|
|
||||||
|
def validate(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
L = ['%s=%r' % (key, value)
|
||||||
|
for key, value in self.__dict__.items()]
|
||||||
|
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
|
||||||
|
|
||||||
|
def __ne__(self, other):
|
||||||
|
return not (self == other)
|
||||||
all_structs.append(JobTokenData)
|
all_structs.append(JobTokenData)
|
||||||
JobTokenData.thrift_spec = (
|
JobTokenData.thrift_spec = (
|
||||||
None, # 0
|
None, # 0
|
||||||
@ -1144,6 +1311,9 @@ JobTokenData.thrift_spec = (
|
|||||||
(4, TType.STRING, 'jobId', 'UTF8', None, ), # 4
|
(4, TType.STRING, 'jobId', 'UTF8', None, ), # 4
|
||||||
(5, TType.STRING, 'url', 'UTF8', None, ), # 5
|
(5, TType.STRING, 'url', 'UTF8', None, ), # 5
|
||||||
(6, TType.STRING, 'cookiesBlob', 'UTF8', None, ), # 6
|
(6, TType.STRING, 'cookiesBlob', 'UTF8', None, ), # 6
|
||||||
|
(7, TType.STRING, 'requestSummary', 'UTF8', None, ), # 7
|
||||||
|
(8, TType.LIST, 'communicationLogPaths', (TType.STRING, 'UTF8', False), None, ), # 8
|
||||||
|
(9, TType.STRING, 'serverVersionInfo', 'UTF8', None, ), # 9
|
||||||
)
|
)
|
||||||
all_structs.append(AccountData)
|
all_structs.append(AccountData)
|
||||||
AccountData.thrift_spec = (
|
AccountData.thrift_spec = (
|
||||||
@ -1218,5 +1388,16 @@ AccountStatus.thrift_spec = (
|
|||||||
(7, TType.STRING, 'lastUsedProxy', 'UTF8', None, ), # 7
|
(7, TType.STRING, 'lastUsedProxy', 'UTF8', None, ), # 7
|
||||||
(8, TType.STRING, 'lastUsedMachine', 'UTF8', None, ), # 8
|
(8, TType.STRING, 'lastUsedMachine', 'UTF8', None, ), # 8
|
||||||
)
|
)
|
||||||
|
all_structs.append(AirflowLogContext)
|
||||||
|
AirflowLogContext.thrift_spec = (
|
||||||
|
None, # 0
|
||||||
|
(1, TType.STRING, 'logS3Path', 'UTF8', None, ), # 1
|
||||||
|
(2, TType.STRING, 'dagId', 'UTF8', None, ), # 2
|
||||||
|
(3, TType.STRING, 'runId', 'UTF8', None, ), # 3
|
||||||
|
(4, TType.STRING, 'taskId', 'UTF8', None, ), # 4
|
||||||
|
(5, TType.I32, 'tryNumber', None, None, ), # 5
|
||||||
|
(6, TType.STRING, 'workerHostname', 'UTF8', None, ), # 6
|
||||||
|
(7, TType.STRING, 'queue', 'UTF8', None, ), # 7
|
||||||
|
)
|
||||||
fix_spec(all_structs)
|
fix_spec(all_structs)
|
||||||
del all_structs
|
del all_structs
|
||||||
|
|||||||
@@ -24,11 +24,12 @@ if len(sys.argv) <= 1 or sys.argv[1] == '--help':
     print('Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] [-s[sl]] [-novalidate] [-ca_certs certs] [-keyfile keyfile] [-certfile certfile] function [arg1 [arg2...]]')
     print('')
     print('Functions:')
-    print('  JobTokenData getOrRefreshTokenWithReport(string accountId, string oldUrl, JobState status, string details, string jobId, TokenUpdateMode updateType, string url, string clients)')
+    print('  JobTokenData getOrRefreshTokenWithReport(string accountId, string oldUrl, JobState status, string details, string jobId, TokenUpdateMode updateType, string url, string clients, AirflowLogContext airflowLogContext, string requestParamsJson)')
-    print('  JobTokenData getOrRefreshToken(string accountId, TokenUpdateMode updateType, string url, string clients, string machineId)')
+    print('  JobTokenData getOrRefreshToken(string accountId, TokenUpdateMode updateType, string url, string clients, string machineId, AirflowLogContext airflowLogContext, string requestParamsJson, string assignedProxyUrl)')
     print('  JobTokenData getLatestToken(string accountId)')
     print('  JobTokenData refreshToken(string accountId, TokenUpdateMode updateType, string url)')
     print('  bool reportState(string url, JobState status, string details, string jobId)')
+    print('  JobTokenData getInfoJsonDirect(string url, string clients)')
     print('  getProxyStatus(string serverIdentity)')
     print('  bool banProxy(string proxyUrl, string serverIdentity)')
     print('  bool unbanProxy(string proxyUrl, string serverIdentity)')
@@ -124,16 +125,16 @@ client = YTTokenOpService.Client(protocol)
 transport.open()

 if cmd == 'getOrRefreshTokenWithReport':
-    if len(args) != 8:
-        print('getOrRefreshTokenWithReport requires 8 args')
+    if len(args) != 10:
+        print('getOrRefreshTokenWithReport requires 10 args')
         sys.exit(1)
-    pp.pprint(client.getOrRefreshTokenWithReport(args[0], args[1], eval(args[2]), args[3], args[4], eval(args[5]), args[6], args[7],))
+    pp.pprint(client.getOrRefreshTokenWithReport(args[0], args[1], eval(args[2]), args[3], args[4], eval(args[5]), args[6], args[7], eval(args[8]), args[9],))

 elif cmd == 'getOrRefreshToken':
-    if len(args) != 5:
-        print('getOrRefreshToken requires 5 args')
+    if len(args) != 8:
+        print('getOrRefreshToken requires 8 args')
         sys.exit(1)
-    pp.pprint(client.getOrRefreshToken(args[0], eval(args[1]), args[2], args[3], args[4],))
+    pp.pprint(client.getOrRefreshToken(args[0], eval(args[1]), args[2], args[3], args[4], eval(args[5]), args[6], args[7],))

 elif cmd == 'getLatestToken':
     if len(args) != 1:
@@ -153,6 +154,12 @@ elif cmd == 'reportState':
         sys.exit(1)
     pp.pprint(client.reportState(args[0], eval(args[1]), args[2], args[3],))

+elif cmd == 'getInfoJsonDirect':
+    if len(args) != 2:
+        print('getInfoJsonDirect requires 2 args')
+        sys.exit(1)
+    pp.pprint(client.getInfoJsonDirect(args[0], args[1],))
+
 elif cmd == 'getProxyStatus':
     if len(args) != 1:
         print('getProxyStatus requires 1 args')
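The new `getInfoJsonDirect` call can also be made programmatically with the standard Thrift Python client classes; a minimal sketch in which the host, port, and the generated service's module path are assumptions, not values taken from this repository:

```python
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.yt.tokens_ops import YTTokenOpService   # assumed generated package path

transport = TTransport.TBufferedTransport(TSocket.TSocket("localhost", 9090))
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)

transport.open()
try:
    token = client.getInfoJsonDirect("https://www.youtube.com/watch?v=VIDEO_ID", "tv_simply")
    print(token.requestSummary, token.serverVersionInfo)
finally:
    transport.close()
```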
@ -20,7 +20,7 @@ all_structs = []
|
|||||||
|
|
||||||
|
|
||||||
class Iface(pangramia.yt.management.YTManagementService.Iface):
|
class Iface(pangramia.yt.management.YTManagementService.Iface):
|
||||||
def getOrRefreshTokenWithReport(self, accountId, oldUrl, status, details, jobId, updateType, url, clients):
|
def getOrRefreshTokenWithReport(self, accountId, oldUrl, status, details, jobId, updateType, url, clients, airflowLogContext, requestParamsJson):
|
||||||
"""
|
"""
|
||||||
Parameters:
|
Parameters:
|
||||||
- accountId
|
- accountId
|
||||||
@ -31,11 +31,13 @@ class Iface(pangramia.yt.management.YTManagementService.Iface):
|
|||||||
- updateType
|
- updateType
|
||||||
- url
|
- url
|
||||||
- clients
|
- clients
|
||||||
|
- airflowLogContext
|
||||||
|
- requestParamsJson
|
||||||
|
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def getOrRefreshToken(self, accountId, updateType, url, clients, machineId):
|
def getOrRefreshToken(self, accountId, updateType, url, clients, machineId, airflowLogContext, requestParamsJson, assignedProxyUrl):
|
||||||
"""
|
"""
|
||||||
Parameters:
|
Parameters:
|
||||||
- accountId
|
- accountId
|
||||||
@ -43,6 +45,9 @@ class Iface(pangramia.yt.management.YTManagementService.Iface):
|
|||||||
- url
|
- url
|
||||||
- clients
|
- clients
|
||||||
- machineId
|
- machineId
|
||||||
|
- airflowLogContext
|
||||||
|
- requestParamsJson
|
||||||
|
- assignedProxyUrl
|
||||||
|
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
@ -76,12 +81,21 @@ class Iface(pangramia.yt.management.YTManagementService.Iface):
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def getInfoJsonDirect(self, url, clients):
|
||||||
|
"""
|
||||||
|
Parameters:
|
||||||
|
- url
|
||||||
|
- clients
|
||||||
|
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Client(pangramia.yt.management.YTManagementService.Client, Iface):
|
class Client(pangramia.yt.management.YTManagementService.Client, Iface):
|
||||||
def __init__(self, iprot, oprot=None):
|
def __init__(self, iprot, oprot=None):
|
||||||
pangramia.yt.management.YTManagementService.Client.__init__(self, iprot, oprot)
|
pangramia.yt.management.YTManagementService.Client.__init__(self, iprot, oprot)
|
||||||
|
|
||||||
def getOrRefreshTokenWithReport(self, accountId, oldUrl, status, details, jobId, updateType, url, clients):
|
def getOrRefreshTokenWithReport(self, accountId, oldUrl, status, details, jobId, updateType, url, clients, airflowLogContext, requestParamsJson):
|
||||||
"""
|
"""
|
||||||
Parameters:
|
Parameters:
|
||||||
- accountId
|
- accountId
|
||||||
@ -92,12 +106,14 @@ class Client(pangramia.yt.management.YTManagementService.Client, Iface):
|
|||||||
- updateType
|
- updateType
|
||||||
- url
|
- url
|
||||||
- clients
|
- clients
|
||||||
|
- airflowLogContext
|
||||||
|
- requestParamsJson
|
||||||
|
|
||||||
"""
|
"""
|
||||||
self.send_getOrRefreshTokenWithReport(accountId, oldUrl, status, details, jobId, updateType, url, clients)
|
self.send_getOrRefreshTokenWithReport(accountId, oldUrl, status, details, jobId, updateType, url, clients, airflowLogContext, requestParamsJson)
|
||||||
return self.recv_getOrRefreshTokenWithReport()
|
return self.recv_getOrRefreshTokenWithReport()
|
||||||
|
|
||||||
def send_getOrRefreshTokenWithReport(self, accountId, oldUrl, status, details, jobId, updateType, url, clients):
|
def send_getOrRefreshTokenWithReport(self, accountId, oldUrl, status, details, jobId, updateType, url, clients, airflowLogContext, requestParamsJson):
|
||||||
self._oprot.writeMessageBegin('getOrRefreshTokenWithReport', TMessageType.CALL, self._seqid)
|
self._oprot.writeMessageBegin('getOrRefreshTokenWithReport', TMessageType.CALL, self._seqid)
|
||||||
args = getOrRefreshTokenWithReport_args()
|
args = getOrRefreshTokenWithReport_args()
|
||||||
args.accountId = accountId
|
args.accountId = accountId
|
||||||
@ -108,6 +124,8 @@ class Client(pangramia.yt.management.YTManagementService.Client, Iface):
|
|||||||
args.updateType = updateType
|
args.updateType = updateType
|
||||||
args.url = url
|
args.url = url
|
||||||
args.clients = clients
|
args.clients = clients
|
||||||
|
args.airflowLogContext = airflowLogContext
|
||||||
|
args.requestParamsJson = requestParamsJson
|
||||||
args.write(self._oprot)
|
args.write(self._oprot)
|
||||||
self._oprot.writeMessageEnd()
|
self._oprot.writeMessageEnd()
|
||||||
self._oprot.trans.flush()
|
self._oprot.trans.flush()
|
||||||
@ -131,7 +149,7 @@ class Client(pangramia.yt.management.YTManagementService.Client, Iface):
|
|||||||
raise result.userExp
|
raise result.userExp
|
||||||
raise TApplicationException(TApplicationException.MISSING_RESULT, "getOrRefreshTokenWithReport failed: unknown result")
|
raise TApplicationException(TApplicationException.MISSING_RESULT, "getOrRefreshTokenWithReport failed: unknown result")
|
||||||
|
|
||||||
def getOrRefreshToken(self, accountId, updateType, url, clients, machineId):
|
def getOrRefreshToken(self, accountId, updateType, url, clients, machineId, airflowLogContext, requestParamsJson, assignedProxyUrl):
|
||||||
"""
|
"""
|
||||||
Parameters:
|
Parameters:
|
||||||
- accountId
|
- accountId
|
||||||
@ -139,12 +157,15 @@ class Client(pangramia.yt.management.YTManagementService.Client, Iface):
|
|||||||
- url
|
- url
|
||||||
- clients
|
- clients
|
||||||
- machineId
|
- machineId
|
||||||
|
- airflowLogContext
|
||||||
|
- requestParamsJson
|
||||||
|
- assignedProxyUrl
|
||||||
|
|
||||||
"""
|
"""
|
||||||
self.send_getOrRefreshToken(accountId, updateType, url, clients, machineId)
|
self.send_getOrRefreshToken(accountId, updateType, url, clients, machineId, airflowLogContext, requestParamsJson, assignedProxyUrl)
|
||||||
return self.recv_getOrRefreshToken()
|
return self.recv_getOrRefreshToken()
|
||||||
|
|
||||||
def send_getOrRefreshToken(self, accountId, updateType, url, clients, machineId):
|
def send_getOrRefreshToken(self, accountId, updateType, url, clients, machineId, airflowLogContext, requestParamsJson, assignedProxyUrl):
|
||||||
self._oprot.writeMessageBegin('getOrRefreshToken', TMessageType.CALL, self._seqid)
|
self._oprot.writeMessageBegin('getOrRefreshToken', TMessageType.CALL, self._seqid)
|
||||||
args = getOrRefreshToken_args()
|
args = getOrRefreshToken_args()
|
||||||
args.accountId = accountId
|
args.accountId = accountId
|
||||||
@ -152,6 +173,9 @@ class Client(pangramia.yt.management.YTManagementService.Client, Iface):
|
|||||||
args.url = url
|
args.url = url
|
||||||
args.clients = clients
|
args.clients = clients
|
||||||
args.machineId = machineId
|
args.machineId = machineId
|
||||||
|
args.airflowLogContext = airflowLogContext
|
||||||
|
args.requestParamsJson = requestParamsJson
|
||||||
|
args.assignedProxyUrl = assignedProxyUrl
|
||||||
args.write(self._oprot)
|
args.write(self._oprot)
|
||||||
self._oprot.writeMessageEnd()
|
self._oprot.writeMessageEnd()
|
||||||
self._oprot.trans.flush()
|
self._oprot.trans.flush()
|
||||||
@ -293,6 +317,44 @@ class Client(pangramia.yt.management.YTManagementService.Client, Iface):
|
|||||||
raise result.userExp
|
raise result.userExp
|
||||||
raise TApplicationException(TApplicationException.MISSING_RESULT, "reportState failed: unknown result")
|
raise TApplicationException(TApplicationException.MISSING_RESULT, "reportState failed: unknown result")
|
||||||
|
|
||||||
|
def getInfoJsonDirect(self, url, clients):
|
||||||
|
"""
|
||||||
|
Parameters:
|
||||||
|
- url
|
||||||
|
- clients
|
||||||
|
|
||||||
|
"""
|
||||||
|
self.send_getInfoJsonDirect(url, clients)
|
||||||
|
return self.recv_getInfoJsonDirect()
|
||||||
|
|
||||||
|
def send_getInfoJsonDirect(self, url, clients):
|
||||||
|
self._oprot.writeMessageBegin('getInfoJsonDirect', TMessageType.CALL, self._seqid)
|
||||||
|
args = getInfoJsonDirect_args()
|
||||||
|
args.url = url
|
||||||
|
args.clients = clients
|
||||||
|
args.write(self._oprot)
|
||||||
|
self._oprot.writeMessageEnd()
|
||||||
|
self._oprot.trans.flush()
|
||||||
|
|
||||||
|
def recv_getInfoJsonDirect(self):
|
||||||
|
iprot = self._iprot
|
||||||
|
(fname, mtype, rseqid) = iprot.readMessageBegin()
|
||||||
|
if mtype == TMessageType.EXCEPTION:
|
||||||
|
x = TApplicationException()
|
||||||
|
x.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
raise x
|
||||||
|
result = getInfoJsonDirect_result()
|
||||||
|
result.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
if result.success is not None:
|
||||||
|
return result.success
|
||||||
|
if result.serviceExp is not None:
|
||||||
|
raise result.serviceExp
|
||||||
|
if result.userExp is not None:
|
||||||
|
raise result.userExp
|
||||||
|
raise TApplicationException(TApplicationException.MISSING_RESULT, "getInfoJsonDirect failed: unknown result")
|
||||||
|
|
||||||
|
|
||||||
class Processor(pangramia.yt.management.YTManagementService.Processor, Iface, TProcessor):
|
class Processor(pangramia.yt.management.YTManagementService.Processor, Iface, TProcessor):
|
||||||
def __init__(self, handler):
|
def __init__(self, handler):
|
||||||
@ -302,6 +364,7 @@ class Processor(pangramia.yt.management.YTManagementService.Processor, Iface, TP
|
|||||||
self._processMap["getLatestToken"] = Processor.process_getLatestToken
|
self._processMap["getLatestToken"] = Processor.process_getLatestToken
|
||||||
self._processMap["refreshToken"] = Processor.process_refreshToken
|
self._processMap["refreshToken"] = Processor.process_refreshToken
|
||||||
self._processMap["reportState"] = Processor.process_reportState
|
self._processMap["reportState"] = Processor.process_reportState
|
||||||
|
self._processMap["getInfoJsonDirect"] = Processor.process_getInfoJsonDirect
|
||||||
self._on_message_begin = None
|
self._on_message_begin = None
|
||||||
|
|
||||||
def on_message_begin(self, func):
|
def on_message_begin(self, func):
|
||||||
@ -330,7 +393,7 @@ class Processor(pangramia.yt.management.YTManagementService.Processor, Iface, TP
|
|||||||
iprot.readMessageEnd()
|
iprot.readMessageEnd()
|
||||||
result = getOrRefreshTokenWithReport_result()
|
result = getOrRefreshTokenWithReport_result()
|
||||||
try:
|
try:
|
||||||
result.success = self._handler.getOrRefreshTokenWithReport(args.accountId, args.oldUrl, args.status, args.details, args.jobId, args.updateType, args.url, args.clients)
|
result.success = self._handler.getOrRefreshTokenWithReport(args.accountId, args.oldUrl, args.status, args.details, args.jobId, args.updateType, args.url, args.clients, args.airflowLogContext, args.requestParamsJson)
|
||||||
msg_type = TMessageType.REPLY
|
msg_type = TMessageType.REPLY
|
||||||
except TTransport.TTransportException:
|
except TTransport.TTransportException:
|
||||||
raise
|
raise
|
||||||
@ -359,7 +422,7 @@ class Processor(pangramia.yt.management.YTManagementService.Processor, Iface, TP
|
|||||||
iprot.readMessageEnd()
|
iprot.readMessageEnd()
|
||||||
result = getOrRefreshToken_result()
|
result = getOrRefreshToken_result()
|
||||||
try:
|
try:
|
||||||
result.success = self._handler.getOrRefreshToken(args.accountId, args.updateType, args.url, args.clients, args.machineId)
|
result.success = self._handler.getOrRefreshToken(args.accountId, args.updateType, args.url, args.clients, args.machineId, args.airflowLogContext, args.requestParamsJson, args.assignedProxyUrl)
|
||||||
msg_type = TMessageType.REPLY
|
msg_type = TMessageType.REPLY
|
||||||
except TTransport.TTransportException:
|
except TTransport.TTransportException:
|
||||||
raise
|
raise
|
||||||
@ -469,6 +532,35 @@ class Processor(pangramia.yt.management.YTManagementService.Processor, Iface, TP
|
|||||||
oprot.writeMessageEnd()
|
oprot.writeMessageEnd()
|
||||||
oprot.trans.flush()
|
oprot.trans.flush()
|
||||||
|
|
||||||
|
def process_getInfoJsonDirect(self, seqid, iprot, oprot):
|
||||||
|
args = getInfoJsonDirect_args()
|
||||||
|
args.read(iprot)
|
||||||
|
iprot.readMessageEnd()
|
||||||
|
result = getInfoJsonDirect_result()
|
||||||
|
try:
|
||||||
|
result.success = self._handler.getInfoJsonDirect(args.url, args.clients)
|
||||||
|
msg_type = TMessageType.REPLY
|
||||||
|
except TTransport.TTransportException:
|
||||||
|
raise
|
||||||
|
except pangramia.yt.exceptions.ttypes.PBServiceException as serviceExp:
|
||||||
|
msg_type = TMessageType.REPLY
|
||||||
|
result.serviceExp = serviceExp
|
||||||
|
except pangramia.yt.exceptions.ttypes.PBUserException as userExp:
|
||||||
|
msg_type = TMessageType.REPLY
|
||||||
|
result.userExp = userExp
|
||||||
|
except TApplicationException as ex:
|
||||||
|
logging.exception('TApplication exception in handler')
|
||||||
|
msg_type = TMessageType.EXCEPTION
|
||||||
|
result = ex
|
||||||
|
except Exception:
|
||||||
|
logging.exception('Unexpected exception in handler')
|
||||||
|
msg_type = TMessageType.EXCEPTION
|
||||||
|
result = TApplicationException(TApplicationException.INTERNAL_ERROR, 'Internal error')
|
||||||
|
oprot.writeMessageBegin("getInfoJsonDirect", msg_type, seqid)
|
||||||
|
result.write(oprot)
|
||||||
|
oprot.writeMessageEnd()
|
||||||
|
oprot.trans.flush()
|
||||||
|
|
||||||
# HELPER FUNCTIONS AND STRUCTURES
|
# HELPER FUNCTIONS AND STRUCTURES
|
||||||
|
|
||||||
|
|
||||||
@ -483,11 +575,13 @@ class getOrRefreshTokenWithReport_args(object):
|
|||||||
- updateType
|
- updateType
|
||||||
- url
|
- url
|
||||||
- clients
|
- clients
|
||||||
|
- airflowLogContext
|
||||||
|
- requestParamsJson
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, accountId=None, oldUrl=None, status=None, details=None, jobId=None, updateType= 6, url=None, clients=None,):
|
def __init__(self, accountId=None, oldUrl=None, status=None, details=None, jobId=None, updateType= 6, url=None, clients=None, airflowLogContext=None, requestParamsJson=None,):
|
||||||
self.accountId = accountId
|
self.accountId = accountId
|
||||||
self.oldUrl = oldUrl
|
self.oldUrl = oldUrl
|
||||||
self.status = status
|
self.status = status
|
||||||
@ -496,6 +590,8 @@ class getOrRefreshTokenWithReport_args(object):
|
|||||||
self.updateType = updateType
|
self.updateType = updateType
|
||||||
self.url = url
|
self.url = url
|
||||||
self.clients = clients
|
self.clients = clients
|
||||||
|
self.airflowLogContext = airflowLogContext
|
||||||
|
self.requestParamsJson = requestParamsJson
|
||||||
|
|
||||||
def read(self, iprot):
|
def read(self, iprot):
|
||||||
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
||||||
@ -546,6 +642,17 @@ class getOrRefreshTokenWithReport_args(object):
|
|||||||
self.clients = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
self.clients = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
else:
|
else:
|
||||||
iprot.skip(ftype)
|
iprot.skip(ftype)
|
||||||
|
elif fid == 9:
|
||||||
|
if ftype == TType.STRUCT:
|
||||||
|
self.airflowLogContext = pangramia.yt.common.ttypes.AirflowLogContext()
|
||||||
|
self.airflowLogContext.read(iprot)
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
|
elif fid == 10:
|
||||||
|
if ftype == TType.STRING:
|
||||||
|
self.requestParamsJson = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
|
||||||
|
else:
|
||||||
|
iprot.skip(ftype)
|
||||||
else:
|
else:
|
||||||
iprot.skip(ftype)
|
iprot.skip(ftype)
|
||||||
iprot.readFieldEnd()
|
iprot.readFieldEnd()
|
||||||
@ -588,6 +695,14 @@ class getOrRefreshTokenWithReport_args(object):
|
|||||||
oprot.writeFieldBegin('clients', TType.STRING, 8)
|
oprot.writeFieldBegin('clients', TType.STRING, 8)
|
||||||
oprot.writeString(self.clients.encode('utf-8') if sys.version_info[0] == 2 else self.clients)
|
oprot.writeString(self.clients.encode('utf-8') if sys.version_info[0] == 2 else self.clients)
|
||||||
oprot.writeFieldEnd()
|
oprot.writeFieldEnd()
|
||||||
|
if self.airflowLogContext is not None:
|
||||||
|
oprot.writeFieldBegin('airflowLogContext', TType.STRUCT, 9)
|
||||||
|
self.airflowLogContext.write(oprot)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
|
if self.requestParamsJson is not None:
|
||||||
|
oprot.writeFieldBegin('requestParamsJson', TType.STRING, 10)
|
||||||
|
oprot.writeString(self.requestParamsJson.encode('utf-8') if sys.version_info[0] == 2 else self.requestParamsJson)
|
||||||
|
oprot.writeFieldEnd()
|
||||||
oprot.writeFieldStop()
|
oprot.writeFieldStop()
|
||||||
oprot.writeStructEnd()
|
oprot.writeStructEnd()
|
||||||
|
|
||||||
@ -615,6 +730,8 @@ getOrRefreshTokenWithReport_args.thrift_spec = (
|
|||||||
(6, TType.I32, 'updateType', None, 6, ), # 6
|
(6, TType.I32, 'updateType', None, 6, ), # 6
|
||||||
(7, TType.STRING, 'url', 'UTF8', None, ), # 7
|
(7, TType.STRING, 'url', 'UTF8', None, ), # 7
|
||||||
(8, TType.STRING, 'clients', 'UTF8', None, ), # 8
|
(8, TType.STRING, 'clients', 'UTF8', None, ), # 8
|
||||||
|
(9, TType.STRUCT, 'airflowLogContext', [pangramia.yt.common.ttypes.AirflowLogContext, None], None, ), # 9
|
||||||
|
(10, TType.STRING, 'requestParamsJson', 'UTF8', None, ), # 10
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -712,16 +829,22 @@ class getOrRefreshToken_args(object):
|
|||||||
- url
|
- url
|
||||||
- clients
|
- clients
|
||||||
- machineId
|
- machineId
|
||||||
|
- airflowLogContext
|
||||||
|
- requestParamsJson
|
||||||
|
- assignedProxyUrl
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, accountId=None, updateType= 6, url=None, clients=None, machineId=None,):
|
def __init__(self, accountId=None, updateType= 6, url=None, clients=None, machineId=None, airflowLogContext=None, requestParamsJson=None, assignedProxyUrl=None,):
|
||||||
self.accountId = accountId
|
self.accountId = accountId
|
||||||
self.updateType = updateType
|
self.updateType = updateType
|
||||||
self.url = url
|
self.url = url
|
||||||
self.clients = clients
|
self.clients = clients
|
||||||
self.machineId = machineId
|
self.machineId = machineId
|
||||||
|
self.airflowLogContext = airflowLogContext
|
||||||
|
self.requestParamsJson = requestParamsJson
|
||||||
|
self.assignedProxyUrl = assignedProxyUrl
|
||||||
|
|
||||||
def read(self, iprot):
|
def read(self, iprot):
|
||||||
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
|
||||||
@ -757,6 +880,22 @@ class getOrRefreshToken_args(object):
self.machineId = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
else:
iprot.skip(ftype)
elif fid == 6:
if ftype == TType.STRUCT:
self.airflowLogContext = pangramia.yt.common.ttypes.AirflowLogContext()
self.airflowLogContext.read(iprot)
else:
iprot.skip(ftype)
elif fid == 7:
if ftype == TType.STRING:
self.requestParamsJson = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
else:
iprot.skip(ftype)
elif fid == 8:
if ftype == TType.STRING:
self.assignedProxyUrl = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
else:
iprot.skip(ftype)
else:
iprot.skip(ftype)
iprot.readFieldEnd()
@ -787,6 +926,18 @@ class getOrRefreshToken_args(object):
oprot.writeFieldBegin('machineId', TType.STRING, 5)
oprot.writeString(self.machineId.encode('utf-8') if sys.version_info[0] == 2 else self.machineId)
oprot.writeFieldEnd()
if self.airflowLogContext is not None:
oprot.writeFieldBegin('airflowLogContext', TType.STRUCT, 6)
self.airflowLogContext.write(oprot)
oprot.writeFieldEnd()
if self.requestParamsJson is not None:
oprot.writeFieldBegin('requestParamsJson', TType.STRING, 7)
oprot.writeString(self.requestParamsJson.encode('utf-8') if sys.version_info[0] == 2 else self.requestParamsJson)
oprot.writeFieldEnd()
if self.assignedProxyUrl is not None:
oprot.writeFieldBegin('assignedProxyUrl', TType.STRING, 8)
oprot.writeString(self.assignedProxyUrl.encode('utf-8') if sys.version_info[0] == 2 else self.assignedProxyUrl)
oprot.writeFieldEnd()
oprot.writeFieldStop()
oprot.writeStructEnd()

@ -811,6 +962,9 @@ getOrRefreshToken_args.thrift_spec = (
(3, TType.STRING, 'url', 'UTF8', None, ), # 3
(4, TType.STRING, 'clients', 'UTF8', None, ), # 4
(5, TType.STRING, 'machineId', 'UTF8', None, ), # 5
(6, TType.STRUCT, 'airflowLogContext', [pangramia.yt.common.ttypes.AirflowLogContext, None], None, ), # 6
(7, TType.STRING, 'requestParamsJson', 'UTF8', None, ), # 7
(8, TType.STRING, 'assignedProxyUrl', 'UTF8', None, ), # 8
)

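For reference, a minimal client-side sketch of how the three new optional arguments could be passed once this generated module is importable. The package path (pangramia.yt.tokens_ops.YTTokenOpService), the endpoint, and all values are assumptions for illustration, not part of this diff.

# Hypothetical usage sketch (not part of this commit); module paths and endpoint are assumed.
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.yt.tokens_ops import YTTokenOpService          # assumed generated package
from pangramia.yt.common.ttypes import AirflowLogContext, TokenUpdateMode

transport = TTransport.TBufferedTransport(TSocket.TSocket('localhost', 9090))  # assumed endpoint
client = YTTokenOpService.Client(TBinaryProtocol.TBinaryProtocol(transport))
transport.open()
token = client.getOrRefreshToken(
    accountId='acct-001',                        # illustrative values
    updateType=TokenUpdateMode.AUTO,
    url='https://www.youtube.com/watch?v=example',
    clients='web',
    machineId='worker-01',
    airflowLogContext=AirflowLogContext(),       # new optional field 6 (struct fields omitted here)
    requestParamsJson='{"source": "dag_v2"}',    # new optional field 7
    assignedProxyUrl='socks5://127.0.0.1:1080',  # new optional field 8
)
transport.close()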
@ -1401,5 +1555,165 @@ reportState_result.thrift_spec = (
(1, TType.STRUCT, 'serviceExp', [pangramia.yt.exceptions.ttypes.PBServiceException, None], None, ), # 1
(2, TType.STRUCT, 'userExp', [pangramia.yt.exceptions.ttypes.PBUserException, None], None, ), # 2
)


class getInfoJsonDirect_args(object):
"""
Attributes:
- url
- clients

"""


def __init__(self, url=None, clients=None,):
self.url = url
self.clients = clients

def read(self, iprot):
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
return
iprot.readStructBegin()
while True:
(fname, ftype, fid) = iprot.readFieldBegin()
if ftype == TType.STOP:
break
if fid == 1:
if ftype == TType.STRING:
self.url = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
else:
iprot.skip(ftype)
elif fid == 2:
if ftype == TType.STRING:
self.clients = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString()
else:
iprot.skip(ftype)
else:
iprot.skip(ftype)
iprot.readFieldEnd()
iprot.readStructEnd()

def write(self, oprot):
if oprot._fast_encode is not None and self.thrift_spec is not None:
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
return
oprot.writeStructBegin('getInfoJsonDirect_args')
if self.url is not None:
oprot.writeFieldBegin('url', TType.STRING, 1)
oprot.writeString(self.url.encode('utf-8') if sys.version_info[0] == 2 else self.url)
oprot.writeFieldEnd()
if self.clients is not None:
oprot.writeFieldBegin('clients', TType.STRING, 2)
oprot.writeString(self.clients.encode('utf-8') if sys.version_info[0] == 2 else self.clients)
oprot.writeFieldEnd()
oprot.writeFieldStop()
oprot.writeStructEnd()

def validate(self):
return

def __repr__(self):
L = ['%s=%r' % (key, value)
for key, value in self.__dict__.items()]
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))

def __eq__(self, other):
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__

def __ne__(self, other):
return not (self == other)
all_structs.append(getInfoJsonDirect_args)
getInfoJsonDirect_args.thrift_spec = (
None, # 0
(1, TType.STRING, 'url', 'UTF8', None, ), # 1
(2, TType.STRING, 'clients', 'UTF8', None, ), # 2
)


class getInfoJsonDirect_result(object):
"""
Attributes:
- success
- serviceExp
- userExp

"""


def __init__(self, success=None, serviceExp=None, userExp=None,):
self.success = success
self.serviceExp = serviceExp
self.userExp = userExp

def read(self, iprot):
if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None:
iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec])
return
iprot.readStructBegin()
while True:
(fname, ftype, fid) = iprot.readFieldBegin()
if ftype == TType.STOP:
break
if fid == 0:
if ftype == TType.STRUCT:
self.success = pangramia.yt.common.ttypes.JobTokenData()
self.success.read(iprot)
else:
iprot.skip(ftype)
elif fid == 1:
if ftype == TType.STRUCT:
self.serviceExp = pangramia.yt.exceptions.ttypes.PBServiceException.read(iprot)
else:
iprot.skip(ftype)
elif fid == 2:
if ftype == TType.STRUCT:
self.userExp = pangramia.yt.exceptions.ttypes.PBUserException.read(iprot)
else:
iprot.skip(ftype)
else:
iprot.skip(ftype)
iprot.readFieldEnd()
iprot.readStructEnd()

def write(self, oprot):
if oprot._fast_encode is not None and self.thrift_spec is not None:
oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
return
oprot.writeStructBegin('getInfoJsonDirect_result')
if self.success is not None:
oprot.writeFieldBegin('success', TType.STRUCT, 0)
self.success.write(oprot)
oprot.writeFieldEnd()
if self.serviceExp is not None:
oprot.writeFieldBegin('serviceExp', TType.STRUCT, 1)
self.serviceExp.write(oprot)
oprot.writeFieldEnd()
if self.userExp is not None:
oprot.writeFieldBegin('userExp', TType.STRUCT, 2)
self.userExp.write(oprot)
oprot.writeFieldEnd()
oprot.writeFieldStop()
oprot.writeStructEnd()

def validate(self):
return

def __repr__(self):
L = ['%s=%r' % (key, value)
for key, value in self.__dict__.items()]
return '%s(%s)' % (self.__class__.__name__, ', '.join(L))

def __eq__(self, other):
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__

def __ne__(self, other):
return not (self == other)
all_structs.append(getInfoJsonDirect_result)
getInfoJsonDirect_result.thrift_spec = (
(0, TType.STRUCT, 'success', [pangramia.yt.common.ttypes.JobTokenData, None], None, ), # 0
(1, TType.STRUCT, 'serviceExp', [pangramia.yt.exceptions.ttypes.PBServiceException, None], None, ), # 1
(2, TType.STRUCT, 'userExp', [pangramia.yt.exceptions.ttypes.PBUserException, None], None, ), # 2
)
fix_spec(all_structs)
del all_structs
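As a quick sanity check of the generated read/write pair above, a round trip through an in-memory binary protocol can be sketched as follows. The import path of the generated module is an assumption based on the IDL namespaces; only the class itself comes from this diff.

# Hedged sketch: serialize and deserialize getInfoJsonDirect_args in memory.
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.yt.tokens_ops.YTTokenOpService import getInfoJsonDirect_args  # assumed path

buf = TTransport.TMemoryBuffer()
args = getInfoJsonDirect_args(url='https://www.youtube.com/watch?v=example', clients='web,android')
args.write(TBinaryProtocol.TBinaryProtocol(buf))

decoded = getInfoJsonDirect_args()
decoded.read(TBinaryProtocol.TBinaryProtocol(TTransport.TMemoryBuffer(buf.getvalue())))
assert decoded == args  # __eq__ compares __dict__, as defined above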
@ -7,7 +7,7 @@
<groupId>com.pangramia.yt</groupId>
<artifactId>thrift-services</artifactId>
<!-- This version is for the Thrift API definition. The installable package will use this version. -->
<version>3.6.0-SNAPSHOT</version>
<version>5.5.0-SNAPSHOT</version>

<properties>
<thrift.version>0.16.0</thrift.version>
19
thrift_model/services/base_service.thrift
Normal file
@ -0,0 +1,19 @@
namespace py pangramia.base_service
namespace java com.pangramia.base_service

include "../data/common.thrift"
include "../data/exceptions.thrift"

service BaseService {
// Common health check method
bool ping() throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

// Common error reporting
bool reportError(1: string message,
2: map<string, string> details) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp)

// Add this to fix AsyncProcessor issues
oneway void shutdown()
}
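A minimal server-side sketch of a handler for this new BaseService, assuming the IDL is compiled with the Apache Thrift Python generator into the pangramia.base_service package declared above; the handler name and port are illustrative only.

# Hedged sketch of a BaseService handler; names outside the IDL are assumptions.
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from thrift.server import TServer
from pangramia.base_service import BaseService   # assumed generated package

class BaseServiceHandler(object):
    def ping(self):
        return True

    def reportError(self, message, details):
        # details is a map<string, string> per the IDL
        print('reportError: %s %r' % (message, details))
        return True

    def shutdown(self):
        # oneway: the client does not wait for a reply
        pass

server = TServer.TSimpleServer(
    BaseService.Processor(BaseServiceHandler()),
    TSocket.TServerSocket(port=9090),             # illustrative port
    TTransport.TBufferedTransportFactory(),
    TBinaryProtocol.TBinaryProtocolFactory(),
)
# server.serve()  # blocking call; left commented out in this sketch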
63
thrift_model/services/yt_admin_ops.thrift
Normal file
@ -0,0 +1,63 @@
namespace py pangramia.yt.admin_ops
namespace java com.pangramia.yt.admin_ops

include "../data/common.thrift"
include "../data/exceptions.thrift"
include "base_service.thrift"

// Proxy and Account management
service YTAccountsOpService extends base_service.BaseService {

// AccountPairs
bool addAccountPair(1: string accountId, 2: string proxyId, 3: string machineId, 4: common.ProxyData proxyData, 5: optional common.AccountData accountData)
throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

common.AccountPairWithState getPair(1: string machineId)
throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

bool pair(1: string accountId, 2: string proxyId, 3:string machineId)
throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

bool unpair(1: string accountId, 2: string proxyId, 3:string machineId)
throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

list<common.AccountPairWithState> listAccountPairs(1: optional common.AccountPairState filter) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

// ManageAccounts
bool addAccount(1: string accountId, 2: optional common.AccountData accountData) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),


bool suspendAccount(1: string accountId) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

bool resumeAccount(1: string accountId) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

bool removeAccount(1: string accountId) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

list<string> listActiveAccounts() throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

// ManageProxy
bool addProxy(1: string proxyId, 2: common.ProxyData proxyData) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

bool suspendProxy(1: string proxyId) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

bool resumeProxy(1: string proxyId) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

bool removeProxy(1: string proxyId) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),

list<string> listActiveProxies() throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),
}
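Assuming generated Python code for the service above, the pairing workflow could be exercised roughly as sketched below. The package path, endpoint, and IDs are assumptions made for illustration, not part of this commit.

# Hedged sketch against YTAccountsOpService; only string-argument methods are used here.
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.yt.admin_ops import YTAccountsOpService   # assumed generated package

transport = TTransport.TBufferedTransport(TSocket.TSocket('localhost', 9090))  # assumed endpoint
admin = YTAccountsOpService.Client(TBinaryProtocol.TBinaryProtocol(transport))
transport.open()
admin.addAccount('acct-001')                      # accountData is optional in the IDL
admin.pair('acct-001', 'proxy-01', 'worker-01')
for p in admin.listAccountPairs():                # optional filter omitted
    print(p)
admin.unpair('acct-001', 'proxy-01', 'worker-01')
transport.close()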
27
thrift_model/services/yt_management.thrift
Normal file
@ -0,0 +1,27 @@
namespace py pangramia.yt.management
namespace java com.pangramia.yt.management

include "../data/common.thrift"
include "../data/exceptions.thrift"
include "base_service.thrift"

// Service for managing the state of shared resources like proxies and accounts.
// This service is intended to be run as a single, authoritative instance.
service YTManagementService extends base_service.BaseService {

// --- Proxy Management Methods ---
list<common.ProxyStatus> getProxyStatus(1: optional string serverIdentity) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
bool banProxy(1: string proxyUrl, 2: string serverIdentity) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
bool unbanProxy(1: string proxyUrl, 2: string serverIdentity) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
bool resetAllProxyStatuses(1: string serverIdentity) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
bool banAllProxies(1: string serverIdentity) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
bool deleteProxyFromRedis(1: string proxyUrl, 2: string serverIdentity) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
i32 deleteAllProxiesFromRedis(1: optional string serverIdentity) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),

// --- Account Management Methods ---
list<common.AccountStatus> getAccountStatus(1: optional string accountId, 2: optional string accountPrefix) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
bool banAccount(1: string accountId, 2: optional string reason) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
bool unbanAccount(1: string accountId, 2: optional string reason) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
bool deleteAccountFromRedis(1: string accountId) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp),
i32 deleteAllAccountsFromRedis(1: optional string accountPrefix) throws (1: exceptions.PBServiceException serviceExp, 2: exceptions.PBUserException userExp)
}
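In the same vein, a hedged sketch of the proxy/account management calls above; the package path, endpoint, proxy URL, and server identity are invented for the example.

from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.yt.management import YTManagementService   # assumed generated package

transport = TTransport.TBufferedTransport(TSocket.TSocket('localhost', 9090))  # assumed endpoint
mgmt = YTManagementService.Client(TBinaryProtocol.TBinaryProtocol(transport))
transport.open()
for status in mgmt.getProxyStatus('dl-server-01'):        # optional serverIdentity
    print(status)
mgmt.banProxy('socks5://127.0.0.1:1080', 'dl-server-01')
mgmt.unbanAccount('acct-001', 'manual unban after review')
transport.close()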
50
thrift_model/services/yt_tokens_ops.thrift
Normal file
@ -0,0 +1,50 @@
namespace py pangramia.yt.tokens_ops
namespace java com.pangramia.yt.tokens_ops

include "../data/common.thrift"
include "../data/exceptions.thrift"
include "yt_management.thrift"

// The unified service that combines token operations and management functions.
// The server implementation will decide which functions are active based on its role.
service YTTokenOpService extends yt_management.YTManagementService {

common.JobTokenData getOrRefreshTokenWithReport ( 1: string accountId,
2: string oldUrl,
3: common.JobState status,
4: optional string details,
5: optional string jobId,
6: optional common.TokenUpdateMode updateType = common.TokenUpdateMode.AUTO,
7: optional string url,
8: optional string clients,
9: optional common.AirflowLogContext airflowLogContext,
10: optional string requestParamsJson) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp)

common.JobTokenData getOrRefreshToken ( 1: string accountId,
2: optional common.TokenUpdateMode updateType = common.TokenUpdateMode.AUTO,
3: optional string url,
4: optional string clients,
5: optional string machineId,
6: optional common.AirflowLogContext airflowLogContext,
7: optional string requestParamsJson,
8: optional string assignedProxyUrl) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp)

common.JobTokenData getLatestToken (1: string accountId) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp),
common.JobTokenData refreshToken ( 1: string accountId,
2: optional common.TokenUpdateMode updateType = common.TokenUpdateMode.AUTO,
3: optional string url ) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp)
bool reportState( 1: string url,
2: common.JobState status,
3: optional string details,
4: optional string jobId) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp)

// New method for direct info.json generation, bypassing Node.js token generation.
common.JobTokenData getInfoJsonDirect(1: string url,
2: optional string clients) throws (1: exceptions.PBServiceException serviceExp,
2: exceptions.PBUserException userExp)
}
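The new getInfoJsonDirect call can be sketched the same way; per the comment above it returns a JobTokenData without going through the Node.js token generator. Client setup, endpoint, and URL are assumptions.

from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from pangramia.yt.tokens_ops import YTTokenOpService   # assumed generated package

transport = TTransport.TBufferedTransport(TSocket.TSocket('localhost', 9090))  # assumed endpoint
tokens = YTTokenOpService.Client(TBinaryProtocol.TBinaryProtocol(transport))
transport.open()
job = tokens.getInfoJsonDirect('https://www.youtube.com/watch?v=example', 'web')
print(job)   # common.JobTokenData; its field layout is defined in common.thrift
transport.close()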
@ -110,11 +110,15 @@ def generate_group_vars(cluster_config, group_vars_dir):
# Get master IP for Redis configuration
master_ip = list(cluster_config['master'].values())[0]['ip']

# Combine master and worker nodes to create a hostvars-like structure
all_nodes = {**cluster_config.get('master', {}), **cluster_config.get('workers', {})}

# Prepare data for YAML dump
generated_data = {
'master_host_ip': master_ip,
'redis_port': 52909,
'external_access_ips': external_ips if external_ips else []
'external_access_ips': external_ips if external_ips else [],
'hostvars': all_nodes
}
generated_data.update(global_vars)

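To make the new hostvars merge above concrete, a small illustrative input/output; the host names and IPs here are invented for the example, and only the dictionary merge itself comes from the diff.

# Illustrative only: shows what the {**master, **workers} merge produces.
cluster_config = {
    'master':  {'af-master':   {'ip': '10.0.0.10'}},
    'workers': {'af-worker-1': {'ip': '10.0.0.11'},
                'af-worker-2': {'ip': '10.0.0.12'}},
}
all_nodes = {**cluster_config.get('master', {}), **cluster_config.get('workers', {})}
# all_nodes now maps every node name to its vars and is written out as 'hostvars'
# in the generated group_vars, alongside master_host_ip and redis_port.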
@ -1,10 +1,11 @@
#!/bin/bash
#
# Syncs the project directory to a remote "tower" host for deployment orchestration.
# Syncs the project directory to a remote "jump" host for deployment orchestration.
#
# This script is designed to be run from the root of the project directory.
# It excludes generated files, local data, logs, and other non-essential files
# It syncs essential project files like source code, DAGs, and Ansible playbooks,
# to ensure a clean copy of the source code and configuration templates is synced.
# while excluding generated files, local data, logs, and other non-essential files
# to ensure a clean copy is deployed.

set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
@ -13,9 +14,9 @@ set -u # Treat unset variables as an error.
# IMPORTANT: Update these variables to match your environment.
#
# The remote host to sync to (e.g., user@hostname)
REMOTE_HOST="user@your-tower-host.com"
REMOTE_HOST="alex_p@af-jump"
# The destination path on the remote host
REMOTE_PATH="/path/to/your/project"
REMOTE_PATH="/home/alex_p/yt-ops-services"
# The root directory of the project on the local machine.
SOURCE_DIR="."

@ -46,8 +47,11 @@ EXCLUDE_OPTS=(
"--exclude=airflow/configs/envoy.yaml"
"--exclude=airflow/configs/docker-compose.camoufox.yaml"
"--exclude=airflow/configs/camoufox_endpoints.json"
"--exclude=cluster*.yml"
# Exclude local development notes
"--exclude=TODO-*.md"
# Exclude user-specific tools
"--exclude=*aider*"
)

# The rsync command:
@ -55,7 +59,9 @@ EXCLUDE_OPTS=(
# -v: verbose
# -z: compress file data during the transfer
# --delete: delete extraneous files from the destination directory
rsync -avz --delete \
# --partial: keep partially transferred files
# --progress: show progress during transfer
rsync -avz --delete --partial --progress \
"${EXCLUDE_OPTS[@]}" \
"$SOURCE_DIR/" \
"$REMOTE_HOST:$REMOTE_PATH/"
BIN
yt_ops_services/__pycache__/__init__.cpython-39.pyc
Normal file
Binary file not shown.
BIN
yt_ops_services/__pycache__/client_utils.cpython-39.pyc
Normal file
Binary file not shown.
BIN
yt_ops_services/__pycache__/version.cpython-39.pyc
Normal file
Binary file not shown.