# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.

"""
DAG to orchestrate ytdlp_ops_worker_per_url DAG runs based on a defined policy.
Workers pull their own URLs from a Redis queue; this DAG only ignites them in
controlled bunches.
"""

from airflow import DAG
from airflow.exceptions import AirflowException
from airflow.operators.python import PythonOperator
from airflow.models.param import Param
from airflow.models.variable import Variable
from airflow.utils.dates import days_ago
from airflow.api.common.trigger_dag import trigger_dag
from airflow.models.dag import DagModel

from datetime import timedelta

import logging
import time

# Configure logging
logger = logging.getLogger(__name__)

# Default settings
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TOTAL_WORKERS = 3
DEFAULT_WORKERS_PER_BUNCH = 1
DEFAULT_WORKER_DELAY_S = 5
DEFAULT_BUNCH_DELAY_S = 20

# Variable.get returns a string when the Variable is set in the metastore,
# so cast to int so the integer 'service_port' Param below validates.
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="16.162.82.212")
DEFAULT_YT_AUTH_SERVICE_PORT = int(Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080))

# --- Main Orchestration Callable ---

def orchestrate_workers_ignition_callable(**context):
    """
    Main orchestration logic. Triggers a specified number of worker DAGs
    to initiate self-sustaining processing loops.
    """
    params = context['params']
    logger.info("Starting worker ignition sequence.")

    # Refuse to ignite while the worker DAG is paused: triggered runs would
    # only queue up without executing.
    worker_dag_id = 'ytdlp_ops_worker_per_url'
    dag_model = DagModel.get_dagmodel(worker_dag_id)
    if dag_model and dag_model.is_paused:
        raise AirflowException(f"Worker DAG '{worker_dag_id}' is paused. Cannot start worker loops.")

    total_workers = int(params['total_workers'])
    workers_per_bunch = int(params['workers_per_bunch'])
    worker_delay = int(params['delay_between_workers_s'])
    bunch_delay = int(params['delay_between_bunches_s'])

    # Create a list of worker numbers to trigger, split into bunches
    worker_indices = list(range(total_workers))
    bunches = [worker_indices[i:i + workers_per_bunch] for i in range(0, len(worker_indices), workers_per_bunch)]
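    # Worked example (illustrative): total_workers=5, workers_per_bunch=2
    # gives worker_indices=[0, 1, 2, 3, 4] and bunches=[[0, 1], [2, 3], [4]].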

    # Get and parse worker hosts (which are used as queue names)
    worker_hosts_str = params.get('worker_hosts', 'celery@dl002')
    worker_hosts = [h.strip() for h in worker_hosts_str.split(',') if h.strip()]
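    # e.g. (illustrative) 'celery@dl002, celery@dl003' -> ['celery@dl002', 'celery@dl003']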
    if not worker_hosts:
        raise AirflowException("The 'worker_hosts' parameter cannot be empty.")

    logger.info(f"Plan: Starting {total_workers} total workers in {len(bunches)} bunches, distributing across hosts (queues): {worker_hosts}")

    dag_run_id = context['dag_run'].run_id
    total_triggered = 0
    for i, bunch in enumerate(bunches):
        logger.info(f"--- Igniting Bunch {i+1}/{len(bunches)} (contains {len(bunch)} worker(s)) ---")
        for j, _ in enumerate(bunch):
            # Create a unique run_id for each worker loop starter
            run_id = f"ignited_{dag_run_id}_{total_triggered}"

            # Pass all orchestrator params to the worker so it has the full context for its loop.
            conf_to_pass = dict(params)
            # The worker pulls its own URL, so we don't pass one.
            conf_to_pass.pop('url', None)

            # Assign host/queue in a round-robin fashion
            queue_for_worker = worker_hosts[total_triggered % len(worker_hosts)]
            conf_to_pass['queue'] = queue_for_worker
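            # Worked example (illustrative): with worker_hosts=['celery@dl002', 'celery@dl003'],
            # loops 0, 2, 4, ... land on dl002 and loops 1, 3, 5, ... land on dl003.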

            logger.info(f"Igniting worker {j+1}/{len(bunch)} in bunch {i+1} (loop {total_triggered + 1}/{total_workers}) on host (queue) '{queue_for_worker}' (Run ID: {run_id})")
            logger.debug(f"Full conf for worker loop {run_id}: {conf_to_pass}")
            trigger_dag(
                dag_id=worker_dag_id,
                run_id=run_id,
                conf=conf_to_pass,
                replace_microseconds=False
            )
            total_triggered += 1

            # Delay between workers in a bunch
            if j < len(bunch) - 1:
                logger.info(f"Waiting {worker_delay}s before next worker in bunch...")
                time.sleep(worker_delay)

        # Delay between bunches
        if i < len(bunches) - 1:
            logger.info(f"--- Bunch {i+1} ignited. Waiting {bunch_delay}s before next bunch... ---")
            time.sleep(bunch_delay)

    logger.info(f"--- Ignition sequence complete. Total worker loops started: {total_triggered}. ---")

# =============================================================================
# DAG Definition
# =============================================================================

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    'start_date': days_ago(1),
}
with DAG(
    dag_id='ytdlp_ops_orchestrator',
    default_args=default_args,
    schedule_interval=None,  # This DAG runs only when triggered.
    max_active_runs=1,  # Only one ignition process should run at a time.
    catchup=False,
    description='Ignition system for ytdlp_ops_worker_per_url DAGs. Starts self-sustaining worker loops.',
    doc_md="""
### YT-DLP Worker Ignition System

This DAG acts as an "ignition system" that starts one or more self-sustaining worker loops.
It does **not** process URLs itself. Its only job is to trigger a specified number of `ytdlp_ops_worker_per_url` DAG runs.

#### How it Works:

1. **Manual Trigger:** You manually trigger this DAG with parameters defining how many worker loops to start (`total_workers`) and in what configuration (`workers_per_bunch`, delays); an example trigger `conf` is shown below.
2. **Ignition:** The orchestrator triggers the initial set of worker DAGs in a "fire-and-forget" manner, passing all of its configuration parameters to them.
3. **Completion:** Once all initial workers have been triggered, the orchestrator's job is complete.

The workers then take over, each running its own continuous processing loop.
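
#### Example Trigger Conf

A minimal example `conf` (values are illustrative; omitted parameters keep their defaults):

```json
{
    "total_workers": 4,
    "workers_per_bunch": 2,
    "delay_between_workers_s": 5,
    "delay_between_bunches_s": 20,
    "worker_hosts": "celery@dl002,celery@dl003"
}
```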
""",
    tags=['ytdlp', 'mgmt', 'master'],
    params={
        # --- Ignition Control Parameters ---
        'total_workers': Param(DEFAULT_TOTAL_WORKERS, type="integer", description="Total number of worker loops to start."),
        'workers_per_bunch': Param(DEFAULT_WORKERS_PER_BUNCH, type="integer", description="Number of workers to start in each bunch."),
        'delay_between_workers_s': Param(DEFAULT_WORKER_DELAY_S, type="integer", description="Delay in seconds between starting each worker within a bunch."),
        'delay_between_bunches_s': Param(DEFAULT_BUNCH_DELAY_S, type="integer", description="Delay in seconds between starting each bunch."),

        # --- Worker Passthrough Parameters ---
        'worker_hosts': Param('celery@dl002', type="string", title="[Worker Param] Worker Hosts", description="Comma-separated list of Celery worker hostnames (e.g. 'celery@dl002') to distribute workers across. The hostnames double as queue names, and workers are assigned to them round-robin."),
        'on_bannable_failure': Param(
            'retry_with_new_account',
            type="string",
            enum=['stop_loop', 'retry_with_new_account'],
            title="[Worker Param] On Bannable Failure Policy",
            description="Policy for a worker when a bannable error occurs. "
                        "'stop_loop': Ban the account, mark the URL as failed, and stop the worker's loop. "
                        "'retry_with_new_account': Ban the failed account and retry ONCE with a new account. If the retry also fails, ban the second account and the proxy, then stop."
        ),
        'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="[Worker Param] Base name for Redis queues."),
        'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="[Worker Param] Airflow Redis connection ID."),
        'clients': Param('mweb,ios,android', type="string", description="[Worker Param] Comma-separated list of clients for token generation."),
        'account_pool': Param('ytdlp_account', type="string", description="[Worker Param] Account pool prefix or comma-separated list."),
        'account_pool_size': Param(10, type=["integer", "null"], description="[Worker Param] If 'account_pool' is a prefix, the number of accounts to generate (e.g. 10 yields 'prefix_01' through 'prefix_10'). Required when using a prefix."),
        'service_ip': Param(DEFAULT_YT_AUTH_SERVICE_IP, type="string", description="[Worker Param] IP of the ytdlp-ops-server. Defaults to the Airflow Variable YT_AUTH_SERVICE_IP, falling back to a hardcoded value."),
        'service_port': Param(DEFAULT_YT_AUTH_SERVICE_PORT, type="integer", description="[Worker Param] Port of the Envoy load balancer. Defaults to the Airflow Variable YT_AUTH_SERVICE_PORT, falling back to a hardcoded value."),
        'machine_id': Param("ytdlp-ops-airflow-service", type="string", description="[Worker Param] Identifier for the client machine."),
        'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean", description="[Worker Param] If True and all accounts in a prefix-based pool are exhausted, create a new one automatically."),
        'retrigger_delay_on_empty_s': Param(60, type="integer", description="[Worker Param] Delay in seconds before a worker re-triggers itself when the queue is empty. Set to -1 to stop the loop."),
    }
) as dag:

    orchestrate_task = PythonOperator(
        task_id='start_worker_loops',
        python_callable=orchestrate_workers_ignition_callable,
    )
    orchestrate_task.doc_md = """
### Start Worker Loops

This is the main task that executes the ignition policy.

- It triggers `ytdlp_ops_worker_per_url` DAGs according to the batch settings.
- It passes all of its parameters down to the workers, which use them to run their continuous loops.
"""