# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl
#
# Distributed under terms of the MIT license.

"""
DAG for processing a single YouTube URL passed via DAG run configuration.
This is the "Worker" part of a Sensor/Worker pattern.
"""

from airflow import DAG
from airflow.exceptions import AirflowException, AirflowSkipException
from airflow.models import BaseOperator, Variable
from airflow.models.param import Param
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from airflow.utils.decorators import apply_defaults
from datetime import datetime, timedelta
from pangramia.yt.common.ttypes import TokenUpdateMode
from pangramia.yt.exceptions.ttypes import PBServiceException
from pangramia.yt.tokens_ops import YTTokenOpService
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from thrift.transport.TTransport import TTransportException
import json
import logging
import os
import redis
import socket
import time
import traceback

# Import utility functions
from utils.redis_utils import _get_redis_client

# Configure logging
logger = logging.getLogger(__name__)

# Default settings
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_MAX_URLS = 1
DEFAULT_TIMEOUT = 30  # Default Thrift timeout in seconds


# --- Helper Functions ---

def _extract_video_id(url):
    """Extracts YouTube video ID from URL."""
    if not url or not isinstance(url, str):
        logger.debug("URL is empty or not a string, cannot extract video ID.")
        return None
    try:
        video_id = None
        if 'youtube.com/watch?v=' in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif 'youtu.be/' in url:
            video_id = url.split('youtu.be/')[1].split('?')[0]

        if video_id and len(video_id) >= 11:
            video_id = video_id[:11]  # Standard ID length
            logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
            return video_id
        else:
            logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
            return None
    except Exception as e:
        logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
        return None
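
# Illustrative behaviour of the helper above (reference only; these calls are not
# executed anywhere in this module):
#   _extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=5")  -> "dQw4w9WgXcQ"
#   _extract_video_id("https://youtu.be/dQw4w9WgXcQ?si=share")            -> "dQw4w9WgXcQ"
#   _extract_video_id("https://example.com/not-a-youtube-url")            -> None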


# --- Queue Management Callables (for success/failure reporting) ---

def mark_proxy_banned_callable(**context):
    """Makes a Thrift call to ban a proxy if the get_token task failed with a bannable error."""
    ti = context['task_instance']
    proxy_to_ban = ti.xcom_pull(task_ids='get_token', key='proxy_to_ban')
    if not proxy_to_ban:
        logger.info("No proxy to ban was pushed to XCom. Skipping task.")
        raise AirflowSkipException("No proxy to ban was identified in the upstream failure.")

    server_identity = ti.xcom_pull(task_ids='get_token', key='server_identity_for_ban')
    host = ti.xcom_pull(task_ids='get_token', key='service_host_for_ban')
    port = ti.xcom_pull(task_ids='get_token', key='service_port_for_ban')

    if not all([server_identity, host, port]):
        logger.error("Missing connection details (identity, host, or port) from XCom. Cannot ban proxy.")
        raise AirflowException("Missing connection details to ban proxy.")

    logger.warning(f"Attempting to ban proxy '{proxy_to_ban}' for server '{server_identity}' at {host}:{port}.")
    transport = None
    try:
        socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)
        socket_conn.setTimeout(15 * 1000)  # 15s timeout for ban call
        transport = TTransport.TFramedTransport(socket_conn)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = YTTokenOpService.Client(protocol)
        transport.open()
        client.banProxy(proxyUrl=proxy_to_ban, serverIdentity=server_identity)
        logger.info(f"Successfully sent request to ban proxy '{proxy_to_ban}'.")
    except Exception as ban_exc:
        logger.error(f"Failed to send ban request for proxy '{proxy_to_ban}': {ban_exc}", exc_info=True)
        # We should fail the task if the ban call fails, as it's an important side-effect.
        raise AirflowException(f"Failed to ban proxy: {ban_exc}")
    finally:
        if transport and transport.isOpen():
            transport.close()


def mark_url_as_success(**context):
    """Records the result of a successful run in the Redis result hash."""
    ti = context['task_instance']
    params = context['params']
    url = params.get('url')  # Get URL from params, not XCom
    if not url:
        logger.warning("mark_url_as_success called but no URL found in DAG run parameters.")
        return

    queue_name = params['queue_name']
    result_queue = f"{queue_name}_result"
    redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)

    # Pull results from previous tasks
    info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
    socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
    ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
    downloaded_file_path = ti.xcom_pull(task_ids='download_and_probe')

    logger.info(f"Handling success for URL: {url}")
    logger.info(f"  Downloaded File Path: {downloaded_file_path}")

    result_data = {
        'status': 'success',
        'end_time': time.time(),
        'info_json_path': info_json_path,
        'socks_proxy': socks_proxy,
        'ytdlp_command': ytdlp_command,
        'downloaded_file_path': downloaded_file_path,
        'url': url,
        'dag_run_id': context['dag_run'].run_id,
    }

    try:
        # In the worker pattern, there's no "progress" hash to remove from.
        # We just add the result to the success hash.
        client = _get_redis_client(redis_conn_id)
        client.hset(result_queue, url, json.dumps(result_data))
        logger.info(f"Stored success result for URL '{url}' in result hash '{result_queue}'.")
    except Exception as e:
        logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True)
        # Log error but don't fail the task, as the main work succeeded.


def mark_url_as_failed(**context):
    """
    Handles failed processing. Records detailed error information to the fail hash
    and, if stop_on_failure is True, fails the task to make the DAG run failure visible.
    """
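    # Redis key layout assumed by this module (all names derive from the `queue_name` param;
    # the companion sensor DAG is expected to consume the same keys):
    #   {queue_name}_inbox   - list of URLs awaiting processing (failed URLs may be re-queued here)
    #   {queue_name}_result  - hash: url -> success record (written by mark_url_as_success)
    #   {queue_name}_fail    - hash: url -> failure record (written below)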
""" ti = context['task_instance'] params = context['params'] url = params.get('url') # Get URL from params if not url: logger.error("mark_url_as_failed called but no URL found in DAG run parameters.") return queue_name = params['queue_name'] fail_queue = f"{queue_name}_fail" inbox_queue = f"{queue_name}_inbox" redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID) stop_on_failure = params.get('stop_on_failure', True) # Determine if we should requeue based on various parameters should_requeue = params.get('requeue_on_failure', False) requeue_on_bannable_error = params.get('requeue_on_bannable_error', False) requeue_on_ffprobe_failure = params.get('requeue_on_ffprobe_failure', False) # --- Extract Detailed Error Information --- exception = context.get('exception') # Find the specific task that failed to pull its XComs dag_run = context['dag_run'] failed_task_id = "unknown" upstream_tasks = ti.task.get_direct_relatives(upstream=True) for task in upstream_tasks: upstream_ti = dag_run.get_task_instance(task_id=task.task_id) if upstream_ti and upstream_ti.state == 'failed': failed_task_id = task.task_id break error_details = None if failed_task_id != "unknown": error_details = ti.xcom_pull(task_ids=failed_task_id, key='error_details') if error_details: error_message = error_details.get('error_message', 'Unknown error from XCom') error_type = error_details.get('error_type', 'Unknown type from XCom') tb_str = error_details.get('traceback', 'No traceback in XCom.') else: error_message = str(exception) if exception else "Unknown error" error_type = type(exception).__name__ if exception else "Unknown" tb_str = "".join(traceback.format_exception(etype=type(exception), value=exception, tb=exception.__traceback__)) if exception else "No traceback available." logger.info(f"Handling failure for URL: {url}") logger.error(f" Failed Task: {failed_task_id}") logger.error(f" Failure Type: {error_type}") logger.error(f" Failure Reason: {error_message}") logger.debug(f" Traceback:\n{tb_str}") # --- Check for specific requeue conditions --- if not should_requeue: # Only check specific conditions if the general one is false if requeue_on_bannable_error and isinstance(exception, PBServiceException): bannable_error_codes = [ "BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED", "SOCKS5_CONNECTION_FAILED", "CLIENT_TIMEOUT", "GLOBAL_TIMEOUT" ] if hasattr(exception, 'errorCode') and exception.errorCode in bannable_error_codes: should_requeue = True logger.info(f"Bannable error '{exception.errorCode}' detected. Re-queuing URL as per 'requeue_on_bannable_error' param.") if requeue_on_ffprobe_failure and isinstance(exception, AirflowException) and "Bash command failed" in str(exception): # Check for the specific exit code for probe failure if "exit code 2" in str(exception): should_requeue = True logger.info("Probe failure detected (exit code 2). 
    try:
        client = _get_redis_client(redis_conn_id)
        if should_requeue:
            client.rpush(inbox_queue, url)
            logger.info(f"Re-queued failed URL '{url}' to inbox '{inbox_queue}' for retry.")
        else:
            fail_data = {
                'status': 'failed',
                'end_time': time.time(),
                'failed_task': failed_task_id,
                'error_type': error_type,
                'error_message': error_message,
                'traceback': tb_str,
                'url': url,
                'dag_run_id': context['dag_run'].run_id,
            }
            client.hset(fail_queue, url, json.dumps(fail_data, indent=2))
            logger.info(f"Stored detailed failure info for URL '{url}' in fail hash '{fail_queue}'.")
    except Exception as e:
        logger.error(f"Critical error during failure handling in Redis for URL '{url}': {e}", exc_info=True)
        # This is a critical error in the failure handling logic itself.
        raise AirflowException(f"Could not handle failure in Redis: {e}")

    # If stop_on_failure is True, we should fail this task to make the DAG run fail.
    # The loop is already stopped by the DAG structure, but this makes the failure visible.
    if stop_on_failure:
        logger.error("stop_on_failure is True. Failing this task to mark the DAG run as failed.")
        # Re-raise the original exception to fail the task instance.
        if exception:
            raise exception
        else:
            # If we got details from XCom, we don't have the original exception object.
            # So, we raise a new AirflowException with the details we have.
            raise AirflowException(f"Failing task as per stop_on_failure=True. Upstream error: [{error_type}] {error_message}")
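

# The Thrift surface this module relies on (a reference sketch only; the authoritative
# signatures are the generated stubs in pangramia.yt.tokens_ops and the *.ttypes modules):
#   client.ping()                                                   - liveness check
#   client.getOrRefreshToken(accountId, updateType, url, clients)   - returns token data whose
#       infoJson, socks proxy attribute, and ytdlpCommand fields are consumed by the tasks below
#   client.banProxy(proxyUrl, serverIdentity)                       - bans a misbehaving proxy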


# --- YtdlpOpsOperator ---

class YtdlpOpsOperator(BaseOperator):
    """
    Custom Airflow operator to interact with YTDLP Thrift service.
    Processes a single URL passed via DAG run configuration.
    """
    template_fields = ('service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')

    @apply_defaults
    def __init__(self, service_ip=None, service_port=None, account_id=None,
                 info_json_dir=None, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger.info(f"Initializing YtdlpOpsOperator (Worker Version) with parameters: "
                    f"service_ip={service_ip}, service_port={service_port}, "
                    f"account_id={account_id}, info_json_dir={info_json_dir}, timeout={timeout}")

        if not service_ip or not service_port:
            raise ValueError("Both service_ip and service_port must be specified.")
        if not account_id:
            logger.warning("No account_id provided. Ensure it's set in DAG params or operator config.")

        self.service_ip = service_ip
        self.service_port = service_port
        self.account_id = account_id
        self.info_json_dir = info_json_dir
        self.timeout = timeout

    def execute(self, context):
        logger.info("Executing YtdlpOpsOperator (Worker Version)")
        transport = None
        ti = context['task_instance']

        # Define connection parameters outside the try block to be available in except blocks
        params = context['params']
        url = params.get('url')
        if not url:
            raise AirflowException("DAG was triggered without a 'url' in its configuration.")

        service_ip = self.render_template(self.service_ip, context)
        service_port_rendered = self.render_template(self.service_port, context)
        account_id = self.render_template(self.account_id, context)
        timeout_rendered = self.render_template(self.timeout, context)
        info_json_dir = self.render_template(self.info_json_dir, context)

        host = params.get('service_ip', service_ip)
        port_str = params.get('service_port', service_port_rendered)
        account_id = params.get('account_id', account_id)
        clients = params.get('clients')

        logger.info(f"Using direct connection settings: service_ip={host}, service_port={port_str}")

        if not host or not port_str:
            raise ValueError("Direct connection requires service_ip and service_port")
        try:
            port = int(port_str)
        except (ValueError, TypeError):
            raise ValueError(f"Invalid service_port value: {port_str}")

        try:
            timeout = int(timeout_rendered)
            if timeout <= 0:
                raise ValueError("Timeout must be positive")
        except (ValueError, TypeError):
            timeout = DEFAULT_TIMEOUT

        try:
            logger.info(f"Processing URL from DAG run config: {url}")
            socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)
            socket_conn.setTimeout(timeout * 1000)
            transport = TTransport.TFramedTransport(socket_conn)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = YTTokenOpService.Client(protocol)
            transport.open()
            logger.info("Successfully connected to Thrift server.")

            client.ping()
            logger.info("Server ping successful.")

            token_data = client.getOrRefreshToken(
                accountId=account_id,
                updateType=TokenUpdateMode.AUTO,
                url=url,
                clients=clients
            )
            logger.info("Successfully retrieved token data from service.")

            info_json_path = None
            info_json = self._get_info_json(token_data)
            if info_json and self._is_valid_json(info_json):
                info_json_path = self._save_info_json(context, info_json, url, account_id, info_json_dir)
                if info_json_path:
                    ti.xcom_push(key='info_json_path', value=info_json_path)
                else:
                    ti.xcom_push(key='info_json_path', value=None)
            else:
                ti.xcom_push(key='info_json_path', value=None)

            socks_proxy = None
            proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
            if proxy_attr:
                socks_proxy = getattr(token_data, proxy_attr)
            ti.xcom_push(key='socks_proxy', value=socks_proxy)

            ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
            ti.xcom_push(key='ytdlp_command', value=ytdlp_cmd)

        except (PBServiceException, TTransportException) as e:
            # Enhanced logging to make failures clear in Airflow logs.
logger.error(f"Thrift call failed for URL '{url}' with account '{account_id}'.") logger.error(f"Exception Type: {type(e).__name__}") logger.error(f"Exception Message: {getattr(e, 'message', str(e))}") if isinstance(e, PBServiceException): logger.error(f"Service Error Code: {getattr(e, 'errorCode', 'N/A')}") if hasattr(e, 'context') and e.context: logger.error(f"Service Context: {e.context}") # Use exc_info=True to get the full traceback in the logs logger.error("Full exception traceback:", exc_info=True) # Push exception details to XCom for the failure handler error_details = { 'error_message': getattr(e, 'message', str(e)), 'error_type': type(e).__name__, 'traceback': traceback.format_exc() } ti.xcom_push(key='error_details', value=error_details) proxy_to_ban = None if isinstance(e, PBServiceException) and hasattr(e, 'context') and e.context: # Assuming server adds 'proxy_url' to context on failure proxy_to_ban = e.context.get('proxy_url') bannable_error_codes = [ "BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED", "SOCKS5_CONNECTION_FAILED", "CLIENT_TIMEOUT", "GLOBAL_TIMEOUT" ] if e.errorCode not in bannable_error_codes: proxy_to_ban = None if proxy_to_ban: logger.info(f"Found proxy to ban: {proxy_to_ban}. Pushing to XCom for 'mark_proxy_banned' task.") ti.xcom_push(key='proxy_to_ban', value=proxy_to_ban) ti.xcom_push(key='server_identity_for_ban', value=account_id) ti.xcom_push(key='service_host_for_ban', value=host) ti.xcom_push(key='service_port_for_ban', value=port) else: logger.info("No specific proxy to ban based on the error context.") # Push None explicitly so the downstream task knows not to run ti.xcom_push(key='proxy_to_ban', value=None) # Re-raise the original exception to fail the Airflow task raise e except Exception as e: logger.error(f"YtdlpOpsOperator (Worker) failed with an unexpected exception: {e}", exc_info=True) raise AirflowException(f"Task failed with unexpected error: {e}") finally: if transport and transport.isOpen(): transport.close() def _get_info_json(self, token_data): return getattr(token_data, 'infoJson', None) def _is_valid_json(self, json_str): if not json_str or not isinstance(json_str, str): return False try: json.loads(json_str) return True except json.JSONDecodeError: return False def _save_info_json(self, context, info_json, url, account_id, rendered_info_json_dir): try: video_id = _extract_video_id(url) save_dir = rendered_info_json_dir or "." os.makedirs(save_dir, exist_ok=True) timestamp = int(time.time()) base_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json" info_json_path = os.path.join(save_dir, base_filename) with open(info_json_path, 'w', encoding='utf-8') as f: f.write(info_json) return info_json_path except Exception as e: logger.error(f"Failed to save info.json: {e}", exc_info=True) return None # ============================================================================= # DAG Definition # ============================================================================= default_args = { 'owner': 'airflow', 'depends_on_past': False, 'email_on_failure': False, 'email_on_retry': False, 'retries': 0, 'retry_delay': timedelta(minutes=1), 'start_date': days_ago(1), } with DAG( dag_id='ytdlp_ops_worker_per_url', default_args=default_args, schedule_interval=None, catchup=False, description='Processes a single YouTube URL passed via configuration.', tags=['ytdlp', 'thrift', 'client', 'worker'], params={ 'url': Param(None, type=["string", "null"], description="The YouTube URL to process. 

with DAG(
    dag_id='ytdlp_ops_worker_per_url',
    default_args=default_args,
    schedule_interval=None,
    catchup=False,
    description='Processes a single YouTube URL passed via configuration.',
    tags=['ytdlp', 'thrift', 'client', 'worker'],
    params={
        'url': Param(None, type=["string", "null"], description="The YouTube URL to process. This is set by the triggering DAG."),
        # Sensor params (passed through to re-trigger the sensor, with defaults for standalone runs)
        'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="Sensor param: Base name for Redis queues."),
        'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Sensor param: Airflow Redis connection ID."),
        'max_urls_per_run': Param(DEFAULT_MAX_URLS, type="integer", description="Sensor param: Maximum number of URLs to process in one batch."),
        # Worker-specific params
        'service_ip': Param('89.253.221.173', type="string", description="Service IP."),
        'service_port': Param(9090, type="integer", description="Service port."),
        'account_id': Param('default_account', type="string", description="Account ID for the API call."),
        'clients': Param('ios', type="string", description="Comma-separated list of clients to use for token generation (e.g., 'ios,android,mweb')."),
        'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
        'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string", description="yt-dlp format selection string."),
        'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string", description="yt-dlp output filename template."),
        'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json."),
        'requeue_on_failure': Param(False, type="boolean", description="If True, re-adds the URL to the inbox on failure instead of moving to the fail hash."),
        'stop_on_failure': Param(True, type="boolean", description="If True, a worker failure will stop the entire processing loop."),
        'retry_on_probe_failure': Param(False, type="boolean", description="If True, attempts to re-download and probe a file if the initial probe fails."),
        'requeue_on_bannable_error': Param(False, type="boolean", description="If True, re-queues the URL if a bannable error (proxy, bot detection) occurs."),
        'requeue_on_ffprobe_failure': Param(False, type="boolean", description="If True, re-queues the URL if the ffmpeg/ffprobe check fails."),
    }
) as dag:

    get_token = YtdlpOpsOperator(
        task_id='get_token',
        service_ip="{{ params.service_ip }}",
        service_port="{{ params.service_port }}",
        account_id="{{ params.account_id }}",
        timeout="{{ params.timeout }}",
        info_json_dir="{{ params.info_json_dir }}",
    )

    download_and_probe = BashOperator(
        task_id='download_and_probe',
        bash_command="""
        set -e

        INFO_JSON_PATH="{{ ti.xcom_pull(task_ids='get_token', key='info_json_path') }}"
        PROXY="{{ ti.xcom_pull(task_ids='get_token', key='socks_proxy') }}"
        FORMAT="{{ params.download_format }}"
        DOWNLOAD_DIR="{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles/video') }}"
        FILENAME_TEMPLATE="{{ params.output_path_template }}"
        FULL_OUTPUT_PATH="$DOWNLOAD_DIR/$FILENAME_TEMPLATE"

        echo "--- Starting Download Step ---"
        echo "Info JSON Path: $INFO_JSON_PATH"
        echo "Proxy: $PROXY"
        echo "Format: $FORMAT"
        echo "Download Directory: $DOWNLOAD_DIR"
        echo "Full Output Path: $FULL_OUTPUT_PATH"

        if [ -z "$INFO_JSON_PATH" ] || [ ! -f "$INFO_JSON_PATH" ]; then
            echo "Error: info.json path is missing or file does not exist ($INFO_JSON_PATH)."
            exit 1
        fi
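
        # Reference notes on the command built below (assuming a reasonably recent yt-dlp):
        #  * --load-info-json reuses the metadata fetched by get_token, so no second
        #    YouTube extraction happens here; the proxy only carries the media download.
        #  * --print filename together with --no-simulate makes yt-dlp download AND print
        #    the resolved output path, which is captured in FINAL_FILENAME and echoed as
        #    the last line of stdout so the BashOperator pushes it to XCom.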
        CMD_ARRAY=(yt-dlp --load-info-json "$INFO_JSON_PATH")
        if [ -n "$PROXY" ]; then
            CMD_ARRAY+=(--proxy "$PROXY")
        fi
        CMD_ARRAY+=(-f "$FORMAT" -o "$FULL_OUTPUT_PATH" --print filename)
        CMD_ARRAY+=(--continue --no-progress --no-simulate --no-write-info-json --ignore-errors --no-playlist)

        echo "Executing: $(printf "%q " "${CMD_ARRAY[@]}")"
        # Capture the exit code without letting set -e abort before the checks below run.
        EXIT_CODE=0
        FINAL_FILENAME=$("${CMD_ARRAY[@]}") || EXIT_CODE=$?
        echo "yt-dlp exited with code: $EXIT_CODE"

        if [ $EXIT_CODE -ne 0 ]; then
            echo "Error: yt-dlp command failed."
            exit $EXIT_CODE
        fi

        if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then
            echo "Error: Download failed or did not produce a file."
            exit 1
        fi
        echo "SUCCESS: Download complete. Final file at: $FINAL_FILENAME"

        echo "--- Starting Probe Step ---"
        echo "Probing downloaded file: $FINAL_FILENAME"
        if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
            echo "Error: ffmpeg probe check failed for '$FINAL_FILENAME'. The file might be corrupt."
            if [ "{{ params.retry_on_probe_failure }}" == "True" ]; then
                echo "Attempting one retry on probe failure..."
                echo "Renaming to .part to attempt resuming download."
                mv -f "$FINAL_FILENAME" "$FINAL_FILENAME.part"

                # Re-run download command
                echo "Re-executing: $(printf "%q " "${CMD_ARRAY[@]}")"
                EXIT_CODE=0
                FINAL_FILENAME=$("${CMD_ARRAY[@]}") || EXIT_CODE=$?
                echo "yt-dlp retry exited with code: $EXIT_CODE"
                if [ $EXIT_CODE -ne 0 ]; then
                    echo "Error: yt-dlp retry command failed."
                    exit $EXIT_CODE
                fi
                if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then
                    echo "Error: Retry download failed or did not produce a file."
                    exit 1
                fi
                echo "SUCCESS: Retry download complete. Final file at: $FINAL_FILENAME"

                # Re-probe
                echo "Probing redownloaded file: $FINAL_FILENAME"
                if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
                    echo "Error: ffmpeg probe check failed again for '$FINAL_FILENAME'. Failing with exit code 2."
                    exit 2
                fi
            else
                echo "Failing with exit code 2 due to probe failure (retries disabled)."
                exit 2
            fi
        fi
        echo "SUCCESS: Probe confirmed valid media file."

        # Push the final filename for the success_task
        echo "$FINAL_FILENAME"
        """,
        retries=0,  # Retries are now handled inside the script based on a DAG param
        retry_delay=timedelta(minutes=1),
    )

    mark_proxy_banned = PythonOperator(
        task_id='mark_proxy_banned',
        python_callable=mark_proxy_banned_callable,
        trigger_rule='one_failed',  # Run only if get_token fails
    )

    # This task triggers the sensor DAG to check for more work as soon as this worker is done.
    trigger_sensor_for_next_batch = TriggerDagRunOperator(
        task_id='trigger_sensor_for_next_batch',
        trigger_dag_id='ytdlp_ops_sensor_queue',
        # Pass only the sensor's needed parameters back to it.
        # These values were originally passed from the sensor to this worker.
        # The values are templated and will be passed as strings to the triggered DAG.
        conf={
            "queue_name": "{{ params.queue_name }}",
            "redis_conn_id": "{{ params.redis_conn_id }}",
            "max_urls_per_run": "{{ params.max_urls_per_run }}",
        },
        # This task will only run on the success path, so it inherits the default
        # trigger_rule='all_success'.
        wait_for_completion=False,
    )
    trigger_sensor_for_next_batch.doc_md = """
    ### Trigger Sensor for Next Batch
    Triggers a new run of the `ytdlp_ops_sensor_queue` DAG to create a continuous processing loop.
    This task **only runs on the success path** after a URL has been fully processed.
    This ensures that the system immediately checks for more URLs to process, but stops the loop on failure.
    """
""" # Define success and failure handling tasks success_task = PythonOperator( task_id='mark_url_as_success', python_callable=mark_url_as_success, trigger_rule='all_success', # Run only if upstream tasks succeeded ) failure_task = PythonOperator( task_id='mark_url_as_failed', python_callable=mark_url_as_failed, trigger_rule='one_failed', # Run if any upstream task failed ) # --- Define Task Dependencies --- # The main success flow get_token >> download_and_probe >> success_task >> trigger_sensor_for_next_batch # The failure path for get_token, which includes the explicit ban task get_token >> mark_proxy_banned # The main failure handler, which listens to the primary tasks. # If get_token or download_and_probe fails, it will trigger failure_task. [get_token, download_and_probe] >> failure_task