# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.
"""
DAG for processing a single YouTube URL passed via DAG run configuration.
This is the "Worker" part of a Sensor/Worker pattern.
"""
from airflow import DAG
from airflow.exceptions import AirflowException, AirflowSkipException
from airflow.models import BaseOperator, Variable
from airflow.models.param import Param
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago
from airflow.utils.decorators import apply_defaults
from airflow.utils.task_group import TaskGroup
from datetime import datetime, timedelta
from airflow.api.common.trigger_dag import trigger_dag
from pangramia.yt.common.ttypes import TokenUpdateMode
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
from pangramia.yt.tokens_ops import YTTokenOpService
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from thrift.transport.TTransport import TTransportException
import json
import logging
import os
import random
import redis
import socket
import time
import traceback
import inspect
import uuid
# Import utility functions
from utils.redis_utils import _get_redis_client
# Configure logging
logger = logging.getLogger(__name__)
# Default settings
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_MAX_URLS = 1
DEFAULT_TIMEOUT = 180 # Default Thrift timeout in seconds
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="16.162.82.212")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)
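# Redis key layout used by this worker (all derived from the 'queue_name' param):
#   <queue_name>_inbox            - list of URLs waiting to be processed (LPOP'd once per run)
#   <queue_name>_result           - hash of URL -> success payload (info.json path, proxy, file path, ...)
#   <queue_name>_fail             - hash of URL -> detailed failure report
#   account_status:<account_id>   - hash holding the account's status (e.g. ACTIVE, BANNED, RESTING)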
# --- Helper Functions ---
def _get_thrift_client(host, port, timeout):
"""Helper to create and connect a Thrift client."""
transport = TSocket.TSocket(host, port)
transport.setTimeout(timeout * 1000)
transport = TTransport.TFramedTransport(transport)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
transport.open()
logger.info(f"Connected to Thrift server at {host}:{port}")
return client, transport
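# Typical usage (mirrors the callers below): open a client per call and always close the
# framed transport in a `finally` block, e.g.
#   client, transport = _get_thrift_client(host, port, timeout)
#   try:
#       client.ping()
#   finally:
#       if transport and transport.isOpen():
#           transport.close()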
def _extract_video_id(url):
"""Extracts YouTube video ID from URL."""
if not url or not isinstance(url, str):
logger.debug("URL is empty or not a string, cannot extract video ID.")
return None
try:
video_id = None
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0]
if video_id and len(video_id) >= 11:
video_id = video_id[:11] # Standard ID length
logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
return video_id
else:
logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
return None
except Exception as e:
logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
return None
# --- Queue Management Callables (for success/failure reporting) ---
def mark_url_as_success_callable(**context):
"""Moves URL from progress to result hash on success."""
ti = context['task_instance']
params = context['params']
url = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='url_to_process')
if not url:
logger.warning("mark_url_as_success called but no URL found in DAG run parameters.")
return
queue_name = params['queue_name']
result_queue = f"{queue_name}_result"
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
# Pull results from previous tasks
info_json_path = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='info_json_path') or \
ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='info_json_path')
socks_proxy = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='socks_proxy') or \
ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='socks_proxy')
ytdlp_command = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='ytdlp_command') or \
ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='ytdlp_command')
downloaded_file_path = ti.xcom_pull(task_ids='download_and_probe')
logger.info(f"Handling success for URL: {url}")
logger.info(f" Downloaded File Path: {downloaded_file_path}")
result_data = {
'status': 'success',
'end_time': time.time(),
'info_json_path': info_json_path,
'socks_proxy': socks_proxy,
'ytdlp_command': ytdlp_command,
'downloaded_file_path': downloaded_file_path,
'url': url,
'dag_run_id': context['dag_run'].run_id,
}
try:
# In the worker pattern, there's no "progress" hash to remove from.
# We just add the result to the success hash.
client = _get_redis_client(redis_conn_id)
client.hset(result_queue, url, json.dumps(result_data))
logger.info(f"Stored success result for URL '{url}' in result hash '{result_queue}'.")
except Exception as e:
logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True)
# Log error but don't fail the task, as the main work succeeded.
def handle_failure_callable(**context):
"""
Handles a failed processing run by recording rich, detailed error information to Redis.
"""
ti = context['task_instance']
params = context['params']
dag_run = context['dag_run']
url = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='url_to_process')
if not url:
# This can happen if pull_url_and_assign_account itself fails.
# We can't record a URL-specific failure, but we should log it.
failed_tis = [ti for ti in dag_run.get_task_instances() if ti.state == 'failed']
failed_task_ids = [ti.task_id for ti in failed_tis]
logger.error(f"handle_failure_callable was triggered for run {dag_run.run_id}, but no URL was found in XCom. "
f"This likely means an early task failed. Failed tasks in run: {failed_task_ids}")
return
# --- Start building the rich error report ---
failure_report = {
'url': url,
'dag_run_id': dag_run.run_id,
'failure_timestamp': datetime.now().isoformat(),
'failed_task': 'unknown',
'failure_summary': 'An unknown error occurred.',
'failure_history': [],
'download_error': None,
'generic_error': None
}
# --- Gather data from token acquisition attempts ---
# Attempt 1: get_token
get_token_ti = dag_run.get_task_instance('acquire_token_with_retry.get_token')
if get_token_ti:
error_details_1 = ti.xcom_pull(task_ids=get_token_ti.task_id, key='error_details')
account_1 = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='account_id')
attempt_1_report = {
'task_id': get_token_ti.task_id,
'account_id': account_1,
'status': get_token_ti.state,
'start_date': get_token_ti.start_date.isoformat() if get_token_ti.start_date else None,
'end_date': get_token_ti.end_date.isoformat() if get_token_ti.end_date else None,
}
if error_details_1:
attempt_1_report.update({
'proxy_url': error_details_1.get('proxy_url'),
'error_code': error_details_1.get('error_code'),
'error_message': error_details_1.get('error_message'),
})
failure_report['failure_history'].append(attempt_1_report)
# Attempt 2: retry_get_token
retry_get_token_ti = dag_run.get_task_instance('acquire_token_with_retry.retry_get_token')
# Only report on retry if it actually ran
if retry_get_token_ti and retry_get_token_ti.state:
error_details_2 = ti.xcom_pull(task_ids=retry_get_token_ti.task_id, key='error_details')
account_2 = ti.xcom_pull(task_ids='acquire_token_with_retry.assign_new_account_for_retry', key='account_id')
attempt_2_report = {
'task_id': retry_get_token_ti.task_id,
'account_id': account_2,
'status': retry_get_token_ti.state,
'start_date': retry_get_token_ti.start_date.isoformat() if retry_get_token_ti.start_date else None,
'end_date': retry_get_token_ti.end_date.isoformat() if retry_get_token_ti.end_date else None,
}
if error_details_2:
attempt_2_report.update({
'proxy_url': error_details_2.get('proxy_url'),
'error_code': error_details_2.get('error_code'),
'error_message': error_details_2.get('error_message'),
})
failure_report['failure_history'].append(attempt_2_report)
# --- Identify the primary failure point ---
exception = context.get('exception')
# Case 1: Download & Probe failure
download_probe_ti = dag_run.get_task_instance('download_and_probe')
if download_probe_ti and download_probe_ti.state == 'failed':
failure_report['failed_task'] = download_probe_ti.task_id
failure_report['failure_summary'] = 'Download or probe failed after successful token acquisition.'
failure_report['download_error'] = {
'error_message': str(exception) if exception else "BashOperator failed. Check task logs for yt-dlp/ffmpeg output.",
'error_type': type(exception).__name__ if exception else "Unknown",
}
# Case 2: Token acquisition failure
else:
last_failed_attempt = next((attempt for attempt in reversed(failure_report['failure_history']) if attempt['status'] == 'failed'), None)
if last_failed_attempt:
failure_report['failed_task'] = last_failed_attempt['task_id']
failure_report['failure_summary'] = f"Token acquisition failed with error: {last_failed_attempt.get('error_code', 'Unknown')}"
else:
# Case 3: Generic/unexpected failure
failed_tis = [ti for ti in dag_run.get_task_instances() if ti.state == 'failed']
if failed_tis:
# Heuristic: pick the one with the latest end_date that is not this task itself
failed_tis.sort(key=lambda x: x.end_date or datetime.min)
last_failed_ti = next((ti for ti in reversed(failed_tis) if ti.task_id != context['task_instance'].task_id), None)
if last_failed_ti:
failure_report['failed_task'] = last_failed_ti.task_id
failure_report['failure_summary'] = f"Task '{last_failed_ti.task_id}' failed unexpectedly."
failure_report['generic_error'] = {
'error_message': str(exception) if exception else f"Unexpected failure in task {last_failed_ti.task_id}. Check logs.",
'error_type': type(exception).__name__ if exception else "Unknown",
                'traceback': "".join(traceback.format_exception(type(exception), exception, exception.__traceback__)) if exception else "No traceback available."
}
logger.info(f"Handling failure for URL: {url}")
logger.error(f" Failure Summary: {failure_report['failure_summary']}")
logger.error(f" Failed Task: {failure_report['failed_task']}")
# Using print to ensure the full JSON is visible in the logs without truncation
print("--- Detailed Failure Report ---")
print(json.dumps(failure_report, indent=2))
print("-----------------------------")
# For all failures, mark the URL as failed in Redis.
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
queue_name = params['queue_name']
fail_queue = f"{queue_name}_fail"
try:
client = _get_redis_client(redis_conn_id)
client.hset(fail_queue, url, json.dumps(failure_report, indent=2))
logger.info(f"Stored detailed failure info for URL '{url}' in fail hash '{fail_queue}'.")
except Exception as e:
logger.error(f"Critical error during failure handling in Redis for URL '{url}': {e}", exc_info=True)
raise AirflowException(f"Could not handle failure in Redis: {e}")
# --- Account pool selection and worker callables ---
def _get_account_pool(params: dict) -> list:
"""
Gets the list of accounts to use for processing, filtering out banned/resting accounts.
Supports three modes for the 'account_pool' parameter:
1. Explicit List: If 'account_pool' contains a comma, it's treated as a comma-separated list.
2. Prefix-based Generation: If 'account_pool_size' is provided, 'account_pool' is treated as a prefix
to generate numbered accounts (e.g., prefix_01, prefix_02).
3. Single Account: If 'account_pool' has no comma and 'account_pool_size' is not provided, it's treated as a single account name.
If the pool is exhausted and auto-creation is enabled, it will generate a new account ID.
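    Examples (illustrative):
      - account_pool='acc_a,acc_b'                -> ['acc_a', 'acc_b']
      - account_pool='team', account_pool_size=3  -> ['team_01', 'team_02', 'team_03']
      - account_pool='single_account'             -> ['single_account']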
"""
account_pool_str = params.get('account_pool', 'default_account')
accounts = []
is_prefix_mode = False
if ',' in account_pool_str:
# Mode 1: Explicit comma-separated list
logger.info("Detected comma in 'account_pool', treating as an explicit list.")
accounts = [acc.strip() for acc in account_pool_str.split(',') if acc.strip()]
else:
# Mode 2 or 3: Prefix-based generation OR single account
prefix = account_pool_str
pool_size_param = params.get('account_pool_size')
if pool_size_param is not None:
# Mode 2: Prefix mode
is_prefix_mode = True
logger.info("Detected 'account_pool_size', treating 'account_pool' as a prefix.")
try:
pool_size = int(pool_size_param)
if pool_size <= 0:
raise AirflowException("'account_pool_size' must be a positive integer for prefix-based generation.")
except (ValueError, TypeError):
raise AirflowException(f"'account_pool_size' must be an integer, but got: {pool_size_param}")
logger.info(f"Account pool size is set to: {pool_size}")
# Generate accounts like 'prefix_01', 'prefix_02', ..., 'prefix_10'
for i in range(1, pool_size + 1):
accounts.append(f"{prefix}_{i:02d}")
else:
# Mode 3: Single account mode
logger.info("No 'account_pool_size' provided. Treating 'account_pool' as a single account name.")
accounts = [prefix]
if not accounts:
raise AirflowException("Initial account pool is empty. Please check 'account_pool' and 'account_pool_size' parameters.")
logger.info(f"Generated initial account pool with {len(accounts)} accounts: {accounts}")
# --- Filter out banned/resting accounts by checking Redis ---
redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
try:
redis_client = _get_redis_client(redis_conn_id)
active_accounts = []
for account in accounts:
status_key = f"account_status:{account}"
status_bytes = redis_client.hget(status_key, "status")
status = status_bytes.decode('utf-8') if status_bytes else "ACTIVE"
if status == 'BANNED':
logger.warning(f"Account '{account}' is BANNED. Skipping.")
continue
if 'RESTING' in status: # Check for 'RESTING' or 'RESTING (active in...)'
logger.info(f"Account '{account}' is RESTING. Skipping.")
continue
active_accounts.append(account)
if not active_accounts and accounts:
logger.error(f"All {len(accounts)} accounts in the pool are banned or resting.")
auto_create = params.get('auto_create_new_accounts_on_exhaustion', False)
if auto_create and is_prefix_mode:
prefix = account_pool_str
new_account_id = f"{prefix}-auto-{str(uuid.uuid4())[:8]}"
logger.warning(f"Account pool exhausted. Auto-creating new account: '{new_account_id}'")
active_accounts.append(new_account_id)
else:
if not auto_create:
logger.error("Auto-creation is disabled. No workers can be scheduled.")
if not is_prefix_mode:
logger.error("Auto-creation is only supported for prefix-based account pools.")
raise AirflowException("All accounts in the configured pool are currently exhausted (banned or resting).")
if len(active_accounts) < len(accounts):
logger.info(f"Filtered account pool. Using {len(active_accounts)} of {len(accounts)} available accounts.")
accounts = active_accounts
except Exception as e:
logger.error(f"Could not filter accounts by status from Redis. Using unfiltered pool. Error: {e}", exc_info=True)
if not accounts:
raise AirflowException("Account pool is empty after filtering. Please check account statuses in Redis or enable auto-creation.")
logger.info(f"Final active account pool with {len(accounts)} accounts: {accounts}")
return accounts
def pull_url_and_assign_account_callable(**context):
"""
Pulls a single URL from Redis and assigns an active account for the run.
If the queue is empty, it skips the DAG run.
Otherwise, it pushes the URL and account details to XCom.
"""
params = context['params']
ti = context['task_instance']
# --- Part 1: Pull URL from Redis ---
queue_name = params['queue_name']
redis_conn_id = params['redis_conn_id']
inbox_queue = f"{queue_name}_inbox"
logger.info(f"Attempting to pull one URL from Redis queue '{inbox_queue}'...")
client = _get_redis_client(redis_conn_id)
url_bytes = client.lpop(inbox_queue)
if not url_bytes:
logger.info("Queue is empty. Stopping this worker loop.")
raise AirflowSkipException("Redis queue is empty.")
url_to_process = url_bytes.decode('utf-8')
logger.info(f"Pulled URL '{url_to_process}' from the queue.")
ti.xcom_push(key='url_to_process', value=url_to_process)
# --- Part 2: Assign Account ---
logger.info("URL found, proceeding to assign an account.")
# Affinity logic: check if an account was passed from a previous run
account_id = params.get('current_account_id')
if account_id:
logger.info(f"Using account '{account_id}' passed from previous run (affinity).")
else:
logger.info("No account passed from previous run. Selecting a new one from the pool.")
account_pool = _get_account_pool(params)
account_id = random.choice(account_pool)
logger.info(f"Selected initial account '{account_id}'.")
ti.xcom_push(key='account_id', value=account_id)
ti.xcom_push(key='accounts_tried', value=[account_id])
def decide_what_to_do_next_callable(**context):
"""
Decides whether to continue the processing loop by triggering the next worker
or to stop the loop, based on task success, failure, or an empty queue.
"""
params = context['params']
dag_run = context['dag_run']
# Check if a failure was handled. If the 'handle_generic_failure' task was not skipped,
# it means a failure occurred somewhere in the pipeline.
handle_generic_failure_ti = dag_run.get_task_instance(task_id='handle_generic_failure')
if handle_generic_failure_ti and handle_generic_failure_ti.state != 'skipped':
logger.error(f"Failure handler task 'handle_generic_failure' was in state '{handle_generic_failure_ti.state}'. Stopping this processing lane.")
return 'mark_dag_run_as_failed'
# Check if the worker was skipped because the Redis queue was empty.
pull_task_instance = dag_run.get_task_instance(task_id='pull_url_and_assign_account')
if pull_task_instance and pull_task_instance.state == 'skipped':
logger.info("Worker was skipped because Redis queue was empty.")
retrigger_delay_on_empty_s = params.get('retrigger_delay_on_empty_s', 60)
if retrigger_delay_on_empty_s < 0:
logger.info(f"retrigger_delay_on_empty_s is {retrigger_delay_on_empty_s}. Stopping this worker loop.")
return 'stop_worker_lane_gracefully'
else:
logger.info(f"Queue is empty. Will re-trigger this worker loop after a delay of {retrigger_delay_on_empty_s}s.")
return 'continue_loop_and_trigger_next_run'
# If no failure was handled and the queue wasn't empty, it must be a success.
logger.info("All preceding tasks succeeded. Continuing the processing loop by triggering the next worker.")
return 'continue_loop_and_trigger_next_run'
def get_token_callable(**context):
"""Makes a single attempt to get a token from the Thrift service."""
ti = context['task_instance']
params = context['params']
# Determine which account to use (initial or retry)
# Pull from all upstreams, which might return a LazySelectSequence
xcom_results = ti.xcom_pull(task_ids=context['task'].upstream_task_ids, key='account_id')
# The result can be a single value or an iterable. We need to find the first valid item.
account_id = None
if hasattr(xcom_results, '__iter__') and not isinstance(xcom_results, str):
# It's a list, tuple, or LazySelectSequence. Find the first real value.
account_id = next((item for item in xcom_results if item is not None), None)
else:
# It's a single value
account_id = xcom_results
if not account_id:
raise AirflowException("Could not find a valid account_id in XCom from any upstream task.")
url = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='url_to_process')
if not url:
logger.info("No URL pulled from XCom. Assuming upstream task was skipped. Ending task.")
return
host = params['service_ip']
port = int(params['service_port'])
timeout = int(params.get('timeout', DEFAULT_TIMEOUT))
# The value from templates_dict is already rendered by Airflow.
info_json_dir = context['templates_dict']['info_json_dir']
machine_id = params.get('machine_id') or socket.gethostname()
clients = params.get('clients')
logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' ---")
client, transport = None, None
try:
client, transport = _get_thrift_client(host, port, timeout)
client.ping()
call_kwargs = {'accountId': account_id, 'updateType': TokenUpdateMode.AUTO, 'url': url, 'clients': clients, 'machineId': machine_id}
token_data = client.getOrRefreshToken(**call_kwargs)
# --- Log response details for debugging ---
response_summary = {
"has_infoJson": hasattr(token_data, 'infoJson') and bool(token_data.infoJson),
"infoJson_size": len(token_data.infoJson) if hasattr(token_data, 'infoJson') and token_data.infoJson else 0,
"has_ytdlpCommand": hasattr(token_data, 'ytdlpCommand') and bool(token_data.ytdlpCommand),
"proxy_type": next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr) and getattr(token_data, attr)), 'None'),
"jobId": getattr(token_data, 'jobId', None)
}
logger.info(f"Successfully retrieved token data from service. Response summary: {json.dumps(response_summary)}")
# --- Success Case ---
info_json = getattr(token_data, 'infoJson', None)
if info_json and json.loads(info_json):
video_id = _extract_video_id(url)
save_dir = info_json_dir or "."
os.makedirs(save_dir, exist_ok=True)
timestamp = int(time.time())
base_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json"
info_json_path = os.path.join(save_dir, base_filename)
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
ti.xcom_push(key='info_json_path', value=info_json_path)
# Log key details from the info.json to confirm success
try:
info_data = json.loads(info_json)
if isinstance(info_data, dict):
title = info_data.get('title', 'N/A')
uploader = info_data.get('uploader', 'N/A')
duration = info_data.get('duration_string', 'N/A')
logger.info(f"Successfully got info.json for video: '{title}' by '{uploader}' ({duration})")
except Exception as log_e:
logger.warning(f"Could not log info.json details: {log_e}")
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
ti.xcom_push(key='socks_proxy', value=getattr(token_data, proxy_attr) if proxy_attr else None)
ti.xcom_push(key='ytdlp_command', value=getattr(token_data, 'ytdlpCommand', None))
ti.xcom_push(key='successful_account_id', value=account_id) # For affinity
ti.xcom_push(key='get_token_succeeded', value=True)
else:
# This is a failure case: the service returned success but no usable data.
logger.error(f"Thrift call for account '{account_id}' succeeded but returned no info.json. Treating as failure.")
# The generic failure handler will pick up this exception.
raise AirflowException("Service returned success but info.json was empty or invalid.")
except (PBServiceException, PBUserException, TTransportException) as e:
error_context = getattr(e, 'context', None)
if isinstance(error_context, str):
try:
error_context = json.loads(error_context.replace("'", "\""))
            except ValueError:
                pass
error_message = getattr(e, 'message', str(e))
error_code = getattr(e, 'errorCode', 'TRANSPORT_ERROR')
# Check for wrapped timeout exception to provide a clearer error message.
inner_exception = getattr(e, 'inner', getattr(e, '__cause__', None))
if isinstance(e, TTransportException) and isinstance(inner_exception, socket.timeout):
error_message = f"Socket timeout during Thrift call (wrapped in TTransportException)"
error_code = 'SOCKET_TIMEOUT'
error_details = {
'error_message': error_message,
'error_code': error_code,
'error_type': type(e).__name__,
'traceback': traceback.format_exc(),
'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None
}
proxy_url_info = f" with proxy '{error_details['proxy_url']}'" if error_details.get('proxy_url') else ""
if error_code == 'SOCKET_TIMEOUT':
logger.error(f"Thrift call for account '{account_id}'{proxy_url_info} failed due to a socket timeout after {timeout} seconds.")
elif isinstance(e, TTransportException) and e.type == TTransportException.TIMED_OUT:
logger.error(f"Thrift call for account '{account_id}'{proxy_url_info} timed out after {timeout} seconds.")
else:
logger.error(f"Thrift call failed for account '{account_id}'{proxy_url_info}. Exception: {error_details['error_message']}")
ti.xcom_push(key='error_details', value=error_details)
ti.xcom_push(key='get_token_succeeded', value=False)
# Always fail the task on any Thrift exception. The branch operator will inspect the failure.
raise AirflowException(f"Thrift call failed: {error_details['error_message']}")
finally:
if transport and transport.isOpen():
transport.close()
def handle_bannable_error_branch_callable(**context):
"""
Inspects a failed `get_token` task. If the failure was a "bannable" error,
it routes to the retry logic. Otherwise, it lets the DAG fail.
This task only runs if the upstream `get_token` task fails.
"""
ti = context['task_instance']
params = context['params']
# We know get_token failed because of the trigger_rule='one_failed'.
# Pull the error details it left behind.
error_details = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='error_details')
if not error_details:
logger.error("The 'get_token' task failed, but no error details were found in XCom. "
"This indicates an unexpected error. Letting the DAG fail.")
return None # Do nothing, let the group fail.
# We have error details, now check if the error is "bannable".
error_code = error_details.get('error_code', '').strip()
error_message = error_details.get('error_message', '').lower()
policy = params.get('on_bannable_failure', 'retry_with_new_account')
bannable_codes = ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED", "SOCKS5_CONNECTION_FAILED"]
is_bannable = error_code in bannable_codes
# Override bannable status for age-restricted content, which is not a true bot detection.
if is_bannable and ('confirm your age' in error_message or 'age-restricted' in error_message):
logger.warning(f"Error is age-related ('{error_code}'). Treating as a non-bannable failure to avoid banning the account.")
is_bannable = False
logger.info(f"Handling failure from 'get_token'. Error code: '{error_code}', Is Bannable: {is_bannable}, Policy: '{policy}'")
if is_bannable and policy == 'retry_with_new_account':
logger.info("Error is bannable and policy allows retry. Proceeding to ban first account and retry.")
return 'acquire_token_with_retry.ban_account_and_prepare_for_retry'
elif is_bannable: # and policy is 'stop_loop'
logger.warning("Error is bannable and policy is 'stop_loop'. Banning account and stopping.")
return 'acquire_token_with_retry.ban_account_and_fail'
else: # Not a bannable error
logger.error(f"Error '{error_code}' is not bannable. Letting the DAG fail.")
return None # Do nothing, let the group fail.
def assign_new_account_for_retry_callable(**context):
"""Selects a new, unused account for the retry attempt."""
ti = context['task_instance']
params = context['params']
accounts_tried = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='accounts_tried')
if not accounts_tried:
raise AirflowException("Cannot retry, list of previously tried accounts not found.")
logger.info(f"Policy is 'retry_with_new_account'. Selecting a new account. Already tried: {accounts_tried}")
try:
account_pool = _get_account_pool(params)
available_for_retry = [acc for acc in account_pool if acc not in accounts_tried]
new_account_id = None
if available_for_retry:
new_account_id = random.choice(available_for_retry)
else:
# No unused accounts left in the pool. Check if we can auto-create one.
logger.warning("No unused accounts available in the pool for a retry. Checking for auto-creation.")
auto_create = params.get('auto_create_new_accounts_on_exhaustion', False)
account_pool_str = params.get('account_pool', 'default_account')
pool_size_param = params.get('account_pool_size')
is_prefix_mode = pool_size_param is not None and ',' not in account_pool_str
if auto_create and is_prefix_mode:
prefix = account_pool_str
new_account_id = f"{prefix}-auto-{str(uuid.uuid4())[:8]}"
logger.warning(f"Auto-creating new account for retry: '{new_account_id}'")
else:
if not auto_create:
logger.error("Auto-creation is disabled.")
if not is_prefix_mode:
logger.error("Auto-creation is only supported for prefix-based account pools (requires 'account_pool_size').")
raise AirflowException("No other accounts available in the pool for a retry.")
accounts_tried.append(new_account_id)
logger.info(f"Selected new account for retry: '{new_account_id}'")
ti.xcom_push(key='account_id', value=new_account_id)
ti.xcom_push(key='accounts_tried', value=accounts_tried)
except Exception as e:
logger.error(f"Could not get a new account for retry: {e}")
raise AirflowException(f"Failed to assign new account for retry: {e}")
def handle_retry_failure_branch_callable(**context):
"""
Checks a failed `retry_get_token` task. If the failure was a handled Thrift error,
it triggers the banning of the second account/proxy.
This task only runs if the upstream `retry_get_token` task fails.
"""
ti = context['task_instance']
# We know retry_get_token failed. Check if it was a handled failure.
error_details = ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='error_details')
if not error_details:
logger.error("The 'retry_get_token' task failed unexpectedly before it could record error details. "
"Letting the DAG fail without banning the account/proxy.")
return None
# If we are here, it means the retry failed with a handled Thrift error.
# We will proceed to ban the second account and proxy.
logger.error("Retry attempt also failed with a handled Thrift error. Banning second account and proxy.")
return 'acquire_token_with_retry.ban_second_account_and_proxy'
def ban_first_account_callable(**context):
"""Bans the first account that failed due to a bannable error."""
ti = context['task_instance']
params = context['params']
# The account ID is pulled from the initial assignment task.
account_to_ban = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='account_id')
if not account_to_ban:
logger.warning("Could not find the initial account ID to ban. Skipping.")
return
client, transport = None, None
try:
host = params['service_ip']
port = int(params['service_port'])
timeout = int(params.get('timeout', DEFAULT_TIMEOUT))
client, transport = _get_thrift_client(host, port, timeout)
reason = "Banned by Airflow worker due to bannable error on first attempt"
logger.warning(f"Banning account '{account_to_ban}'. Reason: {reason}")
client.banAccount(accountId=account_to_ban, reason=reason)
logger.info(f"Successfully sent request to ban account '{account_to_ban}'.")
except Exception as e:
logger.error(f"Failed to issue ban for account '{account_to_ban}': {e}", exc_info=True)
# Don't fail the task, as this is a best-effort cleanup action.
finally:
if transport and transport.isOpen():
transport.close()
def ban_first_account_and_fail_callable(**context):
"""Bans the first account that failed, and then intentionally fails the task."""
ti = context['task_instance']
params = context['params']
# The account ID is pulled from the initial assignment task.
account_to_ban = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='account_id')
if not account_to_ban:
logger.warning("Could not find the initial account ID to ban. Skipping.")
else:
client, transport = None, None
try:
host = params['service_ip']
port = int(params['service_port'])
timeout = int(params.get('timeout', DEFAULT_TIMEOUT))
client, transport = _get_thrift_client(host, port, timeout)
reason = "Banned by Airflow worker due to bannable error (policy is stop_loop)"
logger.warning(f"Banning account '{account_to_ban}'. Reason: {reason}")
client.banAccount(accountId=account_to_ban, reason=reason)
logger.info(f"Successfully sent request to ban account '{account_to_ban}'.")
except Exception as e:
logger.error(f"Failed to issue ban for account '{account_to_ban}': {e}", exc_info=True)
# Log error, but continue to fail the task.
finally:
if transport and transport.isOpen():
transport.close()
# Intentionally fail the task to stop the DAG run as per policy.
reason = "Bannable error detected, policy is stop_loop."
logger.warning(f"INTENTIONAL FAILURE: This task is now failing itself as per the 'stop_loop' policy. Reason: {reason}")
raise AirflowException(f"Failing task as per policy. Reason: {reason}")
def ban_second_account_and_proxy_callable(**context):
"""Bans the second account and the proxy used in the failed retry, then fails the task."""
ti = context['task_instance']
params = context['params']
account_to_ban = ti.xcom_pull(task_ids='acquire_token_with_retry.assign_new_account_for_retry', key='account_id')
error_details = ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='error_details')
proxy_to_ban = error_details.get('proxy_url') if error_details else None
if not account_to_ban and not proxy_to_ban:
logger.warning("Could not find an account or proxy to ban from the failed retry. Nothing to do.")
# Still fail the task to stop the DAG.
raise AirflowException("Token acquisition failed on retry, but no resources found to ban.")
client, transport = None, None
try:
host = params['service_ip']
port = int(params['service_port'])
timeout = int(params.get('timeout', DEFAULT_TIMEOUT))
client, transport = _get_thrift_client(host, port, timeout)
# Ban the second account
if account_to_ban:
reason = "Banned by Airflow worker due to failure on retry attempt"
logger.warning(f"Banning account '{account_to_ban}'. Reason: {reason}")
try:
client.banAccount(accountId=account_to_ban, reason=reason)
logger.info(f"Successfully sent request to ban account '{account_to_ban}'.")
except Exception as e:
logger.error(f"Failed to issue ban for account '{account_to_ban}': {e}", exc_info=True)
# Ban the proxy
if proxy_to_ban:
server_identity = params.get('machine_id') or socket.gethostname()
logger.warning(f"Banning proxy '{proxy_to_ban}' for server '{server_identity}'.")
try:
client.banProxy(proxyUrl=proxy_to_ban, serverIdentity=server_identity)
logger.info(f"Successfully sent request to ban proxy '{proxy_to_ban}'.")
except Exception as e:
logger.error(f"Failed to issue ban for proxy '{proxy_to_ban}': {e}", exc_info=True)
except Exception as e:
logger.error(f"An error occurred while trying to connect to the Thrift service to ban resources: {e}", exc_info=True)
# Log the error but continue to the failure exception, as this is a best-effort cleanup.
finally:
if transport and transport.isOpen():
transport.close()
# After attempting to ban, we must fail this task to fail the group.
logger.warning("INTENTIONAL FAILURE: This task is now failing itself to correctly signal the end of the retry process and stop the worker lane. The second account and/or proxy have been banned.")
raise AirflowException("Token acquisition failed on retry. Banned second account and proxy.")
def trigger_self_run_callable(**context):
"""Triggers a new run of this same DAG to continue the processing loop, with an optional delay."""
ti = context['task_instance']
params = context['params']
dag_run = context['dag_run']
# Check if this was triggered due to an empty queue to apply the specific delay.
pull_task_instance = dag_run.get_task_instance(task_id='pull_url_and_assign_account')
is_empty_queue_scenario = pull_task_instance and pull_task_instance.state == 'skipped'
delay = 0
if is_empty_queue_scenario:
# Use the specific delay for empty queues. Default to 60s.
delay = params.get('retrigger_delay_on_empty_s', 60)
logger.info(f"Queue was empty. Applying delay of {delay}s before re-triggering.")
else:
# For successful runs, re-trigger immediately by default.
logger.info("Worker finished successfully. Triggering next run of itself to continue the loop.")
delay = 0 # Immediate re-trigger on success.
if delay > 0:
logger.info(f"Waiting for {delay}s before triggering next run.")
time.sleep(delay)
logger.info(f"Finished waiting {delay}s. Proceeding to trigger next run.")
# Generate a unique run_id for the new worker run
run_id = f"self_triggered_{datetime.utcnow().isoformat()}"
# Pass through all original parameters to the new run.
conf_to_pass = {k: v for k, v in params.items() if v is not None}
# The new run will pull its own URL, so we ensure 'url' is not passed.
if 'url' in conf_to_pass:
del conf_to_pass['url']
# Pass the successful account ID to the next run for affinity.
# It could come from the first attempt or the retry.
successful_account_ids = ti.xcom_pull(task_ids=['acquire_token_with_retry.get_token', 'acquire_token_with_retry.retry_get_token'], key='successful_account_id')
successful_account_id = next((acc for acc in successful_account_ids if acc), None)
if successful_account_id:
conf_to_pass['current_account_id'] = successful_account_id
logger.info(f"Passing successful account '{successful_account_id}' to the next worker run for affinity.")
else:
# If no account was successful (e.g., empty queue scenario), don't pass one.
# The next run will pick a new one.
conf_to_pass['current_account_id'] = None
logger.info("No successful account ID found. Next worker will select a new account from the pool.")
logger.info(f"Triggering 'ytdlp_ops_worker_per_url' with run_id '{run_id}' and conf: {conf_to_pass}")
trigger_dag(
dag_id='ytdlp_ops_worker_per_url', # Trigger itself
run_id=run_id,
conf=conf_to_pass,
replace_microseconds=False
)
logger.info("Successfully triggered the next worker run.")
# =============================================================================
# DAG Definition
# =============================================================================
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
'retry_delay': timedelta(minutes=1),
'start_date': days_ago(1),
'queue': 'queue-dl002',
}
with DAG(
dag_id='ytdlp_ops_worker_per_url',
default_args=default_args,
schedule_interval=None,
catchup=False,
description='Self-sustaining worker DAG that processes URLs from a Redis queue in a continuous loop.',
doc_md="""
### YT-DLP Self-Sustaining Worker
This DAG is a self-sustaining worker that processes URLs in a continuous loop.
It is started by the `ytdlp_ops_orchestrator` (the "ignition system").
#### How it Works:
1. **Ignition:** An initial run is triggered by the orchestrator.
2. **Pull & Assign:** It pulls a URL from Redis and assigns an account for the job, reusing the last successful account if available (affinity).
3. **Get Token:** It calls the `ytdlp-ops-server` to get tokens and `info.json`. This step is encapsulated in a `TaskGroup` that handles a single retry on failure.
4. **Failure Handling:** If `get_token` fails with a "bannable" error (like bot detection), it follows the `on_bannable_failure` policy:
- `retry_with_new_account` (default): It bans the failing account, picks a new one, and retries the `get_token` call once. If the retry also fails, it bans the second account and the proxy, then stops the loop.
- `stop_loop`: It bans the account and stops the loop immediately.
5. **Download:** If tokens are retrieved successfully, it downloads the media.
6. **Continue or Stop:** After success, or a non-recoverable failure, it decides whether to continue the loop by re-triggering itself or to stop.
This creates a "processing lane" that runs independently until the queue is empty or a failure occurs.
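An illustrative way to start one processing lane manually (parameter values are examples only):
`airflow dags trigger ytdlp_ops_worker_per_url --conf '{"queue_name": "video_queue", "account_pool": "team_a", "account_pool_size": 5}'`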
""",
tags=['ytdlp', 'worker'],
params={
# Worker loop control params (passed from orchestrator)
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="Base name for Redis queues."),
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
# Worker-specific params
'service_ip': Param(DEFAULT_YT_AUTH_SERVICE_IP, type="string", description="Service IP. Default is from Airflow variable YT_AUTH_SERVICE_IP or hardcoded."),
'service_port': Param(DEFAULT_YT_AUTH_SERVICE_PORT, type="integer", description="Port of the Envoy load balancer. Default is from Airflow variable YT_AUTH_SERVICE_PORT or hardcoded."),
'account_pool': Param('default_account', type="string", description="Account pool prefix or comma-separated list."),
'account_pool_size': Param(None, type=["integer", "null"], description="If using a prefix for 'account_pool', this specifies the number of accounts to generate (e.g., 10 for 'prefix_01' through 'prefix_10'). Required when using a prefix."),
'machine_id': Param(None, type=["string", "null"], description="Identifier for the client machine, used for proxy usage tracking. If not set, worker hostname will be used."),
'clients': Param('mweb', type="string", description="Comma-separated list of clients to use for token generation (e.g., 'ios,android,mweb')."),
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string", description="yt-dlp format selection string."),
'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string", description="yt-dlp output filename template."),
'on_bannable_failure': Param(
'retry_with_new_account',
type="string",
enum=['stop_loop', 'retry_with_new_account'],
title="On Bannable Failure Policy",
description="Policy for when a bannable error occurs. 'stop_loop' or 'retry_with_new_account'."
),
'retry_on_probe_failure': Param(False, type="boolean", description="If True, attempts to re-download and probe a file if the initial probe fails."),
'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean", description="If True and all accounts in a prefix-based pool are exhausted, create a new one automatically."),
'retrigger_delay_on_empty_s': Param(60, type="integer", description="Delay in seconds before re-triggering a worker if the queue is empty. Set to -1 to stop the loop."),
# --- Internal Worker Parameters (for self-triggering loop) ---
'current_account_id': Param(None, type=["string", "null"], description="[Internal] The account ID used by the previous run in this worker lane. Used to maintain account affinity."),
}
) as dag:
pull_url_and_assign_account = PythonOperator(
task_id='pull_url_and_assign_account',
python_callable=pull_url_and_assign_account_callable,
)
# --- Encapsulate token acquisition logic in a TaskGroup for visual clarity ---
with TaskGroup(group_id='acquire_token_with_retry') as acquire_token_group:
get_token = PythonOperator(
task_id='get_token',
python_callable=get_token_callable,
templates_dict={'info_json_dir': "{{ dag_run.conf.get('info_json_dir', var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')) }}"},
)
handle_bannable_error_branch = BranchPythonOperator(
task_id='handle_bannable_error_branch',
python_callable=handle_bannable_error_branch_callable,
trigger_rule='one_failed', # This task should only run if get_token fails
)
# --- Retry Path ---
ban_account_and_prepare_for_retry = PythonOperator(
task_id='ban_account_and_prepare_for_retry',
python_callable=ban_first_account_callable,
)
assign_new_account_for_retry = PythonOperator(
task_id='assign_new_account_for_retry',
python_callable=assign_new_account_for_retry_callable,
)
retry_get_token = PythonOperator(
task_id='retry_get_token',
python_callable=get_token_callable,
templates_dict={'info_json_dir': "{{ dag_run.conf.get('info_json_dir', var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')) }}"},
)
handle_retry_failure_branch = BranchPythonOperator(
task_id='handle_retry_failure_branch',
python_callable=handle_retry_failure_branch_callable,
trigger_rule='one_failed', # This task should only run if retry_get_token fails
)
ban_second_account_and_proxy = PythonOperator(
task_id='ban_second_account_and_proxy',
python_callable=ban_second_account_and_proxy_callable,
)
# --- Stop Path ---
ban_account_and_fail = PythonOperator(
task_id='ban_account_and_fail',
python_callable=ban_first_account_and_fail_callable,
)
# --- Internal Success Merge Point ---
token_acquisition_succeeded = DummyOperator(
task_id='token_acquisition_succeeded',
trigger_rule='one_success',
)
# --- Define dependencies within the TaskGroup ---
# The success dummy task is the merge point for the two possible success tasks.
[get_token, retry_get_token] >> token_acquisition_succeeded
# The first branch operator runs only if get_token fails.
get_token >> handle_bannable_error_branch
# It branches to the retry path or the hard-fail path.
handle_bannable_error_branch >> [ban_account_and_prepare_for_retry, ban_account_and_fail]
# The retry path
ban_account_and_prepare_for_retry >> assign_new_account_for_retry >> retry_get_token
# The second branch operator runs only if retry_get_token fails.
retry_get_token >> handle_retry_failure_branch
# It only branches to the final failure task.
handle_retry_failure_branch >> ban_second_account_and_proxy
# --- Main Execution Path (outside the TaskGroup) ---
download_and_probe = BashOperator(
task_id='download_and_probe',
bash_command="""
set -e
            INFO_JSON_PATH_1="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='info_json_path') }}"
            INFO_JSON_PATH_2="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='info_json_path') }}"
            # Missing XCom values render as the literal string "None"; normalize them so the fallback works.
            if [ "$INFO_JSON_PATH_1" == "None" ]; then INFO_JSON_PATH_1=""; fi
            INFO_JSON_PATH="${INFO_JSON_PATH_1:-$INFO_JSON_PATH_2}"
            PROXY_1="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='socks_proxy') }}"
            PROXY_2="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='socks_proxy') }}"
            if [ "$PROXY_1" == "None" ]; then PROXY_1=""; fi
            PROXY="${PROXY_1:-$PROXY_2}"
FORMAT="{{ params.download_format }}"
DOWNLOAD_DIR="{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles/video') }}"
FILENAME_TEMPLATE="{{ params.output_path_template }}"
FULL_OUTPUT_PATH="$DOWNLOAD_DIR/$FILENAME_TEMPLATE"
echo "--- Starting Download Step ---"
echo "Info JSON Path: $INFO_JSON_PATH"
echo "Proxy: $PROXY"
echo "Format: $FORMAT"
echo "Download Directory: $DOWNLOAD_DIR"
echo "Full Output Path: $FULL_OUTPUT_PATH"
if [ -z "$INFO_JSON_PATH" ] || [ "$INFO_JSON_PATH" == "None" ] || [ ! -f "$INFO_JSON_PATH" ]; then
echo "Error: info.json path is missing or file does not exist ($INFO_JSON_PATH)."
exit 1
fi
CMD_ARRAY=(yt-dlp --load-info-json "$INFO_JSON_PATH")
if [ -n "$PROXY" ] && [ "$PROXY" != "None" ]; then
CMD_ARRAY+=(--proxy "$PROXY")
fi
CMD_ARRAY+=(-f "$FORMAT" -o "$FULL_OUTPUT_PATH" --print filename)
CMD_ARRAY+=(--continue --no-progress --no-simulate --no-write-info-json --ignore-errors --no-playlist)
echo "Executing: $(printf "%q " "${CMD_ARRAY[@]}")"
FINAL_FILENAME=$("${CMD_ARRAY[@]}")
EXIT_CODE=$?
echo "yt-dlp exited with code: $EXIT_CODE"
if [ $EXIT_CODE -ne 0 ]; then
echo "Error: yt-dlp command failed."
exit $EXIT_CODE
fi
if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then
echo "Error: Download failed or did not produce a file."
exit 1
fi
echo "SUCCESS: Download complete. Final file at: $FINAL_FILENAME"
echo "--- Starting Probe Step ---"
echo "Probing downloaded file: $FINAL_FILENAME"
if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
echo "Error: ffmpeg probe check failed for '$FINAL_FILENAME'. The file might be corrupt."
if [ "{{ params.retry_on_probe_failure }}" == "True" ]; then
echo "Attempting one retry on probe failure..."
echo "Renaming to .part to attempt resuming download."
mv -f "$FINAL_FILENAME" "$FINAL_FILENAME.part"
# Re-run download command
echo "Re-executing: $(printf "%q " "${CMD_ARRAY[@]}")"
FINAL_FILENAME=$("${CMD_ARRAY[@]}")
EXIT_CODE=$?
echo "yt-dlp retry exited with code: $EXIT_CODE"
if [ $EXIT_CODE -ne 0 ]; then
echo "Error: yt-dlp retry command failed."
exit $EXIT_CODE
fi
if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then
echo "Error: Retry download failed or did not produce a file."
exit 1
fi
echo "SUCCESS: Retry download complete. Final file at: $FINAL_FILENAME"
# Re-probe
echo "Probing redownloaded file: $FINAL_FILENAME"
if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
echo "Error: ffmpeg probe check failed again for '$FINAL_FILENAME'. Failing with exit code 2."
exit 2
fi
else
echo "Failing with exit code 2 due to probe failure (retries disabled)."
exit 2
fi
fi
echo "SUCCESS: Probe confirmed valid media file."
# Push the final filename for the success_task
echo "$FINAL_FILENAME"
""",
retries=0,
retry_delay=timedelta(minutes=1),
)
# --- Finalization Tasks ---
mark_url_as_success = PythonOperator(
task_id='mark_url_as_success',
        python_callable=mark_url_as_success_callable,
)
handle_generic_failure = PythonOperator(
task_id='handle_generic_failure',
python_callable=handle_failure_callable,
trigger_rule='one_failed', # Trigger if any upstream in the failure path fails
)
decide_next_step = BranchPythonOperator(
task_id='decide_what_to_do_next',
python_callable=decide_what_to_do_next_callable,
trigger_rule='all_done',
)
continue_loop_and_trigger_next_run = PythonOperator(
task_id='continue_loop_and_trigger_next_run',
python_callable=trigger_self_run_callable,
)
stop_worker_lane_gracefully = DummyOperator(task_id='stop_worker_lane_gracefully')
mark_dag_run_as_failed = BashOperator(task_id='mark_dag_run_as_failed', bash_command='exit 1')
# --- Define Task Dependencies ---
pull_url_and_assign_account >> acquire_token_group
# The TaskGroup's internal success task (`token_acquisition_succeeded`) is the trigger for the download.
# This is more explicit than depending on the entire group's state and prevents the skip issue.
dag.get_task('acquire_token_with_retry.token_acquisition_succeeded') >> download_and_probe
download_and_probe >> mark_url_as_success
# Define the failure path. The generic failure handler is set downstream of the two
# main tasks that can fail. Its 'one_failed' trigger rule ensures it only runs on failure.
# This explicit list avoids potential scheduler ambiguity.
[acquire_token_group, download_and_probe] >> handle_generic_failure
# Define the final decision point. This task must run after the success path completes
# OR after the failure path completes. Its 'all_done' trigger rule makes this possible.
mark_url_as_success >> decide_next_step
handle_generic_failure >> decide_next_step
decide_next_step >> [continue_loop_and_trigger_next_run, stop_worker_lane_gracefully, mark_dag_run_as_failed]