# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.

"""
DAG for processing a single YouTube URL passed via DAG run configuration.

This is the "Worker" part of a Sensor/Worker pattern.
"""

import json
import logging
import os
import random
import socket
import time
import traceback
import uuid
from datetime import datetime, timedelta

from airflow import DAG
from airflow.api.common.trigger_dag import trigger_dag
from airflow.exceptions import AirflowException, AirflowSkipException
from airflow.models import Variable
from airflow.models.param import Param
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.utils.dates import days_ago
from airflow.utils.task_group import TaskGroup

from pangramia.yt.common.ttypes import TokenUpdateMode
from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
from pangramia.yt.tokens_ops import YTTokenOpService
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from thrift.transport.TTransport import TTransportException

# Import utility functions
from utils.redis_utils import _get_redis_client

# Configure logging
logger = logging.getLogger(__name__)

# Default settings
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_MAX_URLS = 1
DEFAULT_TIMEOUT = 180  # Default Thrift timeout in seconds

DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="16.162.82.212")
# Variable.get returns a string when the variable is set in Airflow, so cast
# explicitly: the 'service_port' Param below is typed as an integer.
DEFAULT_YT_AUTH_SERVICE_PORT = int(Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080))

# --- Helper Functions ---

def _get_thrift_client(host, port, timeout):
    """Helper to create and connect a Thrift client."""
    transport = TSocket.TSocket(host, port)
    transport.setTimeout(timeout * 1000)
    transport = TTransport.TFramedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = YTTokenOpService.Client(protocol)
    transport.open()
    logger.info(f"Connected to Thrift server at {host}:{port}")
    return client, transport

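# Typical usage, mirroring the call sites below: the caller owns the transport
# and must close it when done.
#
#     client, transport = _get_thrift_client(host, port, timeout)
#     try:
#         client.ping()
#     finally:
#         if transport and transport.isOpen():
#             transport.close()
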
def _extract_video_id(url):
    """Extracts YouTube video ID from URL."""
    if not url or not isinstance(url, str):
        logger.debug("URL is empty or not a string, cannot extract video ID.")
        return None
    try:
        video_id = None
        if 'youtube.com/watch?v=' in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif 'youtu.be/' in url:
            video_id = url.split('youtu.be/')[1].split('?')[0]

        if video_id and len(video_id) >= 11:
            video_id = video_id[:11]  # Standard ID length
            logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
            return video_id
        else:
            logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
            return None
    except Exception as e:
        logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
        return None

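# For example, given the splitting rules above:
#   _extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=42") -> "dQw4w9WgXcQ"
#   _extract_video_id("https://youtu.be/dQw4w9WgXcQ?feature=share")       -> "dQw4w9WgXcQ"
#   _extract_video_id("https://example.com/not-youtube")                  -> None
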
# --- Queue Management Callables (for success/failure reporting) ---

def mark_url_as_success_callable(**context):
    """Records the successful result for the URL in the Redis result hash."""
    ti = context['task_instance']
    params = context['params']
    url = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='url_to_process')
    if not url:
        logger.warning("mark_url_as_success_callable was called but no URL was found in XCom.")
        return

    queue_name = params['queue_name']
    result_queue = f"{queue_name}_result"
    redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)

    # Pull results from previous tasks
    info_json_path = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='info_json_path') or \
                     ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='info_json_path')
    socks_proxy = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='socks_proxy') or \
                  ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='socks_proxy')
    ytdlp_command = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='ytdlp_command') or \
                    ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='ytdlp_command')
    downloaded_file_path = ti.xcom_pull(task_ids='download_and_probe')

    logger.info(f"Handling success for URL: {url}")
    logger.info(f"  Downloaded File Path: {downloaded_file_path}")

    result_data = {
        'status': 'success',
        'end_time': time.time(),
        'info_json_path': info_json_path,
        'socks_proxy': socks_proxy,
        'ytdlp_command': ytdlp_command,
        'downloaded_file_path': downloaded_file_path,
        'url': url,
        'dag_run_id': context['dag_run'].run_id,
    }

    try:
        # In the worker pattern, there's no "progress" hash to remove from.
        # We just add the result to the success hash.
        client = _get_redis_client(redis_conn_id)
        client.hset(result_queue, url, json.dumps(result_data))
        logger.info(f"Stored success result for URL '{url}' in result hash '{result_queue}'.")
    except Exception as e:
        logger.error(f"Error handling success in Redis for URL '{url}': {e}", exc_info=True)
        # Log the error but don't fail the task, as the main work succeeded.

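# The stored entry can be inspected with redis-cli, e.g. (hash name follows the
# 'queue_name' param; URL elided here):
#   HGET video_queue_result "https://www.youtube.com/watch?v=..."
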
def handle_failure_callable(**context):
    """
    Handles a failed processing run by recording rich, detailed error information to Redis.
    """
    ti = context['task_instance']
    params = context['params']
    dag_run = context['dag_run']
    url = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='url_to_process')

    if not url:
        # This can happen if pull_url_and_assign_account itself fails.
        # We can't record a URL-specific failure, but we should log it.
        failed_tis = [t for t in dag_run.get_task_instances() if t.state == 'failed']
        failed_task_ids = [t.task_id for t in failed_tis]
        logger.error(f"handle_failure_callable was triggered for run {dag_run.run_id}, but no URL was found in XCom. "
                     f"This likely means an early task failed. Failed tasks in run: {failed_task_ids}")
        return

    # --- Start building the rich error report ---
    failure_report = {
        'url': url,
        'dag_run_id': dag_run.run_id,
        'failure_timestamp': datetime.now().isoformat(),
        'failed_task': 'unknown',
        'failure_summary': 'An unknown error occurred.',
        'failure_history': [],
        'download_error': None,
        'generic_error': None
    }

    # --- Gather data from token acquisition attempts ---
    # Attempt 1: get_token
    get_token_ti = dag_run.get_task_instance('acquire_token_with_retry.get_token')
    if get_token_ti:
        error_details_1 = ti.xcom_pull(task_ids=get_token_ti.task_id, key='error_details')
        account_1 = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='account_id')

        attempt_1_report = {
            'task_id': get_token_ti.task_id,
            'account_id': account_1,
            'status': get_token_ti.state,
            'start_date': get_token_ti.start_date.isoformat() if get_token_ti.start_date else None,
            'end_date': get_token_ti.end_date.isoformat() if get_token_ti.end_date else None,
        }
        if error_details_1:
            attempt_1_report.update({
                'proxy_url': error_details_1.get('proxy_url'),
                'error_code': error_details_1.get('error_code'),
                'error_message': error_details_1.get('error_message'),
            })
        failure_report['failure_history'].append(attempt_1_report)

    # Attempt 2: retry_get_token
    retry_get_token_ti = dag_run.get_task_instance('acquire_token_with_retry.retry_get_token')
    # Only report on the retry if it actually ran
    if retry_get_token_ti and retry_get_token_ti.state:
        error_details_2 = ti.xcom_pull(task_ids=retry_get_token_ti.task_id, key='error_details')
        account_2 = ti.xcom_pull(task_ids='acquire_token_with_retry.assign_new_account_for_retry', key='account_id')

        attempt_2_report = {
            'task_id': retry_get_token_ti.task_id,
            'account_id': account_2,
            'status': retry_get_token_ti.state,
            'start_date': retry_get_token_ti.start_date.isoformat() if retry_get_token_ti.start_date else None,
            'end_date': retry_get_token_ti.end_date.isoformat() if retry_get_token_ti.end_date else None,
        }
        if error_details_2:
            attempt_2_report.update({
                'proxy_url': error_details_2.get('proxy_url'),
                'error_code': error_details_2.get('error_code'),
                'error_message': error_details_2.get('error_message'),
            })
        failure_report['failure_history'].append(attempt_2_report)

    # --- Identify the primary failure point ---
    exception = context.get('exception')

    # Case 1: Download & Probe failure
    download_probe_ti = dag_run.get_task_instance('download_and_probe')
    if download_probe_ti and download_probe_ti.state == 'failed':
        failure_report['failed_task'] = download_probe_ti.task_id
        failure_report['failure_summary'] = 'Download or probe failed after successful token acquisition.'
        failure_report['download_error'] = {
            'error_message': str(exception) if exception else "BashOperator failed. Check task logs for yt-dlp/ffmpeg output.",
            'error_type': type(exception).__name__ if exception else "Unknown",
        }

    # Case 2: Token acquisition failure
    else:
        last_failed_attempt = next((attempt for attempt in reversed(failure_report['failure_history']) if attempt['status'] == 'failed'), None)
        if last_failed_attempt:
            failure_report['failed_task'] = last_failed_attempt['task_id']
            failure_report['failure_summary'] = f"Token acquisition failed with error: {last_failed_attempt.get('error_code', 'Unknown')}"
        else:
            # Case 3: Generic/unexpected failure
            failed_tis = [t for t in dag_run.get_task_instances() if t.state == 'failed']
            if failed_tis:
                # Heuristic: pick the failed task with the latest end_date that is not this task itself
                failed_tis.sort(key=lambda x: x.end_date or datetime.min)
                last_failed_ti = next((t for t in reversed(failed_tis) if t.task_id != ti.task_id), None)
                if last_failed_ti:
                    failure_report['failed_task'] = last_failed_ti.task_id
                    failure_report['failure_summary'] = f"Task '{last_failed_ti.task_id}' failed unexpectedly."
                    failure_report['generic_error'] = {
                        'error_message': str(exception) if exception else f"Unexpected failure in task {last_failed_ti.task_id}. Check logs.",
                        'error_type': type(exception).__name__ if exception else "Unknown",
                        # Positional arguments: the 'etype' keyword was removed from
                        # traceback.format_exception in Python 3.10.
                        'traceback': "".join(traceback.format_exception(type(exception), exception, exception.__traceback__)) if exception else "No traceback available."
                    }

    logger.info(f"Handling failure for URL: {url}")
    logger.error(f"  Failure Summary: {failure_report['failure_summary']}")
    logger.error(f"  Failed Task: {failure_report['failed_task']}")
    # Using print to ensure the full JSON is visible in the logs without truncation
    print("--- Detailed Failure Report ---")
    print(json.dumps(failure_report, indent=2))
    print("-----------------------------")

    # For all failures, mark the URL as failed in Redis.
    redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
    queue_name = params['queue_name']
    fail_queue = f"{queue_name}_fail"
    try:
        client = _get_redis_client(redis_conn_id)
        client.hset(fail_queue, url, json.dumps(failure_report, indent=2))
        logger.info(f"Stored detailed failure info for URL '{url}' in fail hash '{fail_queue}'.")
    except Exception as e:
        logger.error(f"Critical error during failure handling in Redis for URL '{url}': {e}", exc_info=True)
        raise AirflowException(f"Could not handle failure in Redis: {e}")

# --- Account Pool Helpers ---

def _get_account_pool(params: dict) -> list:
    """
    Gets the list of accounts to use for processing, filtering out banned/resting accounts.

    Supports three modes for the 'account_pool' parameter:
    1. Explicit List: If 'account_pool' contains a comma, it's treated as a comma-separated list.
    2. Prefix-based Generation: If 'account_pool_size' is provided, 'account_pool' is treated as
       a prefix used to generate numbered accounts (e.g., prefix_01, prefix_02).
    3. Single Account: If 'account_pool' has no comma and 'account_pool_size' is not provided,
       it's treated as a single account name.

    If the pool is exhausted and auto-creation is enabled, a new account ID is generated.
    """
    account_pool_str = params.get('account_pool', 'default_account')
    accounts = []
    is_prefix_mode = False

    if ',' in account_pool_str:
        # Mode 1: Explicit comma-separated list
        logger.info("Detected comma in 'account_pool', treating as an explicit list.")
        accounts = [acc.strip() for acc in account_pool_str.split(',') if acc.strip()]
    else:
        # Mode 2 or 3: Prefix-based generation OR single account
        prefix = account_pool_str
        pool_size_param = params.get('account_pool_size')

        if pool_size_param is not None:
            # Mode 2: Prefix mode
            is_prefix_mode = True
            logger.info("Detected 'account_pool_size', treating 'account_pool' as a prefix.")

            try:
                pool_size = int(pool_size_param)
            except (ValueError, TypeError):
                raise AirflowException(f"'account_pool_size' must be an integer, but got: {pool_size_param}")
            if pool_size <= 0:
                raise AirflowException("'account_pool_size' must be a positive integer for prefix-based generation.")

            logger.info(f"Account pool size is set to: {pool_size}")

            # Generate accounts like 'prefix_01', 'prefix_02', ..., 'prefix_10'
            for i in range(1, pool_size + 1):
                accounts.append(f"{prefix}_{i:02d}")
        else:
            # Mode 3: Single account mode
            logger.info("No 'account_pool_size' provided. Treating 'account_pool' as a single account name.")
            accounts = [prefix]

    if not accounts:
        raise AirflowException("Initial account pool is empty. Please check 'account_pool' and 'account_pool_size' parameters.")

    logger.info(f"Generated initial account pool with {len(accounts)} accounts: {accounts}")

    # --- Filter out banned/resting accounts by checking Redis ---
    redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
    try:
        redis_client = _get_redis_client(redis_conn_id)
        active_accounts = []
        for account in accounts:
            status_key = f"account_status:{account}"
            status_bytes = redis_client.hget(status_key, "status")
            status = status_bytes.decode('utf-8') if status_bytes else "ACTIVE"

            if status == 'BANNED':
                logger.warning(f"Account '{account}' is BANNED. Skipping.")
                continue
            if 'RESTING' in status:  # Matches 'RESTING' or 'RESTING (active in ...)'
                logger.info(f"Account '{account}' is RESTING. Skipping.")
                continue

            active_accounts.append(account)

        if not active_accounts and accounts:
            logger.error(f"All {len(accounts)} accounts in the pool are banned or resting.")

            auto_create = params.get('auto_create_new_accounts_on_exhaustion', False)
            if auto_create and is_prefix_mode:
                prefix = account_pool_str
                new_account_id = f"{prefix}-auto-{str(uuid.uuid4())[:8]}"
                logger.warning(f"Account pool exhausted. Auto-creating new account: '{new_account_id}'")
                active_accounts.append(new_account_id)
            else:
                if not auto_create:
                    logger.error("Auto-creation is disabled. No workers can be scheduled.")
                if not is_prefix_mode:
                    logger.error("Auto-creation is only supported for prefix-based account pools.")
                raise AirflowException("All accounts in the configured pool are currently exhausted (banned or resting).")

        if len(active_accounts) < len(accounts):
            logger.info(f"Filtered account pool. Using {len(active_accounts)} of {len(accounts)} available accounts.")

        accounts = active_accounts

    except AirflowException:
        # Pool exhaustion must propagate; don't let the generic handler below
        # swallow it and silently fall back to the unfiltered pool.
        raise
    except Exception as e:
        logger.error(f"Could not filter accounts by status from Redis. Using unfiltered pool. Error: {e}", exc_info=True)

    if not accounts:
        raise AirflowException("Account pool is empty after filtering. Please check account statuses in Redis or enable auto-creation.")

    logger.info(f"Final active account pool with {len(accounts)} accounts: {accounts}")
    return accounts

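# For example, in prefix mode (values illustrative):
#   _get_account_pool({'account_pool': 'team', 'account_pool_size': 3, ...})
# builds the initial pool ['team_01', 'team_02', 'team_03'], which is then
# filtered against the 'account_status:<account>' hashes in Redis.
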
def pull_url_and_assign_account_callable(**context):
    """
    Pulls a single URL from Redis and assigns an active account for the run.
    If the queue is empty, it skips the DAG run.
    Otherwise, it pushes the URL and account details to XCom.
    """
    params = context['params']
    ti = context['task_instance']

    # --- Part 1: Pull URL from Redis ---
    queue_name = params['queue_name']
    redis_conn_id = params['redis_conn_id']
    inbox_queue = f"{queue_name}_inbox"

    logger.info(f"Attempting to pull one URL from Redis queue '{inbox_queue}'...")
    client = _get_redis_client(redis_conn_id)
    url_bytes = client.lpop(inbox_queue)

    if not url_bytes:
        logger.info("Queue is empty. Stopping this worker loop.")
        raise AirflowSkipException("Redis queue is empty.")

    url_to_process = url_bytes.decode('utf-8')
    logger.info(f"Pulled URL '{url_to_process}' from the queue.")
    ti.xcom_push(key='url_to_process', value=url_to_process)

    # --- Part 2: Assign Account ---
    logger.info("URL found, proceeding to assign an account.")
    # Affinity logic: check if an account was passed from a previous run
    account_id = params.get('current_account_id')
    if account_id:
        logger.info(f"Using account '{account_id}' passed from previous run (affinity).")
    else:
        logger.info("No account passed from previous run. Selecting a new one from the pool.")
        account_pool = _get_account_pool(params)
        account_id = random.choice(account_pool)
        logger.info(f"Selected initial account '{account_id}'.")

    ti.xcom_push(key='account_id', value=account_id)
    ti.xcom_push(key='accounts_tried', value=[account_id])

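# Note: URLs are consumed with LPOP from '<queue_name>_inbox'. Assuming producers
# enqueue with RPUSH (the producer side is not shown in this file), the queue
# behaves as FIFO, e.g.:
#   RPUSH video_queue_inbox "https://www.youtube.com/watch?v=..."
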
def decide_what_to_do_next_callable(**context):
    """
    Decides whether to continue the processing loop by triggering the next worker
    or to stop the loop, based on task success, failure, or an empty queue.
    """
    params = context['params']
    dag_run = context['dag_run']

    # Check if a failure was handled. If the 'handle_generic_failure' task was not skipped,
    # it means a failure occurred somewhere in the pipeline.
    handle_generic_failure_ti = dag_run.get_task_instance(task_id='handle_generic_failure')
    if handle_generic_failure_ti and handle_generic_failure_ti.state != 'skipped':
        logger.error(f"Failure handler task 'handle_generic_failure' was in state '{handle_generic_failure_ti.state}'. Stopping this processing lane.")
        return 'mark_dag_run_as_failed'

    # Check if the worker was skipped because the Redis queue was empty.
    pull_task_instance = dag_run.get_task_instance(task_id='pull_url_and_assign_account')
    if pull_task_instance and pull_task_instance.state == 'skipped':
        logger.info("Worker was skipped because Redis queue was empty.")
        retrigger_delay_on_empty_s = params.get('retrigger_delay_on_empty_s', 60)

        if retrigger_delay_on_empty_s < 0:
            logger.info(f"retrigger_delay_on_empty_s is {retrigger_delay_on_empty_s}. Stopping this worker loop.")
            return 'stop_worker_lane_gracefully'
        else:
            logger.info(f"Queue is empty. Will re-trigger this worker loop after a delay of {retrigger_delay_on_empty_s}s.")
            return 'continue_loop_and_trigger_next_run'

    # If no failure was handled and the queue wasn't empty, it must be a success.
    logger.info("All preceding tasks succeeded. Continuing the processing loop by triggering the next worker.")
    return 'continue_loop_and_trigger_next_run'

def get_token_callable(**context):
    """Makes a single attempt to get a token from the Thrift service."""
    ti = context['task_instance']
    params = context['params']

    # Determine which account to use (initial or retry).
    # Pull from all upstreams, which might return a LazySelectSequence.
    xcom_results = ti.xcom_pull(task_ids=context['task'].upstream_task_ids, key='account_id')

    # The result can be a single value or an iterable. We need to find the first valid item.
    account_id = None
    if hasattr(xcom_results, '__iter__') and not isinstance(xcom_results, str):
        # It's a list, tuple, or LazySelectSequence. Find the first real value.
        account_id = next((item for item in xcom_results if item is not None), None)
    else:
        # It's a single value
        account_id = xcom_results

    if not account_id:
        raise AirflowException("Could not find a valid account_id in XCom from any upstream task.")

    url = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='url_to_process')
    if not url:
        logger.info("No URL pulled from XCom. Assuming upstream task was skipped. Ending task.")
        return

    host = params['service_ip']
    port = int(params['service_port'])
    timeout = int(params.get('timeout', DEFAULT_TIMEOUT))
    # The value from templates_dict is already rendered by Airflow.
    info_json_dir = context['templates_dict']['info_json_dir']
    machine_id = params.get('machine_id') or socket.gethostname()
    clients = params.get('clients')

    logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' ---")
    client, transport = None, None
    try:
        client, transport = _get_thrift_client(host, port, timeout)
        client.ping()

        call_kwargs = {'accountId': account_id, 'updateType': TokenUpdateMode.AUTO, 'url': url, 'clients': clients, 'machineId': machine_id}
        token_data = client.getOrRefreshToken(**call_kwargs)

        # --- Log response details for debugging ---
        response_summary = {
            "has_infoJson": hasattr(token_data, 'infoJson') and bool(token_data.infoJson),
            "infoJson_size": len(token_data.infoJson) if hasattr(token_data, 'infoJson') and token_data.infoJson else 0,
            "has_ytdlpCommand": hasattr(token_data, 'ytdlpCommand') and bool(token_data.ytdlpCommand),
            "proxy_type": next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr) and getattr(token_data, attr)), 'None'),
            "jobId": getattr(token_data, 'jobId', None)
        }
        logger.info(f"Successfully retrieved token data from service. Response summary: {json.dumps(response_summary)}")

        # --- Success Case ---
        info_json = getattr(token_data, 'infoJson', None)
        if info_json and json.loads(info_json):
            video_id = _extract_video_id(url)
            save_dir = info_json_dir or "."
            os.makedirs(save_dir, exist_ok=True)
            timestamp = int(time.time())
            base_filename = f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json"
            info_json_path = os.path.join(save_dir, base_filename)
            with open(info_json_path, 'w', encoding='utf-8') as f:
                f.write(info_json)
            ti.xcom_push(key='info_json_path', value=info_json_path)

            # Log key details from the info.json to confirm success
            try:
                info_data = json.loads(info_json)
                if isinstance(info_data, dict):
                    title = info_data.get('title', 'N/A')
                    uploader = info_data.get('uploader', 'N/A')
                    duration = info_data.get('duration_string', 'N/A')
                    logger.info(f"Successfully got info.json for video: '{title}' by '{uploader}' ({duration})")
            except Exception as log_e:
                logger.warning(f"Could not log info.json details: {log_e}")

            proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
            ti.xcom_push(key='socks_proxy', value=getattr(token_data, proxy_attr) if proxy_attr else None)
            ti.xcom_push(key='ytdlp_command', value=getattr(token_data, 'ytdlpCommand', None))
            ti.xcom_push(key='successful_account_id', value=account_id)  # For affinity
            ti.xcom_push(key='get_token_succeeded', value=True)
        else:
            # This is a failure case: the service returned success but no usable data.
            logger.error(f"Thrift call for account '{account_id}' succeeded but returned no info.json. Treating as failure.")
            # The generic failure handler will pick up this exception.
            raise AirflowException("Service returned success but info.json was empty or invalid.")

    except (PBServiceException, PBUserException, TTransportException) as e:
        error_context = getattr(e, 'context', None)
        if isinstance(error_context, str):
            try:
                error_context = json.loads(error_context.replace("'", "\""))
            except Exception:
                pass

        error_message = getattr(e, 'message', str(e))
        error_code = getattr(e, 'errorCode', 'TRANSPORT_ERROR')

        # Check for a wrapped timeout exception to provide a clearer error message.
        inner_exception = getattr(e, 'inner', getattr(e, '__cause__', None))
        if isinstance(e, TTransportException) and isinstance(inner_exception, socket.timeout):
            error_message = "Socket timeout during Thrift call (wrapped in TTransportException)"
            error_code = 'SOCKET_TIMEOUT'

        error_details = {
            'error_message': error_message,
            'error_code': error_code,
            'error_type': type(e).__name__,
            'traceback': traceback.format_exc(),
            'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None
        }

        proxy_url_info = f" with proxy '{error_details['proxy_url']}'" if error_details.get('proxy_url') else ""

        if error_code == 'SOCKET_TIMEOUT':
            logger.error(f"Thrift call for account '{account_id}'{proxy_url_info} failed due to a socket timeout after {timeout} seconds.")
        elif isinstance(e, TTransportException) and e.type == TTransportException.TIMED_OUT:
            logger.error(f"Thrift call for account '{account_id}'{proxy_url_info} timed out after {timeout} seconds.")
        else:
            logger.error(f"Thrift call failed for account '{account_id}'{proxy_url_info}. Exception: {error_details['error_message']}")

        ti.xcom_push(key='error_details', value=error_details)
        ti.xcom_push(key='get_token_succeeded', value=False)

        # Always fail the task on any Thrift exception. The branch operator will inspect the failure.
        raise AirflowException(f"Thrift call failed: {error_details['error_message']}")
    finally:
        if transport and transport.isOpen():
            transport.close()

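# The 'error_details' dict pushed above is what the branch callables below inspect.
# Its shape, as constructed in this function:
#   {'error_message': str, 'error_code': str, 'error_type': str,
#    'traceback': str, 'proxy_url': Optional[str]}
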
def handle_bannable_error_branch_callable(**context):
    """
    Inspects a failed `get_token` task. If the failure was a "bannable" error,
    it routes to the retry logic. Otherwise, it lets the DAG fail.
    This task only runs if the upstream `get_token` task fails.
    """
    ti = context['task_instance']
    params = context['params']

    # We know get_token failed because of the trigger_rule='one_failed'.
    # Pull the error details it left behind.
    error_details = ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='error_details')
    if not error_details:
        logger.error("The 'get_token' task failed, but no error details were found in XCom. "
                     "This indicates an unexpected error. Letting the DAG fail.")
        return None  # Do nothing, let the group fail.

    # We have error details, now check if the error is "bannable".
    error_code = error_details.get('error_code', '').strip()
    error_message = error_details.get('error_message', '').lower()
    policy = params.get('on_bannable_failure', 'retry_with_new_account')
    bannable_codes = ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED", "SOCKS5_CONNECTION_FAILED"]
    is_bannable = error_code in bannable_codes

    # Override the bannable status for age-restricted content, which is not true bot detection.
    if is_bannable and ('confirm your age' in error_message or 'age-restricted' in error_message):
        logger.warning(f"Error is age-related ('{error_code}'). Treating as a non-bannable failure to avoid banning the account.")
        is_bannable = False

    logger.info(f"Handling failure from 'get_token'. Error code: '{error_code}', Is Bannable: {is_bannable}, Policy: '{policy}'")

    if is_bannable and policy == 'retry_with_new_account':
        logger.info("Error is bannable and policy allows retry. Proceeding to ban the first account and retry.")
        return 'acquire_token_with_retry.ban_account_and_prepare_for_retry'
    elif is_bannable:  # policy is 'stop_loop'
        logger.warning("Error is bannable and policy is 'stop_loop'. Banning account and stopping.")
        return 'acquire_token_with_retry.ban_account_and_fail'
    else:  # Not a bannable error
        logger.error(f"Error '{error_code}' is not bannable. Letting the DAG fail.")
        return None  # Do nothing, let the group fail.

def assign_new_account_for_retry_callable(**context):
    """Selects a new, unused account for the retry attempt."""
    ti = context['task_instance']
    params = context['params']

    accounts_tried = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='accounts_tried')
    if not accounts_tried:
        raise AirflowException("Cannot retry, list of previously tried accounts not found.")

    logger.info(f"Policy is 'retry_with_new_account'. Selecting a new account. Already tried: {accounts_tried}")
    try:
        account_pool = _get_account_pool(params)
        available_for_retry = [acc for acc in account_pool if acc not in accounts_tried]

        new_account_id = None
        if available_for_retry:
            new_account_id = random.choice(available_for_retry)
        else:
            # No unused accounts left in the pool. Check if we can auto-create one.
            logger.warning("No unused accounts available in the pool for a retry. Checking for auto-creation.")
            auto_create = params.get('auto_create_new_accounts_on_exhaustion', False)
            account_pool_str = params.get('account_pool', 'default_account')
            pool_size_param = params.get('account_pool_size')
            is_prefix_mode = pool_size_param is not None and ',' not in account_pool_str

            if auto_create and is_prefix_mode:
                prefix = account_pool_str
                new_account_id = f"{prefix}-auto-{str(uuid.uuid4())[:8]}"
                logger.warning(f"Auto-creating new account for retry: '{new_account_id}'")
            else:
                if not auto_create:
                    logger.error("Auto-creation is disabled.")
                if not is_prefix_mode:
                    logger.error("Auto-creation is only supported for prefix-based account pools (requires 'account_pool_size').")
                raise AirflowException("No other accounts available in the pool for a retry.")

        accounts_tried.append(new_account_id)

        logger.info(f"Selected new account for retry: '{new_account_id}'")
        ti.xcom_push(key='account_id', value=new_account_id)
        ti.xcom_push(key='accounts_tried', value=accounts_tried)

    except Exception as e:
        logger.error(f"Could not get a new account for retry: {e}")
        raise AirflowException(f"Failed to assign new account for retry: {e}")

def handle_retry_failure_branch_callable(**context):
    """
    Checks a failed `retry_get_token` task. If the failure was a handled Thrift error,
    it triggers the banning of the second account/proxy.
    This task only runs if the upstream `retry_get_token` task fails.
    """
    ti = context['task_instance']

    # We know retry_get_token failed. Check whether it was a handled failure.
    error_details = ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='error_details')

    if not error_details:
        logger.error("The 'retry_get_token' task failed unexpectedly before it could record error details. "
                     "Letting the DAG fail without banning the account/proxy.")
        return None

    # If we are here, the retry failed with a handled Thrift error.
    # Proceed to ban the second account and proxy.
    logger.error("Retry attempt also failed with a handled Thrift error. Banning second account and proxy.")
    return 'acquire_token_with_retry.ban_second_account_and_proxy'

def ban_first_account_callable(**context):
    """Bans the first account that failed due to a bannable error."""
    ti = context['task_instance']
    params = context['params']

    # The account ID is pulled from the initial assignment task.
    account_to_ban = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='account_id')
    if not account_to_ban:
        logger.warning("Could not find the initial account ID to ban. Skipping.")
        return

    client, transport = None, None
    try:
        host = params['service_ip']
        port = int(params['service_port'])
        timeout = int(params.get('timeout', DEFAULT_TIMEOUT))
        client, transport = _get_thrift_client(host, port, timeout)

        reason = "Banned by Airflow worker due to bannable error on first attempt"
        logger.warning(f"Banning account '{account_to_ban}'. Reason: {reason}")
        client.banAccount(accountId=account_to_ban, reason=reason)
        logger.info(f"Successfully sent request to ban account '{account_to_ban}'.")
    except Exception as e:
        logger.error(f"Failed to issue ban for account '{account_to_ban}': {e}", exc_info=True)
        # Don't fail the task, as this is a best-effort cleanup action.
    finally:
        if transport and transport.isOpen():
            transport.close()

def ban_first_account_and_fail_callable(**context):
    """Bans the first account that failed, then intentionally fails the task."""
    ti = context['task_instance']
    params = context['params']

    # The account ID is pulled from the initial assignment task.
    account_to_ban = ti.xcom_pull(task_ids='pull_url_and_assign_account', key='account_id')
    if not account_to_ban:
        logger.warning("Could not find the initial account ID to ban. Skipping.")
    else:
        client, transport = None, None
        try:
            host = params['service_ip']
            port = int(params['service_port'])
            timeout = int(params.get('timeout', DEFAULT_TIMEOUT))
            client, transport = _get_thrift_client(host, port, timeout)

            reason = "Banned by Airflow worker due to bannable error (policy is stop_loop)"
            logger.warning(f"Banning account '{account_to_ban}'. Reason: {reason}")
            client.banAccount(accountId=account_to_ban, reason=reason)
            logger.info(f"Successfully sent request to ban account '{account_to_ban}'.")
        except Exception as e:
            logger.error(f"Failed to issue ban for account '{account_to_ban}': {e}", exc_info=True)
            # Log the error, but continue to fail the task.
        finally:
            if transport and transport.isOpen():
                transport.close()

    # Intentionally fail the task to stop the DAG run as per policy.
    reason = "Bannable error detected, policy is stop_loop."
    logger.warning(f"INTENTIONAL FAILURE: This task is now failing itself as per the 'stop_loop' policy. Reason: {reason}")
    raise AirflowException(f"Failing task as per policy. Reason: {reason}")

def ban_second_account_and_proxy_callable(**context):
    """Bans the second account and the proxy used in the failed retry, then fails the task."""
    ti = context['task_instance']
    params = context['params']

    account_to_ban = ti.xcom_pull(task_ids='acquire_token_with_retry.assign_new_account_for_retry', key='account_id')
    error_details = ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='error_details')
    proxy_to_ban = error_details.get('proxy_url') if error_details else None

    if not account_to_ban and not proxy_to_ban:
        logger.warning("Could not find an account or proxy to ban from the failed retry. Nothing to do.")
        # Still fail the task to stop the DAG.
        raise AirflowException("Token acquisition failed on retry, but no resources found to ban.")

    client, transport = None, None
    try:
        host = params['service_ip']
        port = int(params['service_port'])
        timeout = int(params.get('timeout', DEFAULT_TIMEOUT))
        client, transport = _get_thrift_client(host, port, timeout)

        # Ban the second account
        if account_to_ban:
            reason = "Banned by Airflow worker due to failure on retry attempt"
            logger.warning(f"Banning account '{account_to_ban}'. Reason: {reason}")
            try:
                client.banAccount(accountId=account_to_ban, reason=reason)
                logger.info(f"Successfully sent request to ban account '{account_to_ban}'.")
            except Exception as e:
                logger.error(f"Failed to issue ban for account '{account_to_ban}': {e}", exc_info=True)

        # Ban the proxy
        if proxy_to_ban:
            server_identity = params.get('machine_id') or socket.gethostname()
            logger.warning(f"Banning proxy '{proxy_to_ban}' for server '{server_identity}'.")
            try:
                client.banProxy(proxyUrl=proxy_to_ban, serverIdentity=server_identity)
                logger.info(f"Successfully sent request to ban proxy '{proxy_to_ban}'.")
            except Exception as e:
                logger.error(f"Failed to issue ban for proxy '{proxy_to_ban}': {e}", exc_info=True)

    except Exception as e:
        logger.error(f"An error occurred while trying to connect to the Thrift service to ban resources: {e}", exc_info=True)
        # Log the error but continue to the failure exception, as this is a best-effort cleanup.
    finally:
        if transport and transport.isOpen():
            transport.close()

    # After attempting to ban, we must fail this task to fail the group.
    logger.warning("INTENTIONAL FAILURE: This task is now failing itself to correctly signal the end of the retry "
                   "process and stop the worker lane. The second account and/or proxy have been banned.")
    raise AirflowException("Token acquisition failed on retry. Banned second account and proxy.")

def trigger_self_run_callable(**context):
    """Triggers a new run of this same DAG to continue the processing loop, with an optional delay."""
    ti = context['task_instance']
    params = context['params']
    dag_run = context['dag_run']

    # Check if this was triggered due to an empty queue to apply the specific delay.
    pull_task_instance = dag_run.get_task_instance(task_id='pull_url_and_assign_account')
    is_empty_queue_scenario = pull_task_instance and pull_task_instance.state == 'skipped'

    if is_empty_queue_scenario:
        # Use the specific delay for empty queues. Default to 60s.
        delay = params.get('retrigger_delay_on_empty_s', 60)
        logger.info(f"Queue was empty. Applying delay of {delay}s before re-triggering.")
    else:
        # For successful runs, re-trigger immediately.
        logger.info("Worker finished successfully. Triggering next run of itself to continue the loop.")
        delay = 0

    if delay > 0:
        logger.info(f"Waiting for {delay}s before triggering next run.")
        time.sleep(delay)
        logger.info(f"Finished waiting {delay}s. Proceeding to trigger next run.")

    # Generate a unique run_id for the new worker run
    run_id = f"self_triggered_{datetime.utcnow().isoformat()}"

    # Pass through all original parameters to the new run.
    conf_to_pass = {k: v for k, v in params.items() if v is not None}

    # The new run will pull its own URL, so we ensure 'url' is not passed.
    conf_to_pass.pop('url', None)

    # Pass the successful account ID to the next run for affinity.
    # It could come from the first attempt or the retry.
    successful_account_ids = ti.xcom_pull(
        task_ids=['acquire_token_with_retry.get_token', 'acquire_token_with_retry.retry_get_token'],
        key='successful_account_id') or []
    successful_account_id = next((acc for acc in successful_account_ids if acc), None)

    if successful_account_id:
        conf_to_pass['current_account_id'] = successful_account_id
        logger.info(f"Passing successful account '{successful_account_id}' to the next worker run for affinity.")
    else:
        # If no account was successful (e.g., empty queue scenario), don't pass one.
        # The next run will pick a new one.
        conf_to_pass['current_account_id'] = None
        logger.info("No successful account ID found. Next worker will select a new account from the pool.")

    logger.info(f"Triggering 'ytdlp_ops_worker_per_url' with run_id '{run_id}' and conf: {conf_to_pass}")

    trigger_dag(
        dag_id='ytdlp_ops_worker_per_url',  # Trigger itself
        run_id=run_id,
        conf=conf_to_pass,
        replace_microseconds=False
    )
    logger.info("Successfully triggered the next worker run.")

# =============================================================================
# DAG Definition
# =============================================================================

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=1),
    'start_date': days_ago(1),
    'queue': 'queue-dl002',
}

with DAG(
    dag_id='ytdlp_ops_worker_per_url',
    default_args=default_args,
    schedule_interval=None,
    catchup=False,
    description='Self-sustaining worker DAG that processes URLs from a Redis queue in a continuous loop.',
    doc_md="""
### YT-DLP Self-Sustaining Worker

This DAG is a self-sustaining worker that processes URLs in a continuous loop.
It is started by the `ytdlp_ops_orchestrator` (the "ignition system").

#### How it Works:

1. **Ignition:** An initial run is triggered by the orchestrator.
2. **Pull & Assign:** It pulls a URL from Redis and assigns an account for the job, reusing the last successful account if available (affinity).
3. **Get Token:** It calls the `ytdlp-ops-server` to get tokens and `info.json`. This step is encapsulated in a `TaskGroup` that handles a single retry on failure.
4. **Failure Handling:** If `get_token` fails with a "bannable" error (like bot detection), it follows the `on_bannable_failure` policy:
   - `retry_with_new_account` (default): It bans the failing account, picks a new one, and retries the `get_token` call once. If the retry also fails, it bans the second account and the proxy, then stops the loop.
   - `stop_loop`: It bans the account and stops the loop immediately.
5. **Download:** If tokens are retrieved successfully, it downloads the media.
6. **Continue or Stop:** After success, or a non-recoverable failure, it decides whether to continue the loop by re-triggering itself or to stop.

This creates a "processing lane" that runs independently until the queue is empty or a failure occurs. An example manual trigger is shown below.
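
Example of manually igniting one worker lane from the CLI (values are
illustrative; any key omitted from `--conf` falls back to the Param defaults
defined below):

```bash
airflow dags trigger ytdlp_ops_worker_per_url \
    --conf '{"queue_name": "video_queue", "account_pool": "team", "account_pool_size": 5}'
```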
""",
|
|
tags=['ytdlp', 'worker'],
|
|
params={
|
|
# Worker loop control params (passed from orchestrator)
|
|
'queue_name': Param(DEFAULT_QUEUE_NAME, type="string", description="Base name for Redis queues."),
|
|
'redis_conn_id': Param(DEFAULT_REDIS_CONN_ID, type="string", description="Airflow Redis connection ID."),
|
|
# Worker-specific params
|
|
'service_ip': Param(DEFAULT_YT_AUTH_SERVICE_IP, type="string", description="Service IP. Default is from Airflow variable YT_AUTH_SERVICE_IP or hardcoded."),
|
|
'service_port': Param(DEFAULT_YT_AUTH_SERVICE_PORT, type="integer", description="Port of the Envoy load balancer. Default is from Airflow variable YT_AUTH_SERVICE_PORT or hardcoded."),
|
|
'account_pool': Param('default_account', type="string", description="Account pool prefix or comma-separated list."),
|
|
'account_pool_size': Param(None, type=["integer", "null"], description="If using a prefix for 'account_pool', this specifies the number of accounts to generate (e.g., 10 for 'prefix_01' through 'prefix_10'). Required when using a prefix."),
|
|
'machine_id': Param(None, type=["string", "null"], description="Identifier for the client machine, used for proxy usage tracking. If not set, worker hostname will be used."),
|
|
'clients': Param('mweb', type="string", description="Comma-separated list of clients to use for token generation (e.g., 'ios,android,mweb')."),
|
|
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
|
|
'download_format': Param('ba[ext=m4a]/bestaudio/best', type="string", description="yt-dlp format selection string."),
|
|
'output_path_template': Param("%(title)s [%(id)s].%(ext)s", type="string", description="yt-dlp output filename template."),
|
|
'on_bannable_failure': Param(
|
|
'retry_with_new_account',
|
|
type="string",
|
|
enum=['stop_loop', 'retry_with_new_account'],
|
|
title="On Bannable Failure Policy",
|
|
description="Policy for when a bannable error occurs. 'stop_loop' or 'retry_with_new_account'."
|
|
),
|
|
'retry_on_probe_failure': Param(False, type="boolean", description="If True, attempts to re-download and probe a file if the initial probe fails."),
|
|
'auto_create_new_accounts_on_exhaustion': Param(True, type="boolean", description="If True and all accounts in a prefix-based pool are exhausted, create a new one automatically."),
|
|
'retrigger_delay_on_empty_s': Param(60, type="integer", description="Delay in seconds before re-triggering a worker if the queue is empty. Set to -1 to stop the loop."),
|
|
# --- Internal Worker Parameters (for self-triggering loop) ---
|
|
'current_account_id': Param(None, type=["string", "null"], description="[Internal] The account ID used by the previous run in this worker lane. Used to maintain account affinity."),
|
|
}
|
|
) as dag:
|
|
|
|
    pull_url_and_assign_account = PythonOperator(
        task_id='pull_url_and_assign_account',
        python_callable=pull_url_and_assign_account_callable,
    )

    # --- Encapsulate token acquisition logic in a TaskGroup for visual clarity ---
    with TaskGroup(group_id='acquire_token_with_retry') as acquire_token_group:
        get_token = PythonOperator(
            task_id='get_token',
            python_callable=get_token_callable,
            templates_dict={'info_json_dir': "{{ dag_run.conf.get('info_json_dir', var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')) }}"},
        )

        handle_bannable_error_branch = BranchPythonOperator(
            task_id='handle_bannable_error_branch',
            python_callable=handle_bannable_error_branch_callable,
            trigger_rule='one_failed',  # This task should only run if get_token fails
        )

        # --- Retry Path ---
        ban_account_and_prepare_for_retry = PythonOperator(
            task_id='ban_account_and_prepare_for_retry',
            python_callable=ban_first_account_callable,
        )

        assign_new_account_for_retry = PythonOperator(
            task_id='assign_new_account_for_retry',
            python_callable=assign_new_account_for_retry_callable,
        )

        retry_get_token = PythonOperator(
            task_id='retry_get_token',
            python_callable=get_token_callable,
            templates_dict={'info_json_dir': "{{ dag_run.conf.get('info_json_dir', var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')) }}"},
        )

        handle_retry_failure_branch = BranchPythonOperator(
            task_id='handle_retry_failure_branch',
            python_callable=handle_retry_failure_branch_callable,
            trigger_rule='one_failed',  # This task should only run if retry_get_token fails
        )

        ban_second_account_and_proxy = PythonOperator(
            task_id='ban_second_account_and_proxy',
            python_callable=ban_second_account_and_proxy_callable,
        )

        # --- Stop Path ---
        ban_account_and_fail = PythonOperator(
            task_id='ban_account_and_fail',
            python_callable=ban_first_account_and_fail_callable,
        )

        # --- Internal Success Merge Point ---
        token_acquisition_succeeded = DummyOperator(
            task_id='token_acquisition_succeeded',
            trigger_rule='one_success',
        )

        # --- Define dependencies within the TaskGroup ---
        # The success dummy task is the merge point for the two possible success tasks.
        [get_token, retry_get_token] >> token_acquisition_succeeded

        # The first branch operator runs only if get_token fails.
        get_token >> handle_bannable_error_branch
        # It branches to the retry path or the hard-fail path.
        handle_bannable_error_branch >> [ban_account_and_prepare_for_retry, ban_account_and_fail]

        # The retry path
        ban_account_and_prepare_for_retry >> assign_new_account_for_retry >> retry_get_token

        # The second branch operator runs only if retry_get_token fails.
        retry_get_token >> handle_retry_failure_branch
        # It only branches to the final failure task.
        handle_retry_failure_branch >> ban_second_account_and_proxy

    # --- Main Execution Path (outside the TaskGroup) ---
    download_and_probe = BashOperator(
        task_id='download_and_probe',
        bash_command="""
set -e

INFO_JSON_PATH_1="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='info_json_path') }}"
INFO_JSON_PATH_2="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='info_json_path') }}"
INFO_JSON_PATH="${INFO_JSON_PATH_1:-$INFO_JSON_PATH_2}"

PROXY_1="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.get_token', key='socks_proxy') }}"
PROXY_2="{{ ti.xcom_pull(task_ids='acquire_token_with_retry.retry_get_token', key='socks_proxy') }}"
PROXY="${PROXY_1:-$PROXY_2}"

FORMAT="{{ params.download_format }}"
DOWNLOAD_DIR="{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles/video') }}"
FILENAME_TEMPLATE="{{ params.output_path_template }}"
FULL_OUTPUT_PATH="$DOWNLOAD_DIR/$FILENAME_TEMPLATE"

echo "--- Starting Download Step ---"
echo "Info JSON Path: $INFO_JSON_PATH"
echo "Proxy: $PROXY"
echo "Format: $FORMAT"
echo "Download Directory: $DOWNLOAD_DIR"
echo "Full Output Path: $FULL_OUTPUT_PATH"

if [ -z "$INFO_JSON_PATH" ] || [ "$INFO_JSON_PATH" == "None" ] || [ ! -f "$INFO_JSON_PATH" ]; then
    echo "Error: info.json path is missing or file does not exist ($INFO_JSON_PATH)."
    exit 1
fi

CMD_ARRAY=(yt-dlp --load-info-json "$INFO_JSON_PATH")
if [ -n "$PROXY" ] && [ "$PROXY" != "None" ]; then
    CMD_ARRAY+=(--proxy "$PROXY")
fi
CMD_ARRAY+=(-f "$FORMAT" -o "$FULL_OUTPUT_PATH" --print filename)
CMD_ARRAY+=(--continue --no-progress --no-simulate --no-write-info-json --ignore-errors --no-playlist)

echo "Executing: $(printf "%q " "${CMD_ARRAY[@]}")"

# Temporarily disable 'exit on error' so the exit code can be captured and
# reported; under 'set -e', a failing command substitution in an assignment
# would abort the script before the diagnostics below could run.
set +e
FINAL_FILENAME=$("${CMD_ARRAY[@]}")
EXIT_CODE=$?
set -e

echo "yt-dlp exited with code: $EXIT_CODE"

if [ $EXIT_CODE -ne 0 ]; then
    echo "Error: yt-dlp command failed."
    exit $EXIT_CODE
fi
if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then
    echo "Error: Download failed or did not produce a file."
    exit 1
fi
echo "SUCCESS: Download complete. Final file at: $FINAL_FILENAME"

echo "--- Starting Probe Step ---"
echo "Probing downloaded file: $FINAL_FILENAME"
if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
    echo "Error: ffmpeg probe check failed for '$FINAL_FILENAME'. The file might be corrupt."

    if [ "{{ params.retry_on_probe_failure }}" == "True" ]; then
        echo "Attempting one retry on probe failure..."
        echo "Renaming to .part to attempt resuming download."
        mv -f "$FINAL_FILENAME" "$FINAL_FILENAME.part"

        # Re-run download command
        echo "Re-executing: $(printf "%q " "${CMD_ARRAY[@]}")"
        set +e
        FINAL_FILENAME=$("${CMD_ARRAY[@]}")
        EXIT_CODE=$?
        set -e
        echo "yt-dlp retry exited with code: $EXIT_CODE"

        if [ $EXIT_CODE -ne 0 ]; then
            echo "Error: yt-dlp retry command failed."
            exit $EXIT_CODE
        fi
        if [ -z "$FINAL_FILENAME" ] || [ ! -f "$FINAL_FILENAME" ]; then
            echo "Error: Retry download failed or did not produce a file."
            exit 1
        fi
        echo "SUCCESS: Retry download complete. Final file at: $FINAL_FILENAME"

        # Re-probe
        echo "Probing redownloaded file: $FINAL_FILENAME"
        if ! ffmpeg -v error -i "$FINAL_FILENAME" -f null - ; then
            echo "Error: ffmpeg probe check failed again for '$FINAL_FILENAME'. Failing with exit code 2."
            exit 2
        fi
    else
        echo "Failing with exit code 2 due to probe failure (retries disabled)."
        exit 2
    fi
fi
echo "SUCCESS: Probe confirmed valid media file."

# The BashOperator pushes the last line of stdout to XCom; emit the final filename.
echo "$FINAL_FILENAME"
""",
        retries=0,
        retry_delay=timedelta(minutes=1),
    )

    # --- Finalization Tasks ---
    mark_url_as_success = PythonOperator(
        task_id='mark_url_as_success',
        python_callable=mark_url_as_success_callable,
    )

    handle_generic_failure = PythonOperator(
        task_id='handle_generic_failure',
        python_callable=handle_failure_callable,
        trigger_rule='one_failed',  # Trigger if any upstream in the failure path fails
    )

    decide_next_step = BranchPythonOperator(
        task_id='decide_what_to_do_next',
        python_callable=decide_what_to_do_next_callable,
        trigger_rule='all_done',
    )

    continue_loop_and_trigger_next_run = PythonOperator(
        task_id='continue_loop_and_trigger_next_run',
        python_callable=trigger_self_run_callable,
    )

    stop_worker_lane_gracefully = DummyOperator(task_id='stop_worker_lane_gracefully')
    mark_dag_run_as_failed = BashOperator(task_id='mark_dag_run_as_failed', bash_command='exit 1')

    # --- Define Task Dependencies ---
    pull_url_and_assign_account >> acquire_token_group

    # The TaskGroup's internal success task (`token_acquisition_succeeded`) is the trigger for
    # the download. This is more explicit than depending on the entire group's state and
    # prevents the skip issue.
    dag.get_task('acquire_token_with_retry.token_acquisition_succeeded') >> download_and_probe

    download_and_probe >> mark_url_as_success

    # Define the failure path. The generic failure handler is set downstream of the two
    # main tasks that can fail. Its 'one_failed' trigger rule ensures it only runs on failure.
    # This explicit list avoids potential scheduler ambiguity.
    [acquire_token_group, download_and_probe] >> handle_generic_failure

    # Define the final decision point. This task must run after the success path completes
    # OR after the failure path completes. Its 'all_done' trigger rule makes this possible.
    mark_url_as_success >> decide_next_step
    handle_generic_failure >> decide_next_step

    decide_next_step >> [continue_loop_and_trigger_next_run, stop_worker_lane_gracefully, mark_dag_run_as_failed]