# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2024 rl <rl@rlmbp>
#
# Distributed under terms of the MIT license.

"""
DAG for processing a single YouTube URL passed via DAG run configuration.

This is the "Worker" part of a Sensor/Worker pattern.

This DAG has been refactored to use the TaskFlow API to implement worker affinity,
ensuring all tasks for a single URL run on the same machine.
"""

from __future__ import annotations

from airflow.decorators import task, task_group
from airflow.exceptions import AirflowException, AirflowSkipException
from airflow.models import Variable
from airflow.models.dag import DAG
from airflow.models.param import Param
from airflow.models.xcom_arg import XComArg
from airflow.operators.dummy import DummyOperator
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago
from airflow.api.common.trigger_dag import trigger_dag
from datetime import timedelta, datetime
import json
import logging
import os
import random
import re
import socket
import time
import traceback
import uuid
import subprocess
import shlex

# Import utility functions and Thrift modules
from utils.redis_utils import _get_redis_client

# Handle potential import issues with Thrift modules
try:
    from pangramia.yt.common.ttypes import TokenUpdateMode
except ImportError as e:
    logging.warning(f"Could not import TokenUpdateMode from pangramia.yt.common.ttypes: {e}")
    TokenUpdateMode = None

try:
    from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
except ImportError as e:
    logging.warning(f"Could not import PBServiceException/PBUserException from pangramia.yt.exceptions.ttypes: {e}")
    PBServiceException = Exception
    PBUserException = Exception

try:
    from pangramia.yt.tokens_ops import YTTokenOpService
except ImportError as e:
    logging.warning(f"Could not import YTTokenOpService from pangramia.yt.tokens_ops: {e}")
    YTTokenOpService = None

try:
    from thrift.protocol import TBinaryProtocol
    from thrift.transport import TSocket, TTransport
    from thrift.transport.TTransport import TTransportException
except ImportError as e:
    logging.warning(f"Could not import thrift modules: {e}")
    TBinaryProtocol = None
    TSocket = None
    TTransport = None
    TTransportException = Exception

# Configure logging
logger = logging.getLogger(__name__)

# Default settings from Airflow Variables or hardcoded fallbacks
DEFAULT_QUEUE_NAME = 'video_queue'
DEFAULT_REDIS_CONN_ID = 'redis_default'
DEFAULT_TIMEOUT = 3600
DEFAULT_YT_AUTH_SERVICE_IP = Variable.get("YT_AUTH_SERVICE_IP", default_var="172.17.0.1")
DEFAULT_YT_AUTH_SERVICE_PORT = Variable.get("YT_AUTH_SERVICE_PORT", default_var=9080)

# The queue is set to a fallback here. The actual worker-specific queue is
# assigned just-in-time by the task_instance_mutation_hook in airflow_local_settings.py,
# which reads the 'worker_queue' from the DAG run configuration.
DEFAULT_ARGS = {
    'owner': 'airflow',
    'retries': 0,
    'queue': 'queue-dl',  # Fallback queue. Will be overridden by the policy hook.
}
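
# Illustrative sketch only (not this repo's actual airflow_local_settings.py) of what the
# policy hook described above could look like. It assumes the dispatcher puts a
# 'worker_queue' key into the DAG run conf when triggering this DAG.
#
#     def task_instance_mutation_hook(task_instance):
#         """Pin the task to the queue named in the triggering DAG run's conf, if any."""
#         dag_run = task_instance.get_dagrun()
#         conf = (dag_run.conf or {}) if dag_run else {}
#         worker_queue = conf.get('worker_queue')
#         if worker_queue:
#             task_instance.queue = worker_queue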


# --- Helper Functions ---

def _get_thrift_client(host, port, timeout):
    """Helper to create and connect a Thrift client."""
    if not TSocket or not TTransport or not TBinaryProtocol:
        raise AirflowException("Required Thrift modules are not available")

    transport = TSocket.TSocket(host, port)
    transport.setTimeout(timeout * 1000)
    transport = TTransport.TFramedTransport(transport)
    # Bind a concrete binary protocol to the framed transport; the generated client
    # needs a protocol instance, not a protocol factory.
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = YTTokenOpService.Client(protocol) if YTTokenOpService else None
    if client:
        transport.open()
        logger.info(f"Connected to Thrift server at {host}:{port}")
    return client, transport
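
# Usage sketch (illustrative): the caller owns the transport and must close it,
# mirroring the try/finally pattern used in get_token() below.
#
#     client, transport = _get_thrift_client(DEFAULT_YT_AUTH_SERVICE_IP,
#                                            int(DEFAULT_YT_AUTH_SERVICE_PORT),
#                                            DEFAULT_TIMEOUT)
#     try:
#         ...  # e.g. client.getOrRefreshToken(...)
#     finally:
#         if transport and transport.isOpen():
#             transport.close()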


def _extract_video_id(url):
    """Extracts YouTube video ID from URL."""
    if not url or not isinstance(url, str):
        return None
    patterns = [r'v=([a-zA-Z0-9_-]{11})', r'youtu\.be/([a-zA-Z0-9_-]{11})']
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
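
# Examples (illustrative): both URL styles resolve to the same 11-character video ID.
#
#     _extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  # -> "dQw4w9WgXcQ"
#     _extract_video_id("https://youtu.be/dQw4w9WgXcQ")                 # -> "dQw4w9WgXcQ"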


def _get_account_pool(params: dict) -> list:
    """
    Gets the list of accounts to use for processing, filtering out banned/resting accounts.
    Supports explicit list, prefix-based generation, and single account modes.
    """
    account_pool_str = params.get('account_pool', 'default_account')
    accounts = []
    is_prefix_mode = False

    if ',' in account_pool_str:
        accounts = [acc.strip() for acc in account_pool_str.split(',') if acc.strip()]
    else:
        prefix = account_pool_str
        pool_size_param = params.get('account_pool_size')
        if pool_size_param is not None:
            is_prefix_mode = True
            pool_size = int(pool_size_param)
            accounts = [f"{prefix}_{i:02d}" for i in range(1, pool_size + 1)]
        else:
            accounts = [prefix]

    if not accounts:
        raise AirflowException("Initial account pool is empty.")

    redis_conn_id = params.get('redis_conn_id', DEFAULT_REDIS_CONN_ID)
    try:
        redis_client = _get_redis_client(redis_conn_id)
        active_accounts = []
        for account in accounts:
            status_bytes = redis_client.hget(f"account_status:{account}", "status")
            status = status_bytes.decode('utf-8') if status_bytes else "ACTIVE"
            if status not in ['BANNED'] and 'RESTING' not in status:
                active_accounts.append(account)

        if not active_accounts and accounts:
            auto_create = params.get('auto_create_new_accounts_on_exhaustion', False)
            if auto_create and is_prefix_mode:
                new_account_id = f"{account_pool_str}-auto-{str(uuid.uuid4())[:8]}"
                logger.warning(f"Account pool exhausted. Auto-creating new account: '{new_account_id}'")
                active_accounts.append(new_account_id)
            else:
                raise AirflowException("All accounts in the configured pool are currently exhausted.")
        accounts = active_accounts
    except AirflowException:
        # Deliberate failures (e.g. the pool is exhausted) must propagate rather than
        # being swallowed by the generic Redis fallback below.
        raise
    except Exception as e:
        logger.error(f"Could not filter accounts from Redis. Using unfiltered pool. Error: {e}", exc_info=True)

    if not accounts:
        raise AirflowException("Account pool is empty after filtering.")

    logger.info(f"Final active account pool with {len(accounts)} accounts.")
    return accounts
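
# Example (illustrative) of the three pool modes handled above, before the Redis status filter:
#
#     {'account_pool': 'acc_a, acc_b'}                   -> ['acc_a', 'acc_b']                    (explicit list)
#     {'account_pool': 'ytacc', 'account_pool_size': 3}  -> ['ytacc_01', 'ytacc_02', 'ytacc_03']  (prefix mode)
#     {'account_pool': 'ytacc'}                          -> ['ytacc']                             (single account)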


# =============================================================================
# TASK DEFINITIONS (TaskFlow API)
# =============================================================================

@task
def get_url_and_assign_account(**context):
    """
    Gets the URL to process from the DAG run configuration and assigns an active account.
    This is the first task in the pinned-worker DAG.
    """
    params = context['params']

    # Update yt-dlp to latest nightly before every run
    subprocess.run(["/usr/local/bin/update-yt-dlp.sh"], check=True)

    # The URL is passed by the dispatcher DAG.
    url_to_process = params.get('url_to_process')
    if not url_to_process:
        raise AirflowException("'url_to_process' was not found in the DAG run configuration.")
    logger.info(f"Received URL '{url_to_process}' to process.")

    # Account assignment logic is the same as before.
    account_id = random.choice(_get_account_pool(params))
    logger.info(f"Selected account '{account_id}' for this run.")

    return {
        'url_to_process': url_to_process,
        'account_id': account_id,
        'accounts_tried': [account_id],
    }
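
# Example (illustrative) of a DAG run conf this worker expects from the dispatcher. Only
# 'url_to_process' is required by this task; 'worker_queue' is consumed by the queue
# mutation hook, and the remaining keys are optional overrides read by later tasks:
#
#     {
#         "url_to_process": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
#         "worker_queue": "queue-dl-host01",
#         "account_pool": "ytacc",
#         "account_pool_size": 4,
#         "on_bannable_failure": "retry_with_new_account",
#     }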


@task
def get_token(initial_data: dict, **context):
    """Makes a single attempt to get a token from the Thrift service."""
    ti = context['task_instance']
    params = context['params']

    account_id = initial_data['account_id']
    url = initial_data['url_to_process']
    info_json_dir = Variable.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles')

    host, port, timeout = params['service_ip'], int(params['service_port']), int(params.get('timeout', DEFAULT_TIMEOUT))
    machine_id = params.get('machine_id') or socket.gethostname()

    logger.info(f"--- Attempting to get token for URL '{url}' with account '{account_id}' ---")
    client, transport = None, None
    try:
        client, transport = _get_thrift_client(host, port, timeout)
        if not client or not TokenUpdateMode:
            raise AirflowException("Thrift client or TokenUpdateMode not available")

        token_data = client.getOrRefreshToken(accountId=account_id, updateType=TokenUpdateMode.AUTO, url=url, clients=params.get('clients'), machineId=machine_id)

        info_json = getattr(token_data, 'infoJson', None)
        if not (info_json and json.loads(info_json)):
            raise AirflowException("Service returned success but info.json was empty or invalid.")

        video_id = _extract_video_id(url)
        os.makedirs(info_json_dir, exist_ok=True)
        # Use a readable timestamp for a unique filename on each attempt.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        info_json_path = os.path.join(info_json_dir, f"info_{video_id or 'unknown'}_{account_id}_{timestamp}.json")
        with open(info_json_path, 'w', encoding='utf-8') as f:
            f.write(info_json)

        proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
        return {
            'info_json_path': info_json_path,
            'socks_proxy': getattr(token_data, proxy_attr) if proxy_attr else None,
            'ytdlp_command': getattr(token_data, proxy_attr) if proxy_attr else None,
            'successful_account_id': account_id,
            'original_url': url,  # Include original URL for fallback
        }
    except (PBServiceException, PBUserException, TTransportException) as e:
        error_context = getattr(e, 'context', None)
        if isinstance(error_context, str):
            try:
                error_context = json.loads(error_context.replace("'", "\""))
            except ValueError:
                pass

        error_details = {
            'error_message': getattr(e, 'message', str(e)),
            'error_code': getattr(e, 'errorCode', 'TRANSPORT_ERROR'),
            'proxy_url': error_context.get('proxy_url') if isinstance(error_context, dict) else None
        }
        logger.error(f"Thrift call failed for account '{account_id}'. Exception: {error_details['error_message']}")
        ti.xcom_push(key='error_details', value=error_details)

        # If it's not a connection error, run diagnostic yt-dlp command
        if error_details['error_code'] not in ["SOCKS5_CONNECTION_FAILED", "SOCKET_TIMEOUT", "TRANSPORT_ERROR", "CAMOUFOX_TIMEOUT"]:
            _run_diagnostic_yt_dlp(url, error_details.get('proxy_url'), params.get('clients', 'web'))

        raise AirflowException(f"Thrift call failed: {error_details['error_message']}")
    finally:
        if transport and transport.isOpen():
            transport.close()
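
# Example (illustrative) of the 'error_details' XCom pushed above on failure; this is the
# structure that handle_bannable_error_branch() inspects further down:
#
#     {
#         'error_message': 'Sign in to confirm you are not a bot',
#         'error_code': 'BOT_DETECTED',
#         'proxy_url': 'socks5://127.0.0.1:1080',
#     }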


def _run_diagnostic_yt_dlp(url, proxy, clients):
    """Runs yt-dlp with diagnostic flags to capture failed responses."""
    logger.warning("Running diagnostic yt-dlp command to capture failed response...")

    dump_dir = "/opt/airflow/dumps"
    os.makedirs(dump_dir, exist_ok=True)

    video_id = _extract_video_id(url)
    dump_file = os.path.join(dump_dir, f"diagnostic_{video_id or 'unknown'}_{int(time.time())}.dump")

    cmd = [
        'yt-dlp',
        '--extractor-args', f'youtube:player-client={clients}',
        '--write-pages',   # dump the fetched intermediary pages to files for post-mortem analysis
        '--proxy', proxy or '',
        '-FvU',            # list formats (-F), verbose output (-v), self-update (-U)
        url,
        '--write-info-json',
        '--print', 'filename',
        '--continue',
        '--no-progress',
        '--no-simulate',
        '--ignore-errors',
        '--no-playlist'
    ]

    logger.info(f"Executing diagnostic command: {' '.join(shlex.quote(arg) for arg in cmd)}")
    logger.info(f"Diagnostic dump will be saved to: {dump_file}")

    try:
        # --write-pages writes into the current working directory, so run inside dump_dir
        # to keep the page dumps together with the other diagnostic artifacts.
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300, cwd=dump_dir)
        logger.info(f"Diagnostic yt-dlp exit code: {result.returncode}")
        if result.stdout:
            logger.info(f"Diagnostic output:\n{result.stdout}")
        if result.stderr:
            logger.error(f"Diagnostic stderr:\n{result.stderr}")
    except subprocess.TimeoutExpired:
        logger.error("Diagnostic yt-dlp command timed out after 5 minutes")
    except Exception as e:
        logger.error(f"Failed to run diagnostic yt-dlp: {e}")


@task.branch
def handle_bannable_error_branch(task_id_to_check: str, **context):
    """Inspects a failed task and routes to retry logic if the error is bannable."""
    ti = context['task_instance']
    params = context['params']
    error_details = ti.xcom_pull(task_ids=task_id_to_check, key='error_details')
    if not error_details:
        return None  # Let DAG fail for unexpected errors

    error_code = error_details.get('error_code', '').strip()
    policy = params.get('on_bannable_failure', 'retry_with_new_account')

    # Connection errors should be retried without banning the account.
    connection_errors = ['SOCKS5_CONNECTION_FAILED', 'SOCKET_TIMEOUT', 'TRANSPORT_ERROR', 'CAMOUFOX_TIMEOUT']
    if error_code in connection_errors:
        logger.info(f"Handling connection error '{error_code}' from '{task_id_to_check}'. Policy: '{policy}'")
        if policy == 'stop_loop':
            logger.warning("Connection error with 'stop_loop' policy. Failing DAG without banning.")
            return None
        else:
            logger.info("Retrying with a new account without banning.")
            return 'assign_new_account_for_retry'

    is_bannable = error_code in ["BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED"]

    logger.info(f"Handling failure from '{task_id_to_check}'. Error code: '{error_code}', Policy: '{policy}'")
    if is_bannable and policy in ['retry_with_new_account', 'retry_and_ban_account_only']:
        return 'ban_account_and_prepare_for_retry'
    if is_bannable and policy in ['retry_on_connection_error', 'retry_without_ban']:
        return 'assign_new_account_for_retry'
    if is_bannable:  # stop_loop
        return 'ban_and_fail'
    return None  # Not a bannable error, let DAG fail
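
# Summary of the branching implemented above:
#
#     connection errors (SOCKS5_CONNECTION_FAILED, SOCKET_TIMEOUT, TRANSPORT_ERROR, CAMOUFOX_TIMEOUT):
#         'stop_loop'                                              -> None (no retry branch)
#         any other policy                                         -> 'assign_new_account_for_retry'
#     bannable errors (BOT_DETECTED, BOT_DETECTION_SIGN_IN_REQUIRED):
#         'retry_with_new_account' / 'retry_and_ban_account_only'  -> 'ban_account_and_prepare_for_retry'
#         'retry_on_connection_error' / 'retry_without_ban'        -> 'assign_new_account_for_retry'
#         any other policy (e.g. 'stop_loop')                      -> 'ban_and_fail'
#     any other error, or no 'error_details' XCom                  -> None (let the DAG fail)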