from airflow import DAG
from airflow.models import BaseOperator, Variable
from airflow.utils.decorators import apply_defaults
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from airflow.utils.dates import days_ago
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from thrift.transport.TTransport import TTransportException
from datetime import datetime, timedelta
from pangramia.yt.exceptions.ttypes import PBServiceException
import redis
import logging
import time
import socket
import json
import os
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.common.ttypes import TokenUpdateMode
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.operators.python import PythonOperator
from airflow.models.param import Param

# Assuming ytdlp_utils exists in the same directory or PYTHONPATH
# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id

# Configure logging
logger = logging.getLogger(__name__)

# Default settings (similar to ytdlp_client_dag.py)
MAX_RETRIES = 1
RETRY_DELAY = timedelta(seconds=10)
DEFAULT_TIMEOUT = 30


class YtdlpOpsOperator(BaseOperator):
    """
    Custom Airflow operator to interact with the YTDLP Thrift service.

    Handles direct connections and Redis-based discovery, retrieves tokens,
    saves info.json, and manages errors.
    """

    template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')

    @apply_defaults
    def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10,
                 service_ip=None, service_port=None, redis_enabled=False, account_id=None,
                 save_info_json=True, info_json_dir=None, get_socks_proxy=True,
                 store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, "
                    f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, "
                    f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
                    f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, "
                    f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}")

        # Validate required parameters
        if not url:
            raise ValueError("url is required")

        # Validate parameters based on connection mode
        if redis_enabled:
            if not account_id:
                raise ValueError("account_id is required when redis_enabled=True")
            # Use default Redis connection if not specified
            if not redis_conn_id:
                redis_conn_id = 'redis_default'
                logger.info(f"Using default Redis connection ID: {redis_conn_id}")
        else:
            if not service_ip or not service_port:
                raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False")
            if not account_id:
                logger.warning("No account_id provided for direct connection mode. Using 'default'.")
                account_id = 'default'  # Assign default if missing in direct mode

        self.url = url
        self.redis_conn_id = redis_conn_id
        self.max_retries = max_retries
        self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay)
        self.service_ip = service_ip
        self.service_port = service_port
        self.redis_enabled = redis_enabled
        self.account_id = account_id
        self.save_info_json = save_info_json
        self.info_json_dir = info_json_dir
        self.get_socks_proxy = get_socks_proxy
        self.store_socks_proxy = store_socks_proxy
        self.timeout = timeout

    def execute(self, context):
        logger.info("Executing YtdlpOpsOperator")
        transport = None
        try:
            logger.info("Getting task parameters")
            params = context.get('params', {})
            redis_enabled = params.get('redis_enabled', self.redis_enabled)
            logger.info(f"Using redis_enabled={redis_enabled} "
                        f"(from {'task params' if 'redis_enabled' in params else 'operator init'})")

            # Determine account_id to use (from params or operator default)
            account_id = params.get('account_id', self.account_id)
            logger.info(f"Using account_id='{account_id}' "
                        f"(from {'task params' if 'account_id' in params else 'operator init'})")

            if redis_enabled:
                # Get Redis connection with proper authentication and error handling
                redis_conn = BaseHook.get_connection(self.redis_conn_id)
                redis_client = redis.Redis(
                    host=redis_conn.host,
                    port=redis_conn.port,
                    password=redis_conn.password,
                    db=0,
                    decode_responses=True  # Important for consistent key handling
                )

                # Test Redis connection
                try:
                    if not redis_client.ping():
                        raise redis.exceptions.ConnectionError("Redis ping failed")
                    logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}")
                except redis.exceptions.AuthenticationError:
                    logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. Check password.")
                    raise AirflowException("Redis authentication failed.")
                except redis.exceptions.ConnectionError as e:
                    logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}")
                    raise AirflowException(f"Redis connection failed: {e}")
                except Exception as e:
                    logger.error(f"Unexpected Redis error: {str(e)}")
                    raise AirflowException(f"Unexpected Redis error: {e}")

                # Get service details from Redis with retries and proper key handling
                service_key = f"ytdlp:{account_id}"
                legacy_key = account_id  # For backward compatibility
                host = None
                port = None
                for attempt in range(self.max_retries):
                    try:
                        logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details "
                                    f"from Redis for keys: '{service_key}', '{legacy_key}'")
                        service_details = redis_client.hgetall(service_key)
                        if not service_details:
                            logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
                            service_details = redis_client.hgetall(legacy_key)
                        if not service_details:
                            raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")

                        # Find IP and port, handling potential case differences and byte/string types
                        ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
                        port_key = next((k for k in service_details if k.lower() == 'port'), None)
                        if not ip_key:
                            raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
                        if not port_key:
                            raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")

                        host = service_details[ip_key]  # Already decoded due to decode_responses=True
                        port_str = service_details[port_key]
                        try:
                            port = int(port_str)
                        except ValueError:
                            raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")

                        logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
                        break  # Success
                    except Exception as e:
                        logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
                        if attempt == self.max_retries - 1:
                            logger.error("Max retries reached for fetching Redis details.")
                            raise AirflowException(f"Failed to get service details from Redis after "
                                                   f"{self.max_retries} attempts: {e}")
                        logger.info(f"Retrying in {self.retry_delay} seconds...")
                        time.sleep(self.retry_delay)
            else:
                # Direct connection: log parameter sources
                host = params.get('service_ip', self.service_ip)
                host_source = 'task params' if 'service_ip' in params else 'operator init'
                port_str = params.get('service_port', self.service_port)
                port_source = 'task params' if 'service_port' in params else 'operator init'
                url = params.get('url', self.url)
                url_source = 'task params' if 'url' in params else 'operator init'
                logger.info(f"Using service_ip={host} (from {host_source})")
                logger.info(f"Using service_port={port_str} (from {port_source})")
                logger.info(f"Using url={url} (from {url_source})")

                if not host or not port_str:
                    raise ValueError("Direct connection requires service_ip and service_port")
                try:
                    port = int(port_str)
                except ValueError:
                    raise ValueError(f"Invalid service_port value: {port_str}")
                logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")

            # Render and validate timeout
            timeout_param = params.get('timeout', self.timeout)
            if isinstance(self.timeout, str) and '{{' in self.timeout:
                timeout_rendered = self.render_template(self.timeout, context)
                logger.info(f"Rendered timeout template: '{self.timeout}' -> '{timeout_rendered}'")
                timeout_param = timeout_rendered
            try:
                timeout = int(timeout_param)
                if timeout <= 0:
                    raise ValueError("Timeout must be positive")
                logger.info(f"Using timeout: {timeout} seconds")
            except (ValueError, TypeError):
                logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
                timeout = DEFAULT_TIMEOUT

            # Create Thrift connection objects
            socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)  # Explicitly use AF_INET (IPv4)
            socket_conn.setTimeout(timeout * 1000)  # Thrift timeout is in milliseconds
            transport = TTransport.TFramedTransport(socket_conn)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = YTTokenOpService.Client(protocol)

            logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
            try:
                transport.open()
                logger.info("Successfully connected to Thrift server.")

                # Test connection with ping
                try:
                    client.ping()
                    logger.info("Server ping successful.")
                except Exception as e:
                    logger.error(f"Server ping failed: {e}")
                    raise AirflowException(f"Server connection test (ping) failed: {e}")

                # Get token from service with specific error handling
                try:
                    url_param = params.get('url', self.url)
                    logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
                    token_data = client.getOrRefreshToken(
                        accountId=account_id,
                        updateType=TokenUpdateMode.AUTO,
                        url=url_param
                    )
                    logger.info("Successfully retrieved token data from service.")
                except PBServiceException as e:
                    logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, "
                                 f"Message={getattr(e, 'message', 'N/A')}")
                    error_code = getattr(e, 'errorCode', None)
                    error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
                    # Handle specific known error codes
                    if error_code in [
                        "SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT", "SOCKS5_CONNECTION_REFUSED",
                        "SOCKS5_CONNECTION_TIMEOUT", "SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
                    ]:
                        error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
                    elif error_code == "BOT_DETECTION":
                        error_msg = f"Bot detection triggered ({error_code}): {e.message}."
                        suggestions = getattr(e, 'context', {}).get('suggestions', [])
                        if suggestions:
                            error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
                    elif error_code == "NODEJS_SCRIPT_ERROR":
                        error_msg = f"Node.js script error ({error_code}): {e.message}."
                    elif error_code == "NODEJS_TIMEOUT":
                        error_msg = f"Node.js timeout ({error_code}): {e.message}."
                    # Add more specific error handling as needed
                    raise AirflowException(error_msg)
                except TTransportException as e:
                    logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
                    raise AirflowException(f"Transport error during API call: {e}")
                except Exception as e:
                    logger.error(f"Unexpected error during getOrRefreshToken: {e}")
                    raise AirflowException(f"Unexpected error during API call: {e}")
            except TTransportException as e:
                # Handle connection-specific transport errors
                if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
                    logger.error(f"Connection failed to {host}:{port}. Details: {e}")
                    logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
                    raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
                else:
                    logger.error(f"Thrift transport error during connection: {str(e)}")
                    raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
            except Exception as e:
                logger.error(f"Unexpected error during connection or ping: {str(e)}")
                raise  # Re-raise other unexpected errors

            # Log received token data attributes for debugging
            logger.debug(f"Token data received. Attributes: {dir(token_data)}")
            for attr in dir(token_data):
                if not attr.startswith('__') and not callable(getattr(token_data, attr)):
                    # Log non-callable attributes
                    value = getattr(token_data, attr)
                    if attr == 'infoJson' and value:
                        logger.debug(f"infoJson: {value[:50]}...")
                    else:
                        logger.debug(f"{attr}: {value}")

            info_json_path = None  # Initialize info_json_path
            save_info_json_param = context['params'].get('save_info_json', self.save_info_json)
            # Render if it's a string template
            if isinstance(save_info_json_param, str):
                save_info_json_rendered = self.render_template(save_info_json_param, context)
                # Convert common string representations to boolean
                save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
            else:
                save_info_json = bool(save_info_json_param)

            # Save info.json if requested and valid (use the resolved value, not the raw operator default)
            if save_info_json:
                info_json = self._get_info_json(token_data)
                if info_json and self._is_valid_json(info_json):
                    try:
                        # Use internal _save_info_json method which handles rendering, dir creation, logging
                        info_json_path = self._save_info_json(context, info_json)
                        if info_json_path:  # Check if saving was successful
                            context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
                            logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
                        else:
                            # _save_info_json logs its own errors; push None to indicate failure
                            context['task_instance'].xcom_push(key='info_json_path', value=None)
                            logger.warning("info.json saving failed (check logs from _save_info_json), "
                                           "pushing None to XCom for info_json_path.")
                    except Exception as e:
                        logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
                        context['task_instance'].xcom_push(key='info_json_path', value=None)  # Push None on error
                elif info_json:
                    logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
                    context['task_instance'].xcom_push(key='info_json_path', value=None)
                else:
                    logger.info("No infoJson found in token data. Skipping save.")
                    context['task_instance'].xcom_push(key='info_json_path', value=None)
            else:
                logger.info("save_info_json is False. Skipping info.json save.")
                context['task_instance'].xcom_push(key='info_json_path', value=None)

            # Extract and potentially store SOCKS proxy
            socks_proxy = None
            if self.get_socks_proxy:  # Use instance attribute
                # Check for common attribute names for proxy
                proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks']
                                   if hasattr(token_data, attr)), None)
                if proxy_attr:
                    socks_proxy = getattr(token_data, proxy_attr)
                    if socks_proxy:  # Ensure proxy value is not empty
                        logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
                        if self.store_socks_proxy:  # Use instance attribute
                            context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
                            logger.info(f"Pushed key 'socks_proxy' to XCom with value: {socks_proxy}")
                        else:
                            logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")
                    else:
                        logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty. No proxy extracted.")
                        # Push None even if found but empty, if storing is enabled
                        if self.store_socks_proxy:  # Use instance attribute
                            context['task_instance'].xcom_push(key='socks_proxy', value=None)
                            logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
                else:
                    logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
                    # Push None if storing is enabled but attribute not found
                    if self.store_socks_proxy:  # Use instance attribute
                        context['task_instance'].xcom_push(key='socks_proxy', value=None)
                        logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
            else:
                logger.info("get_socks_proxy is False. Skipping proxy extraction.")
                # Push None if storing is enabled but extraction was skipped
                if self.store_socks_proxy:  # Use instance attribute
                    context['task_instance'].xcom_push(key='socks_proxy', value=None)
                    logger.info("Pushed None to XCom for 'socks_proxy' as get_socks_proxy=False.")

            # Get the original command from the server
            ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
            if not ytdlp_cmd:
                logger.error("No 'ytdlpCommand' attribute found in token data.")
                raise AirflowException("Required 'ytdlpCommand' not received from service.")
            logger.info(f"Original command received from server: {ytdlp_cmd}")

            # Log example usage command (DO NOT MODIFY the original command here)
            if info_json_path:
                # Use double quotes for paths/proxy in example for robustness
                example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
                if socks_proxy:
                    example_cmd += f" --proxy \"{socks_proxy}\""
                example_cmd += " --verbose --simulate"  # Add useful flags for testing
                logger.info("\n--- Example usage with saved info.json ---")
                logger.info(example_cmd)
                logger.info("(Note: The actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
                latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
                logger.info(f"(You can also use 'latest.json': {latest_json_path})")
                logger.info("-------------------------------------------\n")
            else:
                logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
                if socks_proxy:
                    logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if "
                                f"store_socks_proxy=True) with the --proxy flag.")
                logger.info("Add --verbose and --simulate flags for testing the command.")
                logger.info("-------------------------------------------------------\n")

            # Push the *original* command to XCom
            context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
            logger.info("Pushed original command to XCom key 'ytdlp_command'.")

            # Note: Returning ytdlp_cmd below implicitly pushes the same value
            # to XCom under the key 'return_value'. Downstream tasks should
            # preferably use the explicitly pushed 'ytdlp_command' key for clarity.
            return ytdlp_cmd  # Return the original command

        except AirflowException as e:
            # Catch AirflowExceptions raised explicitly in the code above
            logger.error(f"Operation failed due to AirflowException: {e}")
            raise  # Re-raise AirflowExceptions to ensure task failure
        except (TTransportException, PBServiceException) as e:
            # Catch specific Thrift/Service errors not already wrapped
            logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True)  # Add traceback for context
            raise AirflowException(f"Unhandled YTDLP service error: {e}")  # Wrap in AirflowException
        except Exception as e:
            # General catch-all for truly unexpected errors; log with traceback
            logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True)
            # Ensure any unexpected error explicitly fails the task with AirflowException
            raise AirflowException(f"Unexpected error caused task failure: {e}")
        finally:
            # Check if transport exists and is open before closing
            if transport and transport.isOpen():
                logger.info("Closing Thrift transport.")
                transport.close()

    # --- Helper Methods ---

    def _get_info_json(self, token_data):
        """Safely extracts infoJson from token data."""
        info_json = getattr(token_data, 'infoJson', None)
        if info_json:
            logger.debug("Extracted infoJson from token data.")
        else:
            logger.debug("No infoJson attribute found in token data.")
        return info_json

    def _is_valid_json(self, json_str):
        """Checks if a string is valid JSON."""
        if not json_str or not isinstance(json_str, str):
            logger.debug("Input is not a non-empty string, considered invalid JSON.")
            return False
        try:
            json.loads(json_str)
            logger.debug("JSON string validation successful.")
            return True
        except json.JSONDecodeError as e:
            logger.warning(f"JSON validation failed: {e}")
            return False

    def _save_info_json(self, context, info_json):
        """Saves info_json to a file, handling directory creation and logging.

        Returns the path on success, None on failure.
        """
        try:
            # Get URL from params/context for video ID extraction
            url_param = context.get('params', {}).get('url', self.url)
            video_id = self._extract_video_id(url_param)  # Use internal helper

            # Render the info_json_dir template (default to current dir if template is None or empty string)
            save_dir_template = self.info_json_dir or "."
            save_dir = self.render_template(save_dir_template, context)
            if not save_dir:
                # Handle case where template renders to empty string
                logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in "
                               f"an empty path. Defaulting to '.'")
                save_dir = "."
            logger.info(f"Target directory for info.json (rendered): {save_dir}")

            # Ensure directory exists
            try:
                os.makedirs(save_dir, exist_ok=True)
                logger.info(f"Ensured directory exists: {save_dir}")
            except OSError as e:
                logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
                return None  # Indicate failure

            # Construct filename (using potentially overridden account_id)
            account_id_param = context.get('params', {}).get('account_id', self.account_id)
            timestamp = int(time.time())
            base_filename = (f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id
                             else f"info_{account_id_param}_{timestamp}.json")
            info_json_path = os.path.join(save_dir, base_filename)
            latest_json_path = os.path.join(save_dir, "latest.json")  # Path for the latest symlink/copy

            # Write to timestamped file
            try:
                logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
                with open(info_json_path, 'w', encoding='utf-8') as f:
                    f.write(info_json)
                logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
            except IOError as e:
                logger.error(f"Failed to write info.json to {info_json_path}: {e}")
                return None  # Indicate failure

            # Write to latest.json (overwrite) - best effort
            try:
                with open(latest_json_path, 'w', encoding='utf-8') as f:
                    f.write(info_json)
                logger.info(f"Updated latest.json file: {latest_json_path}")
            except IOError as e:
                # Log warning but don't fail the whole save if only latest.json fails
                logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")

            return info_json_path  # Return path on success (even if latest.json failed)
        except Exception as e:
            logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
            return None  # Indicate failure

    def _extract_video_id(self, url):
        """Extracts YouTube video ID from URL (internal helper)."""
        if not url or not isinstance(url, str):
            logger.debug("URL is empty or not a string, cannot extract video ID.")
            return None
        try:
            # Basic extraction logic (can be enhanced for more URL types)
            video_id = None
            if 'youtube.com/watch?v=' in url:
                video_id = url.split('v=')[1].split('&')[0]
            elif 'youtu.be/' in url:
                video_id = url.split('youtu.be/')[1].split('?')[0]
            # Ensure it looks like a video ID (typically 11 chars, but can vary)
            if video_id and len(video_id) >= 11:
                video_id = video_id[:11]  # Take first 11 chars as standard ID length
                logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
                return video_id
            else:
                logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
                return None
        except Exception as e:
            logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
            return None


# =============================================================================
# Python Callables for Tasks
# =============================================================================

def display_token_info(**context):
    """Displays token info from XCom, parses info.json, and logs example commands."""
    ti = context['task_instance']
    logger.info("Starting display_token_info task.")

    # Pull data from XCom (provide default values)
    info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
    socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
    ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')

    logger.info("\n=== Pulled Token Information from XCom ===")
    logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}")
    logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}")
    logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}")

    result = {
        'info_path': info_json_path,
        'proxy': socks_proxy,
        'ytdlp_command': ytdlp_command,
        'video_info': None,
        'commands': {},
        'error': None
    }

    if info_json_path and os.path.exists(info_json_path):
        logger.info(f"\n=== Processing Video Information from: {info_json_path} ===")
        try:
            with open(info_json_path, 'r', encoding='utf-8') as f:
                info = json.load(f)

            # Extract and log basic video info safely
            title = info.get('title', 'Unknown Title')
            uploader = info.get('uploader', 'Unknown Author')
            duration = info.get('duration_string', 'Unknown Length')
            upload_date_str = info.get('upload_date')  # Format: YYYYMMDD
            upload_date_formatted = 'Unknown Date'
            if upload_date_str:
                try:
                    # Validate format before parsing
                    if len(upload_date_str) == 8 and upload_date_str.isdigit():
                        upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d')
                    else:
                        logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.")
                except ValueError:
                    logger.warning(f"Could not parse upload_date '{upload_date_str}'")

            result['video_info'] = {
                'title': title,
                'uploader': uploader,
                'upload_date': upload_date_formatted,  # Store formatted date
                'duration': duration
            }
            logger.info(f"Title: {title}")
            logger.info(f"Author: {uploader}")
            logger.info(f"Date: {upload_date_formatted}")
            logger.info(f"Length: {duration}")

            logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===")
            base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
            if socks_proxy:
                base_cmd += f" --proxy \"{socks_proxy}\""

            # Command to list formats
            format_cmd = f"{base_cmd} -F"
            result['commands']['format'] = format_cmd
            logger.info(f"List formats command: {format_cmd}")

            # Execute and log the format listing command
            logger.info("\n--- Executing Format List Command ---")
            try:
                # Use os.popen for simplicity, capture output
                logger.info(f"Running: {format_cmd}")
                format_output = os.popen(format_cmd).read()
                logger.info("--- Format List Output ---")
                logger.info(format_output)
                logger.info("--------------------------")
            except Exception as e:
                logger.error(f"Error executing format command: {e}")

            # Command to simulate download
            simulate_cmd = f"{base_cmd} --simulate --verbose"  # Add verbose for more info
            result['commands']['simulate'] = simulate_cmd
            logger.info(f"Simulate download command: {simulate_cmd}")

            # Execute and log the simulation command
            logger.info("\n--- Executing Simulation Command ---")
            try:
                logger.info(f"Running: {simulate_cmd}")
                simulate_output = os.popen(simulate_cmd).read()
                logger.info("--- Simulation Output ---")
                logger.info(simulate_output)
                logger.info("-------------------------")
            except Exception as e:
                logger.error(f"Error executing simulation command: {e}")

            # Basic download command
            download_cmd = base_cmd
            result['commands']['download_base'] = download_cmd
            logger.info(f"Base download command (add format selection, output path): {download_cmd}")

            # Push generated example commands to XCom for potential downstream use
            # ti.xcom_push(key='format_cmd', value=format_cmd)      # Removed as requested
            # ti.xcom_push(key='simulate_cmd', value=simulate_cmd)  # Removed as requested
            ti.xcom_push(key='download_cmd', value=download_cmd)
            logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}")

        except json.JSONDecodeError as e:
            error_msg = f"Failed to parse info.json file '{info_json_path}': {e}"
            logger.error(error_msg)
            result['error'] = error_msg
        except FileNotFoundError:
            error_msg = f"Info.json file not found at path: {info_json_path}"
            logger.error(error_msg)
            result['error'] = error_msg
        except Exception as e:
            error_msg = f"Error processing info.json file '{info_json_path}': {str(e)}"
            logger.error(error_msg, exc_info=True)
            result['error'] = error_msg
    elif info_json_path:
        error_msg = f"Info.json path provided ('{info_json_path}') but file does not exist."
        logger.warning(error_msg)
        result['error'] = error_msg
    else:
        logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.")
        result['error'] = "Info.json path not available."

    logger.info("Finished display_token_info task.")
    # Return the collected information (useful if used as a PythonOperator return value)
    return json.dumps(result)  # Return as JSON string for XCom compatibility if needed


def store_token_info(**context):
    """Stores retrieved token information (command, proxy, info.json) in Redis."""
    ti = context['task_instance']
    # Use the redis_conn_id defined in the operator/DAG params if possible, else default
    redis_conn_id = context['params'].get('redis_conn_id', 'redis_default')
    redis_hook = RedisHook(redis_conn_id=redis_conn_id)
    logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.")

    try:
        # Pull necessary data from XCom and context
        url = context['params'].get('url')
        if not url:
            # Attempt to get URL from DAG run conf as fallback
            dag_run = context.get('dag_run')
            url = dag_run.conf.get('url') if dag_run and dag_run.conf else None
            if not url:
                raise ValueError("URL parameter is missing in context['params'] and dag_run.conf")
            logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.")

        ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
        socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or ''  # Default to empty string if None
        info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')

        if not ytdlp_command:
            logger.warning("ytdlp_command not found in XCom. Storing empty value.")
            ytdlp_command = ''  # Store empty if not found

        # Construct the base command using info.json
        ytdlp_command_base = ''
        if info_json_path and os.path.exists(info_json_path):
            ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\""
            logger.info(f"Constructed base command: {ytdlp_command_base}")
        else:
            logger.warning("Cannot construct base command: info_json_path not valid.")

        # Construct the command with tokens and proxy
        ytdlp_command_tokens = ytdlp_command  # Start with original command from server
        if socks_proxy:
            ytdlp_command_tokens += f" --proxy \"{socks_proxy}\""
            logger.info("Appended proxy to token command.")

        data_to_store = {
            'url': url,
            'ytdlp_command': ytdlp_command_base,  # Store the base command
            'proxy': socks_proxy,
            'info_json_path': info_json_path or ''  # Store path even if None/empty
            # 'info_json' will be added below
        }

        # Read info.json content if path exists
        info_json_content = None
        if info_json_path and os.path.exists(info_json_path):
            try:
                with open(info_json_path, 'r', encoding='utf-8') as f:
                    # Read and immediately validate JSON structure before storing
                    info_json_content = json.load(f)
                # Store the validated JSON as a string
                data_to_store['info_json'] = json.dumps(info_json_content)
                logger.info(f"Read and validated info.json content from: {info_json_path}")
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.")
                data_to_store['info_json'] = ''  # Store empty string on parse error
            except Exception as e:
                logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.")
                data_to_store['info_json'] = ''  # Store empty string on other read errors
        else:
            logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.")
            data_to_store['info_json'] = ''  # Store empty string if no path

        with redis_hook.get_conn() as redis_client:
            # Extract video ID from URL (same basic logic as the operator's
            # _extract_video_id helper, re-implemented here for simplicity)
            video_id = None
            try:
                if 'youtube.com/watch?v=' in url:
                    video_id = url.split('v=')[1].split('&')[0][:11]
                elif 'youtu.be/' in url:
                    video_id = url.split('youtu.be/')[1].split('?')[0][:11]
            except Exception:
                pass  # Ignore errors in ID extraction for key generation

            # Use video ID as part of the Redis key
            redis_key = f"token_info:{video_id or 'unknown'}"
            logger.info(f"Determined Redis key: {redis_key}")

            # Add video_id, timestamp, and the constructed ytdlp_command_tokens
            data_to_store['video_id'] = video_id or 'unknown'
            data_to_store['timestamp'] = int(time.time())
            data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens  # Store the original token command

            # Log fields being stored (truncate large string values)
            log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v)
                        for k, v in data_to_store.items()}
            logger.info(f"Storing in Redis key '{redis_key}': {log_data}")

            # Store data in Redis hash
            redis_client.hset(redis_key, mapping=data_to_store)
            # Set expiration (e.g., 24 hours = 86400 seconds)
            redis_client.expire(redis_key, 86400)
            logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.")

            # Log the final stored data again for clarity
            final_log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v)
                              for k, v in data_to_store.items()}
            logger.info(f"--- Final Data Stored in Redis Key '{redis_key}' ---")
            logger.info(final_log_data)
            logger.info("----------------------------------------------------")

    except Exception as e:
        logger.error(f"Failed to store token info in Redis: {e}", exc_info=True)
        # Re-raise as AirflowException to fail the task
        raise AirflowException(f"Failed to store token info in Redis: {e}")

    logger.info("Finished store_token_info task.")
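
# Example (hedged sketch, kept as comments so nothing runs at DAG-parse time):
# a downstream consumer could read back the hash written by store_token_info
# using the same RedisHook connection ID. The key layout ('token_info:<video_id>')
# and field names match the data_to_store mapping above; the function name
# read_token_info is hypothetical.
#
#   def read_token_info(video_id, redis_conn_id='redis_default'):
#       hook = RedisHook(redis_conn_id=redis_conn_id)
#       client = hook.get_conn()
#       raw = client.hgetall(f"token_info:{video_id}")
#       # redis-py returns bytes unless decode_responses=True was set on the client
#       return {k.decode(): v.decode() for k, v in raw.items()}
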

# =============================================================================
# DAG Definition
# =============================================================================

# Default arguments (match ytdlp_client_dag.py structure)
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,            # Match reference DAG
    'email_on_retry': False,              # Match reference DAG
    'retries': 1,                         # Default task retries
    'retry_delay': timedelta(minutes=5),  # Standard task retry delay
    'start_date': days_ago(1)             # Best practice start date
}

with DAG(
    dag_id='ytdlp_client_dag_v2.1',
    default_args=default_args,
    schedule_interval=None,  # Manually triggered DAG
    catchup=False,           # Don't run for past missed schedules
    description='DAG for YTDLP operations using Thrift client (V2 - Refactored)',
    tags=['ytdlp', 'thrift', 'client', 'v2'],  # Tags for easier filtering in the UI
    params={
        # Define DAG parameters with defaults and types for UI clarity
        'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"],
                     description="Required: The video URL to process."),
        'redis_enabled': Param(False, type="boolean",
                               description="Use Redis for service discovery? If False, uses service_ip/port."),
        'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."),
        'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."),
        'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string",
                            description="Account ID for Redis lookup or direct call."),
        'timeout': Param(DEFAULT_TIMEOUT, type="integer",
                         description="Timeout in seconds for the Thrift connection."),
        # Use Airflow Variable for downloads directory, matching reference DAG structure
        'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string",
                               description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.")
    }
) as dag:

    # Define tasks
    get_token = YtdlpOpsOperator(
        task_id='get_token',
        # Pass templated parameters from DAG run config
        url="{{ params.url }}",
        redis_enabled="{{ params.redis_enabled }}",
        service_ip="{{ params.service_ip }}",
        service_port="{{ params.service_port }}",
        account_id="{{ params.account_id }}",
        save_info_json=True,
        info_json_dir="{{ params.info_json_dir }}",
        get_socks_proxy=True,
        store_socks_proxy=True,
        timeout="{{ params.timeout }}",
        retries=MAX_RETRIES,      # Operator-specific retries if needed, else use DAG default
        retry_delay=RETRY_DELAY,  # Operator-specific delay if needed
        # Add callbacks for logging success/failure, similar to reference DAG
        on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."),
        on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.")
    )

    # Add task documentation (visible in Airflow UI)
    get_token.doc_md = """
    ### Get Token Task
    Connects to the YTDLP Thrift service (either directly or via Redis discovery) to retrieve
    an authentication token and video metadata (info.json).

    **Pushes to XCom:**
    - `info_json_path`: Path to the saved info.json file (or None if not saved/failed).
    - `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found).
    - `ytdlp_command`: The original command string received from the server (contains tokens/cookies).

    Uses parameters defined in the DAG run configuration.
    """

    # Optional: Add a task to explicitly check XComs for debugging (like in reference DAG)
    def _check_xcom_callable(**context):
        """Logs XCom values pushed by the get_token task."""
        ti = context['task_instance']
        logger.info("--- Checking XCom values pushed by get_token ---")
        keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command']
        xcom_values = {}
        for key in keys_to_check:
            value = ti.xcom_pull(task_ids='get_token', key=key)
            xcom_values[key] = value
            # Avoid logging potentially sensitive command details fully in production
            if key == 'ytdlp_command' and value:
                log_value = f"{value[:50]}..."  # Log truncated command
            else:
                log_value = value
            logger.info(f"XCom key='{key}': {log_value}")
        logger.info("----------------------------------------------")
        return xcom_values  # Return values for potential future use

    check_xcom_task = PythonOperator(
        task_id='check_xcom_after_get_token',
        python_callable=_check_xcom_callable,
    )
    check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes."

    display_info = PythonOperator(
        task_id='display_token_info',
        python_callable=display_token_info,
        trigger_rule='all_success'
    )
    display_info.doc_md = """
    ### Display Token Info Task
    Pulls information from XCom, parses the `info.json` file (if available), logs video details,
    and generates example `yt-dlp` commands.

    **Pulls from XCom (task_id='get_token'):**
    - `info_json_path`
    - `socks_proxy`
    - `ytdlp_command`

    **Pushes to XCom:**
    - `download_cmd`: Base command using `--load-info-json` (user needs to add format/output).
    """

    store_info = PythonOperator(
        task_id='store_token_info',  # Use consistent task ID naming
        python_callable=store_token_info,
    )
    store_info.doc_md = """
    ### Store Token Info Task
    Pulls information from XCom and DAG parameters, reads the `info.json` content,
    and stores relevant data in a Redis hash.

    **Pulls from XCom (task_id='get_token'):**
    - `ytdlp_command`
    - `socks_proxy`
    - `info_json_path`

    **Pulls from DAG context:**
    - `params['url']` (or `dag_run.conf['url']`)

    **Stores in Redis Hash (key: `token_info:<video_id>`):**
    - `url`: The video URL.
    - `ytdlp_command`: Base command using `--load-info-json`.
    - `proxy`: The SOCKS proxy string.
    - `info_json_path`: Path to the saved info.json file.
    - `info_json`: The full content of the info.json file (as a JSON string).
    - `video_id`: Extracted video ID.
    - `timestamp`: Unix timestamp of storage.
    - `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies).

    Sets a 24-hour expiration on the Redis key.
    """

    # Define task dependencies matching the reference DAG structure
    get_token >> check_xcom_task >> display_info >> store_info
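
# Example (assumption: an Airflow 2.x CLI is available on the scheduler host): the DAG is
# schedule-less and intended to be triggered manually, with params overridden via run config,
# e.g.:
#
#   airflow dags trigger ytdlp_client_dag_v2.1 \
#       --conf '{"url": "https://www.youtube.com/watch?v=<VIDEO_ID>", "redis_enabled": false}'
#
# Any param not supplied in --conf falls back to the defaults declared in params= above.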