from airflow import DAG
from airflow.models import BaseOperator
from airflow.models.param import Param
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from airflow.operators.python import PythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago

from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from thrift.transport.TTransport import TTransportException

from datetime import datetime, timedelta
import redis
import logging
import time
import socket
import json
import os

from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.common.ttypes import TokenUpdateMode
from pangramia.yt.exceptions.ttypes import PBServiceException

# Assuming ytdlp_utils exists in the same directory or PYTHONPATH
# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id

# Configure logging
logger = logging.getLogger(__name__)

# Default settings (similar to ytdlp_client_dag.py)
MAX_RETRIES = 1
RETRY_DELAY = timedelta(seconds=10)
DEFAULT_TIMEOUT = 30

class YtdlpOpsOperator(BaseOperator):
    """
    Custom Airflow operator to interact with the YTDLP Thrift service. Handles direct
    connections and Redis-based discovery, retrieves tokens, saves info.json, and
    manages errors.
    """
    template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')

    def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10,
                 service_ip=None, service_port=None, redis_enabled=False, account_id=None,
                 save_info_json=True, info_json_dir=None, get_socks_proxy=True,
                 store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
        super().__init__(*args, **kwargs)

        logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, "
                    f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, "
                    f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
                    f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, "
                    f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}")

        # Validate required parameters
        if not url:
            raise ValueError("url is required")

        # Validate parameters based on the connection mode
        if redis_enabled:
            if not account_id:
                raise ValueError("account_id is required when redis_enabled=True")
            # Use the default Redis connection if none was specified
            if not redis_conn_id:
                redis_conn_id = 'redis_default'
                logger.info(f"Using default Redis connection ID: {redis_conn_id}")
        else:
            if not service_ip or not service_port:
                raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False")
            if not account_id:
                logger.warning("No account_id provided for direct connection mode. Using 'default'")
                account_id = 'default'  # Assign a default if missing in direct mode

        self.url = url
        self.redis_conn_id = redis_conn_id
        self.max_retries = max_retries
        self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay)
        self.service_ip = service_ip
        self.service_port = service_port
        self.redis_enabled = redis_enabled
        self.account_id = account_id
        self.save_info_json = save_info_json
        self.info_json_dir = info_json_dir
        self.get_socks_proxy = get_socks_proxy
        self.store_socks_proxy = store_socks_proxy
        self.timeout = timeout

    def execute(self, context):
        logger.info("Executing YtdlpOpsOperator")
        transport = None
        try:
            logger.info("Getting task parameters")
            params = context.get('params', {})
            redis_enabled = params.get('redis_enabled', self.redis_enabled)
            logger.info(f"Using redis_enabled={redis_enabled} (from {'task params' if 'redis_enabled' in params else 'operator init'})")

            # Determine the account_id to use (from params or operator default)
            account_id = params.get('account_id', self.account_id)
            logger.info(f"Using account_id='{account_id}' (from {'task params' if 'account_id' in params else 'operator init'})")

            if redis_enabled:
                # Get the Redis connection with proper authentication and error handling
                redis_conn = BaseHook.get_connection(self.redis_conn_id)
                redis_client = redis.Redis(
                    host=redis_conn.host,
                    port=redis_conn.port,
                    password=redis_conn.password,
                    db=0,
                    decode_responses=True  # Important for consistent key handling
                )

                # Test the Redis connection
                try:
                    if not redis_client.ping():
                        raise redis.exceptions.ConnectionError("Redis ping failed")
                    logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}")
                except redis.exceptions.AuthenticationError:
                    logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. Check password.")
                    raise AirflowException("Redis authentication failed.")
                except redis.exceptions.ConnectionError as e:
                    logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}")
                    raise AirflowException(f"Redis connection failed: {e}")
                except Exception as e:
                    logger.error(f"Unexpected Redis error: {e}")
                    raise AirflowException(f"Unexpected Redis error: {e}")
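
                # Illustrative sketch of the discovery contract assumed below: some
                # external process is expected to have registered the service as a
                # Redis hash keyed by account id (field names are matched
                # case-insensitively), e.g.
                #
                #   redis-cli HSET "ytdlp:<account_id>" ip 10.0.0.5 port 9090
                #
                # The legacy layout uses the bare account id as the key. Host and
                # port above are placeholders, not real endpoints.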

                # Get service details from Redis with retries and proper key handling
                service_key = f"ytdlp:{account_id}"
                legacy_key = account_id  # For backward compatibility

                host = None
                port = None
                for attempt in range(self.max_retries):
                    try:
                        logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
                        service_details = redis_client.hgetall(service_key)
                        if not service_details:
                            logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
                            service_details = redis_client.hgetall(legacy_key)

                        if not service_details:
                            raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")

                        # Find IP and port, tolerating case differences in field names
                        ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
                        port_key = next((k for k in service_details if k.lower() == 'port'), None)

                        if not ip_key:
                            raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
                        if not port_key:
                            raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")

                        host = service_details[ip_key]  # Already decoded due to decode_responses=True
                        port_str = service_details[port_key]

                        try:
                            port = int(port_str)
                        except ValueError:
                            raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")

                        logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
                        break  # Success

                    except Exception as e:
                        logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {e}")
                        if attempt == self.max_retries - 1:
                            logger.error("Max retries reached for fetching Redis details.")
                            raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}")
                        logger.info(f"Retrying in {self.retry_delay} seconds...")
                        time.sleep(self.retry_delay)
            else:
                # Direct connection: log where each parameter came from
                host = params.get('service_ip', self.service_ip)
                host_source = 'task params' if 'service_ip' in params else 'operator init'
                port_str = params.get('service_port', self.service_port)
                port_source = 'task params' if 'service_port' in params else 'operator init'
                url = params.get('url', self.url)
                url_source = 'task params' if 'url' in params else 'operator init'

                logger.info(f"Using service_ip={host} (from {host_source})")
                logger.info(f"Using service_port={port_str} (from {port_source})")
                logger.info(f"Using url={url} (from {url_source})")

                if not host or not port_str:
                    raise ValueError("Direct connection requires service_ip and service_port")
                try:
                    port = int(port_str)
                except ValueError:
                    raise ValueError(f"Invalid service_port value: {port_str}")

                logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")

            # Resolve and validate the timeout ('timeout' is a template field,
            # so self.timeout is already rendered at this point)
            timeout_param = params.get('timeout', self.timeout)
            try:
                timeout = int(timeout_param)
                if timeout <= 0:
                    raise ValueError("Timeout must be positive")
                logger.info(f"Using timeout: {timeout} seconds")
            except (ValueError, TypeError):
                logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
                timeout = DEFAULT_TIMEOUT

            # Create Thrift connection objects
            socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)  # Explicitly use IPv4
            socket_conn.setTimeout(timeout * 1000)  # Thrift timeout is in milliseconds
            transport = TTransport.TFramedTransport(socket_conn)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = YTTokenOpService.Client(protocol)
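
            # Note (an assumption based on typical Thrift deployments, not verified
            # against this service): TFramedTransport must match the server's
            # transport. A buffered-transport server tends to fail with errors like
            # "read 0 bytes", which the handler below treats as a connection failure.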
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
|
|
try:
|
|
transport.open()
|
|
logger.info("Successfully connected to Thrift server.")
|
|
|
|
# Test connection with ping
|
|
try:
|
|
client.ping()
|
|
logger.info("Server ping successful.")
|
|
except Exception as e:
|
|
logger.error(f"Server ping failed: {e}")
|
|
raise AirflowException(f"Server connection test (ping) failed: {e}")
|
|
|
|
# Get token from service with specific error handling
|
|
try:
|
|
url_param = context.get('params', {}).get('url', self.url)
|
|
logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
|
|
token_data = client.getOrRefreshToken(
|
|
accountId=account_id,
|
|
updateType=TokenUpdateMode.AUTO,
|
|
url=url_param
|
|
)
|
|
logger.info("Successfully retrieved token data from service.")
|
|
except PBServiceException as e:
|
|
logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}")
|
|
error_code = getattr(e, 'errorCode', None)
|
|
error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
|
|
# Handle specific known error codes
|
|
if error_code in [
|
|
"SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT",
|
|
"SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT",
|
|
"SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
|
|
]:
|
|
error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
|
|
elif error_code == "BOT_DETECTION":
|
|
error_msg = f"Bot detection triggered ({error_code}): {e.message}."
|
|
suggestions = getattr(e, 'context', {}).get('suggestions', [])
|
|
if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
|
|
elif error_code == "NODEJS_SCRIPT_ERROR":
|
|
error_msg = f"Node.js script error ({error_code}): {e.message}."
|
|
elif error_code == "NODEJS_TIMEOUT":
|
|
error_msg = f"Node.js timeout ({error_code}): {e.message}."
|
|
# Add more specific error handling as needed
|
|
raise AirflowException(error_msg)
|
|
except TTransportException as e:
|
|
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
|
|
raise AirflowException(f"Transport error during API call: {e}")
|
|
except Exception as e:
|
|
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
|
|
raise AirflowException(f"Unexpected error during API call: {e}")
|
|
|
|
except TTransportException as e:
|
|
# Handle connection-specific transport errors
|
|
if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
|
|
logger.error(f"Connection failed to {host}:{port}. Details: {e}")
|
|
logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
|
|
raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
|
|
else:
|
|
logger.error(f"Thrift transport error during connection: {str(e)}")
|
|
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
|
|
except Exception as e:
|
|
logger.error(f"Unexpected error during connection or ping: {str(e)}")
|
|
raise # Re-raise other unexpected errors
|
|
|
|

            # Log received token data attributes for debugging
            logger.debug(f"Token data received. Attributes: {dir(token_data)}")
            for attr in dir(token_data):
                if not attr.startswith('__') and not callable(getattr(token_data, attr)):  # Non-callable attributes only
                    value = getattr(token_data, attr)
                    if attr == 'infoJson' and value:
                        logger.debug(f"infoJson: {value[:50]}...")
                    else:
                        logger.debug(f"{attr}: {value}")

            info_json_path = None  # Initialize info_json_path

            save_info_json_param = params.get('save_info_json', self.save_info_json)
            # Render if it's a string template
            if isinstance(save_info_json_param, str):
                save_info_json_rendered = self.render_template(save_info_json_param, context)
                # Convert common string representations to a boolean
                save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
            else:
                save_info_json = bool(save_info_json_param)

            # Save info.json if requested and valid
            if save_info_json:  # Use the resolved value, not the raw operator attribute
                info_json = self._get_info_json(token_data)
                if info_json and self._is_valid_json(info_json):
                    try:
                        # _save_info_json handles rendering, directory creation, and logging
                        info_json_path = self._save_info_json(context, info_json)
                        if info_json_path:  # Saving succeeded
                            context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
                            logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
                        else:
                            # _save_info_json logs its own errors; push None to signal failure
                            context['task_instance'].xcom_push(key='info_json_path', value=None)
                            logger.warning("info.json saving failed (see _save_info_json logs); pushed None to XCom for info_json_path.")
                    except Exception as e:
                        logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
                        context['task_instance'].xcom_push(key='info_json_path', value=None)  # Push None on error
                elif info_json:
                    logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
                    context['task_instance'].xcom_push(key='info_json_path', value=None)
                else:
                    logger.info("No infoJson found in token data. Skipping save.")
                    context['task_instance'].xcom_push(key='info_json_path', value=None)
            else:
                logger.info("save_info_json is False. Skipping info.json save.")
                context['task_instance'].xcom_push(key='info_json_path', value=None)

            # Extract and optionally store the SOCKS proxy
            socks_proxy = None
            if self.get_socks_proxy:
                # Check common attribute names for the proxy
                proxy_attr = next((attr for attr in ('socks5Proxy', 'socksProxy', 'socks') if hasattr(token_data, attr)), None)
                if proxy_attr:
                    socks_proxy = getattr(token_data, proxy_attr) or None
                    if socks_proxy:
                        logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
                    else:
                        logger.info(f"Found proxy attribute '{proxy_attr}' but its value is empty. No proxy extracted.")
                else:
                    logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
            else:
                logger.info("get_socks_proxy is False. Skipping proxy extraction.")

            if self.store_socks_proxy:
                # Always push the key (even when None) so downstream tasks can rely on it
                context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
                logger.info(f"Pushed 'socks_proxy' to XCom: {socks_proxy}")
            elif socks_proxy:
                logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")

            # Get the original command from the server
            ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
            if not ytdlp_cmd:
                logger.error("No 'ytdlpCommand' attribute found in token data.")
                raise AirflowException("Required 'ytdlpCommand' not received from service.")

            logger.info(f"Original command received from server: {ytdlp_cmd}")

            # Log an example usage command (do NOT modify the original command here)
            if info_json_path:
                # Use double quotes around paths/proxy in the example for robustness
                example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
                if socks_proxy:
                    example_cmd += f" --proxy \"{socks_proxy}\""
                example_cmd += " --verbose --simulate"  # Useful flags for testing
                logger.info("\n--- Example usage with saved info.json ---")
                logger.info(example_cmd)
                logger.info("(Note: the actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
                latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
                logger.info(f"(You can also use 'latest.json': {latest_json_path})")
                logger.info("-------------------------------------------\n")
            else:
                logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
                if socks_proxy:
                    logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.")
                logger.info("Add --verbose and --simulate flags for testing the command.")
                logger.info("-------------------------------------------------------\n")

            # Push the *original* command to XCom
            context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
            logger.info("Pushed original command to XCom key 'ytdlp_command'.")

            # Note: returning ytdlp_cmd below also pushes the same value to XCom
            # under 'return_value'. Downstream tasks should prefer the explicit
            # 'ytdlp_command' key for clarity.
            return ytdlp_cmd  # Return the original command

        except AirflowException as e:  # AirflowExceptions raised explicitly above
            logger.error(f"Operation failed due to AirflowException: {e}")
            raise  # Re-raise to ensure task failure
        except (TTransportException, PBServiceException) as e:  # Thrift/service errors not already wrapped
            logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True)
            raise AirflowException(f"Unhandled YTDLP service error: {e}")
        except Exception as e:  # Catch-all for truly unexpected errors
            logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True)
            raise AirflowException(f"Unexpected error caused task failure: {e}")
        finally:
            if transport and transport.isOpen():  # Close the transport if it is still open
                logger.info("Closing Thrift transport.")
                transport.close()

    # --- Helper Methods ---

    def _get_info_json(self, token_data):
        """Safely extracts infoJson from token data."""
        info_json = getattr(token_data, 'infoJson', None)
        if info_json:
            logger.debug("Extracted infoJson from token data.")
        else:
            logger.debug("No infoJson attribute found in token data.")
        return info_json

    def _is_valid_json(self, json_str):
        """Checks whether a string is valid JSON."""
        if not json_str or not isinstance(json_str, str):
            logger.debug("Input is not a non-empty string; considered invalid JSON.")
            return False
        try:
            json.loads(json_str)
            logger.debug("JSON string validation successful.")
            return True
        except json.JSONDecodeError as e:
            logger.warning(f"JSON validation failed: {e}")
            return False

    def _save_info_json(self, context, info_json):
        """Saves info_json to a file, handling directory creation and logging.
        Returns the path on success, None on failure."""
        try:
            # Get the URL from params/context for video ID extraction
            url_param = context.get('params', {}).get('url', self.url)
            video_id = self._extract_video_id(url_param)

            # Render the info_json_dir template
            save_dir_template = self.info_json_dir or "."  # Default to the current dir if unset
            save_dir = self.render_template(save_dir_template, context)
            if not save_dir:  # Template rendered to an empty string
                logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in an empty path. Defaulting to '.'")
                save_dir = "."
            logger.info(f"Target directory for info.json (rendered): {save_dir}")

            # Ensure the directory exists
            try:
                os.makedirs(save_dir, exist_ok=True)
                logger.info(f"Ensured directory exists: {save_dir}")
            except OSError as e:
                logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
                return None  # Indicate failure

            # Construct the filename (using the potentially overridden account_id)
            account_id_param = context.get('params', {}).get('account_id', self.account_id)
            timestamp = int(time.time())
            base_filename = f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id else f"info_{account_id_param}_{timestamp}.json"
            info_json_path = os.path.join(save_dir, base_filename)
            latest_json_path = os.path.join(save_dir, "latest.json")  # Path for the 'latest' copy

            # Write the timestamped file
            try:
                logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
                with open(info_json_path, 'w', encoding='utf-8') as f:
                    f.write(info_json)
                logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
            except IOError as e:
                logger.error(f"Failed to write info.json to {info_json_path}: {e}")
                return None  # Indicate failure

            # Overwrite latest.json - best effort
            try:
                with open(latest_json_path, 'w', encoding='utf-8') as f:
                    f.write(info_json)
                logger.info(f"Updated latest.json file: {latest_json_path}")
            except IOError as e:
                # Log a warning but don't fail the save if only latest.json fails
                logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")

            return info_json_path  # Return the path even if latest.json failed

        except Exception as e:
            logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
            return None  # Indicate failure
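
    # Resulting on-disk layout (illustrative names; timestamp and ids vary per run):
    #   <info_json_dir>/info_<video_id>_<account_id>_<unix_ts>.json
    #   <info_json_dir>/latest.json   (overwritten on every run)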

    def _extract_video_id(self, url):
        """Extracts a YouTube video ID from a URL (internal helper)."""
        if not url or not isinstance(url, str):
            logger.debug("URL is empty or not a string; cannot extract video ID.")
            return None
        try:
            # Basic extraction logic (can be extended for more URL types)
            video_id = None
            if 'youtube.com/watch?v=' in url:
                video_id = url.split('v=')[1].split('&')[0]
            elif 'youtu.be/' in url:
                video_id = url.split('youtu.be/')[1].split('?')[0]

            # Standard video IDs are 11 characters
            if video_id and len(video_id) >= 11:
                video_id = video_id[:11]  # Truncate to the standard ID length
                logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
                return video_id
            else:
                logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
                return None
        except Exception as e:
            logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
            return None
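
    # Illustrative behaviour of _extract_video_id (example URL patterns only):
    #   https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=10s  -> 'dQw4w9WgXcQ'
    #   https://youtu.be/dQw4w9WgXcQ?feature=shared        -> 'dQw4w9WgXcQ'
    #   https://example.com/not-youtube                    -> None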


# =============================================================================
# Python Callables for Tasks
# =============================================================================

def display_token_info(**context):
    """Displays token info from XCom, parses info.json, and logs example commands."""
    ti = context['task_instance']
    logger.info("Starting display_token_info task.")

    # Pull data from XCom
    info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
    socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
    ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')

    logger.info("\n=== Pulled Token Information from XCom ===")
    logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}")
    logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}")
    logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}")

    result = {
        'info_path': info_json_path,
        'proxy': socks_proxy,
        'ytdlp_command': ytdlp_command,
        'video_info': None,
        'commands': {},
        'error': None
    }

    if info_json_path and os.path.exists(info_json_path):
        logger.info(f"\n=== Processing Video Information from: {info_json_path} ===")
        try:
            with open(info_json_path, 'r', encoding='utf-8') as f:
                info = json.load(f)

            # Extract and log basic video info safely
            title = info.get('title', 'Unknown Title')
            uploader = info.get('uploader', 'Unknown Author')
            duration = info.get('duration_string', 'Unknown Length')
            upload_date_str = info.get('upload_date')  # Format: YYYYMMDD
            upload_date_formatted = 'Unknown Date'
            if upload_date_str:
                try:
                    # Validate the format before parsing
                    if len(upload_date_str) == 8 and upload_date_str.isdigit():
                        upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d')
                    else:
                        logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.")
                except ValueError:
                    logger.warning(f"Could not parse upload_date '{upload_date_str}'")

            result['video_info'] = {
                'title': title,
                'uploader': uploader,
                'upload_date': upload_date_formatted,
                'duration': duration
            }

            logger.info(f"Title: {title}")
            logger.info(f"Author: {uploader}")
            logger.info(f"Date: {upload_date_formatted}")
            logger.info(f"Length: {duration}")

            logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===")
            base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
            if socks_proxy:
                base_cmd += f" --proxy \"{socks_proxy}\""

            # Command to list formats
            format_cmd = f"{base_cmd} -F"
            result['commands']['format'] = format_cmd
            logger.info(f"List formats command: {format_cmd}")

            # Execute and log the format listing command
            # (os.popen keeps this simple; a subprocess-based alternative is
            # sketched after this function)
            logger.info("\n--- Executing Format List Command ---")
            try:
                logger.info(f"Running: {format_cmd}")
                format_output = os.popen(format_cmd).read()
                logger.info("--- Format List Output ---")
                logger.info(format_output)
                logger.info("--------------------------")
            except Exception as e:
                logger.error(f"Error executing format command: {e}")

            # Command to simulate a download
            simulate_cmd = f"{base_cmd} --simulate --verbose"  # verbose for more info
            result['commands']['simulate'] = simulate_cmd
            logger.info(f"Simulate download command: {simulate_cmd}")

            # Execute and log the simulation command
            logger.info("\n--- Executing Simulation Command ---")
            try:
                logger.info(f"Running: {simulate_cmd}")
                simulate_output = os.popen(simulate_cmd).read()
                logger.info("--- Simulation Output ---")
                logger.info(simulate_output)
                logger.info("-------------------------")
            except Exception as e:
                logger.error(f"Error executing simulation command: {e}")

            # Basic download command
            download_cmd = base_cmd
            result['commands']['download_base'] = download_cmd
            logger.info(f"Base download command (add format selection, output path): {download_cmd}")

            # Push the base download command to XCom for downstream use
            ti.xcom_push(key='download_cmd', value=download_cmd)
            logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}")

        except json.JSONDecodeError as e:
            error_msg = f"Failed to parse info.json file '{info_json_path}': {e}"
            logger.error(error_msg)
            result['error'] = error_msg
        except FileNotFoundError:
            error_msg = f"Info.json file not found at path: {info_json_path}"
            logger.error(error_msg)
            result['error'] = error_msg
        except Exception as e:
            error_msg = f"Error processing info.json file '{info_json_path}': {e}"
            logger.error(error_msg, exc_info=True)
            result['error'] = error_msg
    elif info_json_path:
        error_msg = f"Info.json path provided ('{info_json_path}') but the file does not exist."
        logger.warning(error_msg)
        result['error'] = error_msg
    else:
        logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.")
        result['error'] = "Info.json path not available."

    logger.info("Finished display_token_info task.")
    # Return the collected information (pushed to XCom as 'return_value')
    return json.dumps(result)  # JSON string for XCom compatibility
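
# A safer alternative to os.popen for the command execution above — a sketch,
# assuming yt-dlp is available on the worker's PATH:
#
#   import subprocess
#   proc = subprocess.run(format_cmd, shell=True, capture_output=True,
#                         text=True, timeout=300)
#   logger.info(proc.stdout)
#   if proc.returncode != 0:
#       logger.warning(proc.stderr)
#
# subprocess.run captures stderr and the exit code, which os.popen discards.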


def store_token_info(**context):
    """Stores retrieved token information (command, proxy, info.json) in Redis."""
    ti = context['task_instance']
    # Use the redis_conn_id from DAG params if present, else the default
    redis_conn_id = context['params'].get('redis_conn_id', 'redis_default')
    redis_hook = RedisHook(redis_conn_id=redis_conn_id)
    logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.")

    try:
        # Pull necessary data from XCom and context
        url = context['params'].get('url')
        if not url:
            # Fall back to the DAG run conf
            dag_run = context.get('dag_run')
            url = dag_run.conf.get('url') if dag_run and dag_run.conf else None
            if not url:
                raise ValueError("URL parameter is missing in context['params'] and dag_run.conf")
            logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.")

        ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
        socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or ''  # Default to empty string
        info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')

        if not ytdlp_command:
            logger.warning("ytdlp_command not found in XCom. Storing empty value.")
            ytdlp_command = ''

        # Construct the base command using info.json
        ytdlp_command_base = ''
        if info_json_path and os.path.exists(info_json_path):
            ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\""
            logger.info(f"Constructed base command: {ytdlp_command_base}")
        else:
            logger.warning("Cannot construct base command: info_json_path not valid.")

        # Construct the command with tokens and proxy
        ytdlp_command_tokens = ytdlp_command  # Start with the original command from the server
        if socks_proxy:
            ytdlp_command_tokens += f" --proxy \"{socks_proxy}\""
            logger.info("Appended proxy to token command.")

        data_to_store = {
            'url': url,
            'ytdlp_command': ytdlp_command_base,  # The base command
            'proxy': socks_proxy,
            'info_json_path': info_json_path or ''
        }

        # Read the info.json content if the path exists
        if info_json_path and os.path.exists(info_json_path):
            try:
                with open(info_json_path, 'r', encoding='utf-8') as f:
                    # Validate the JSON structure before storing
                    info_json_content = json.load(f)
                data_to_store['info_json'] = json.dumps(info_json_content)
                logger.info(f"Read and validated info.json content from: {info_json_path}")
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.")
                data_to_store['info_json'] = ''
            except Exception as e:
                logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.")
                data_to_store['info_json'] = ''
        else:
            logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.")
            data_to_store['info_json'] = ''

        # Determine the Redis key from the video ID (same basic logic as the
        # operator's _extract_video_id helper, re-implemented here for simplicity)
        video_id = None
        try:
            if 'youtube.com/watch?v=' in url:
                video_id = url.split('v=')[1].split('&')[0][:11]
            elif 'youtu.be/' in url:
                video_id = url.split('youtu.be/')[1].split('?')[0][:11]
        except Exception:
            pass  # Ignore errors in ID extraction for key generation
        redis_key = f"token_info:{video_id or 'unknown'}"
        logger.info(f"Determined Redis key: {redis_key}")

        # Add video_id, timestamp, and the token command to the stored data
        data_to_store['video_id'] = video_id or 'unknown'
        data_to_store['timestamp'] = int(time.time())
        data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens

        # Log the fields being stored (summarize large values)
        log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
        logger.info(f"Storing in Redis key '{redis_key}': {log_data}")

        # Store the data in a Redis hash with a 24-hour expiration
        redis_client = redis_hook.get_conn()
        redis_client.hset(redis_key, mapping=data_to_store)
        redis_client.expire(redis_key, 86400)
        logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.")

    except Exception as e:
        logger.error(f"Failed to store token info in Redis: {e}", exc_info=True)
        # Re-raise as AirflowException to fail the task
        raise AirflowException(f"Failed to store token info in Redis: {e}")

    logger.info("Finished store_token_info task.")
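
# Consumer-side sketch (assumes the same Redis connection and key layout as above):
#
#   client = RedisHook(redis_conn_id='redis_default').get_conn()
#   stored = client.hgetall('token_info:<video_id>')  # keys/values are bytes by default
#   command = stored.get(b'ytdlp_command_tokens', b'').decode()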


# =============================================================================
# DAG Definition
# =============================================================================

# default_args match the structure of ytdlp_client_dag.py
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,  # Match the reference DAG
    'email_on_retry': False,    # Match the reference DAG
    'retries': 1,  # Default task retries
    'retry_delay': timedelta(minutes=5),  # Standard task retry delay
    'start_date': days_ago(1)  # Best-practice start date
}

with DAG(
    dag_id='ytdlp_client_dag_v2.1',
    default_args=default_args,
    schedule_interval=None,  # Manually triggered DAG
    catchup=False,  # Don't backfill past schedules
    description='DAG for YTDLP operations using Thrift client (V2 - Refactored)',
    tags=['ytdlp', 'thrift', 'client', 'v2'],
    params={
        # DAG parameters with defaults and types for UI clarity
        'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"], description="Required: The video URL to process."),
        'redis_enabled': Param(False, type="boolean", description="Use Redis for service discovery? If False, uses service_ip/port."),
        'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."),
        'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."),
        'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string", description="Account ID for Redis lookup or direct call."),
        'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
        # Use an Airflow Variable for the downloads directory, matching the reference DAG
        'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.")
    }
) as dag:

    # Define Tasks

    get_token = YtdlpOpsOperator(
        task_id='get_token',
        # Templated parameters resolved from the DAG run config
        url="{{ params.url }}",
        # redis_enabled is not a template field, so a Jinja string would never be
        # rendered; execute() resolves the actual value from params at runtime.
        redis_enabled=False,
        service_ip="{{ params.service_ip }}",
        service_port="{{ params.service_port }}",
        account_id="{{ params.account_id }}",
        save_info_json=True,
        info_json_dir="{{ params.info_json_dir }}",
        get_socks_proxy=True,
        store_socks_proxy=True,
        timeout="{{ params.timeout }}",
        retries=MAX_RETRIES,  # Operator-specific retries; otherwise the DAG default applies
        retry_delay=RETRY_DELAY,  # Operator-specific delay
        # Callbacks for logging success/failure, as in the reference DAG
        on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."),
        on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.")
    )

    # Task documentation (visible in the Airflow UI)
    get_token.doc_md = """
    ### Get Token Task
    Connects to the YTDLP Thrift service (either directly or via Redis discovery)
    to retrieve an authentication token and video metadata (info.json).

    **Pushes to XCom:**
    - `info_json_path`: Path to the saved info.json file (or None if not saved/failed).
    - `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found).
    - `ytdlp_command`: The original command string received from the server (contains tokens/cookies).

    Uses parameters defined in the DAG run configuration.
    """
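
    # Sketch of a possible downstream consumer (not wired into this DAG): a
    # BashOperator could run the retrieved command directly, e.g.
    #
    #   from airflow.operators.bash import BashOperator
    #   run_download = BashOperator(
    #       task_id='run_download',
    #       bash_command="{{ ti.xcom_pull(task_ids='get_token', key='ytdlp_command') }} --simulate",
    #   )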

    # Optional: explicitly check XComs for debugging (as in the reference DAG)
    def _check_xcom_callable(**context):
        """Logs the XCom values pushed by the get_token task."""
        ti = context['task_instance']
        logger.info("--- Checking XCom values pushed by get_token ---")
        keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command']
        xcom_values = {}
        for key in keys_to_check:
            value = ti.xcom_pull(task_ids='get_token', key=key)
            xcom_values[key] = value
            # Avoid logging potentially sensitive command details in full
            if key == 'ytdlp_command' and value:
                log_value = f"{value[:50]}..."  # Truncate the command
            else:
                log_value = value
            logger.info(f"XCom key='{key}': {log_value}")
        logger.info("----------------------------------------------")
        return xcom_values  # Returned for potential future use

    check_xcom_task = PythonOperator(
        task_id='check_xcom_after_get_token',
        python_callable=_check_xcom_callable,
    )
    check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes."

    display_info = PythonOperator(
        task_id='display_token_info',
        python_callable=display_token_info,
        trigger_rule='all_success'
    )
    display_info.doc_md = """
    ### Display Token Info Task
    Pulls information from XCom, parses the `info.json` file (if available),
    logs video details, and generates example `yt-dlp` commands.

    **Pulls from XCom (task_id='get_token'):**
    - `info_json_path`
    - `socks_proxy`
    - `ytdlp_command`

    **Pushes to XCom:**
    - `download_cmd`: Base command using `--load-info-json` (add format/output options as needed).
    """

    store_info = PythonOperator(
        task_id='store_token_info',  # Consistent task ID naming
        python_callable=store_token_info,
    )
    store_info.doc_md = """
    ### Store Token Info Task
    Pulls information from XCom and DAG parameters, reads the `info.json` content,
    and stores the relevant data in a Redis hash.

    **Pulls from XCom (task_id='get_token'):**
    - `ytdlp_command`
    - `socks_proxy`
    - `info_json_path`

    **Pulls from DAG context:**
    - `params['url']` (or `dag_run.conf['url']`)

    **Stores in Redis Hash (key: `token_info:<video_id>`):**
    - `url`: The video URL.
    - `ytdlp_command`: Base command using `--load-info-json`.
    - `proxy`: The SOCKS proxy string.
    - `info_json_path`: Path to the saved info.json file.
    - `info_json`: The full content of the info.json file (as a JSON string).
    - `video_id`: Extracted video ID.
    - `timestamp`: Unix timestamp of storage.
    - `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies).

    Sets a 24-hour expiration on the Redis key.
    """

    # Define task dependencies, matching the reference DAG structure
    get_token >> check_xcom_task >> display_info >> store_info
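
# Example manual trigger with a config override (values are placeholders):
#
#   airflow dags trigger ytdlp_client_dag_v2.1 \
#       --conf '{"url": "https://www.youtube.com/watch?v=<id>", "redis_enabled": true, "account_id": "<account>"}'
#
# With the default core.dag_run_conf_overrides_params=True, these conf keys
# override the matching Params declared above for that run.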