# yt-dlp-dags/dags/ytdlp_client_dag_v2.1.py


from airflow import DAG
from airflow.models import BaseOperator, Variable
from airflow.utils.decorators import apply_defaults
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from airflow.utils.dates import days_ago
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from thrift.transport.TTransport import TTransportException
from datetime import datetime, timedelta
from pangramia.yt.exceptions.ttypes import PBServiceException
import redis
import logging
import time
import socket
import json
import os
from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.common.ttypes import TokenUpdateMode
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.operators.python import PythonOperator
from airflow.models.param import Param
# Assuming ytdlp_utils exists in the same directory or PYTHONPATH
# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id
# Configure logging
logger = logging.getLogger(__name__)
# Default settings (similar to ytdlp_client_dag.py)
MAX_RETRIES = 1
RETRY_DELAY = timedelta(seconds=10)
DEFAULT_TIMEOUT = 30
class YtdlpOpsOperator(BaseOperator):
"""
Custom Airflow operator to interact with YTDLP Thrift service. Handles direct connections
and Redis-based discovery, retrieves tokens, saves info.json, and manages errors.
"""
template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')
@apply_defaults
def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10,
service_ip=None, service_port=None, redis_enabled=False, account_id=None,
save_info_json=True, info_json_dir=None, get_socks_proxy=True,
store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
super().__init__(*args, **kwargs)
logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, "
f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, "
f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, "
f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}")
# Validate required parameters
if not url:
raise ValueError("url is required")
# Validate parameters based on connection mode
if redis_enabled:
if not account_id:
raise ValueError("account_id is required when redis_enabled=True")
# Use default Redis connection if not specified
if not redis_conn_id:
redis_conn_id = 'redis_default'
logger.info(f"Using default Redis connection ID: {redis_conn_id}")
else:
if not service_ip or not service_port:
raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False")
if not account_id:
logger.warning("No account_id provided for direct connection mode. Using 'default'")
account_id = 'default' # Assign default if missing in direct mode
self.url = url
self.redis_conn_id = redis_conn_id
self.max_retries = max_retries
self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay)
self.service_ip = service_ip
self.service_port = service_port
self.redis_enabled = redis_enabled
self.account_id = account_id
self.save_info_json = save_info_json
self.info_json_dir = info_json_dir
self.get_socks_proxy = get_socks_proxy
self.store_socks_proxy = store_socks_proxy
self.timeout = timeout
def execute(self, context):
logger.info("Executing YtdlpOpsOperator")
transport = None
try:
logger.info("Getting task parameters")
params = context.get('params', {})
redis_enabled = params.get('redis_enabled', self.redis_enabled)
logger.info(f"Using redis_enabled={redis_enabled} (from {'task params' if 'redis_enabled' in params else 'operator init'})")
# Determine account_id to use (from params or operator default)
account_id = params.get('account_id', self.account_id)
logger.info(f"Using account_id='{account_id}' (from {'task params' if 'account_id' in params else 'operator init'})")
if redis_enabled:
# Get Redis connection with proper authentication and error handling
redis_conn = BaseHook.get_connection(self.redis_conn_id)
redis_client = redis.Redis(
host=redis_conn.host,
port=redis_conn.port,
password=redis_conn.password,
db=0,
decode_responses=True # Important for consistent key handling
)
# Test Redis connection
try:
if not redis_client.ping():
raise redis.exceptions.ConnectionError("Redis ping failed")
logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}")
except redis.exceptions.AuthenticationError:
logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. Check password.")
raise AirflowException("Redis authentication failed.")
except redis.exceptions.ConnectionError as e:
logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}")
raise AirflowException(f"Redis connection failed: {e}")
except Exception as e:
logger.error(f"Unexpected Redis error: {str(e)}")
raise AirflowException(f"Unexpected Redis error: {e}")
# Get service details from Redis with retries and proper key handling
service_key = f"ytdlp:{account_id}"
legacy_key = account_id # For backward compatibility
host = None
port = None
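# Illustrative only: the discovery entry is expected to be a Redis hash whose
# fields include 'ip' and 'port' (matched case-insensitively below). A service
# could register itself with something roughly like:
#   HSET ytdlp:<account_id> ip 10.0.0.5 port 9090
# The actual registration mechanism lives in the YTDLP service, not in this DAG.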
for attempt in range(self.max_retries):
try:
logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
service_details = redis_client.hgetall(service_key)
if not service_details:
logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
service_details = redis_client.hgetall(legacy_key)
if not service_details:
raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")
# Find IP and port, handling potential case differences and byte/string types
ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
port_key = next((k for k in service_details if k.lower() == 'port'), None)
if not ip_key: raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
if not port_key: raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")
host = service_details[ip_key] # Already decoded due to decode_responses=True
port_str = service_details[port_key]
try:
port = int(port_str)
except ValueError:
raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")
logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
break # Success
except Exception as e:
logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {str(e)}")
if attempt == self.max_retries - 1:
logger.error("Max retries reached for fetching Redis details.")
raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}")
logger.info(f"Retrying in {self.retry_delay} seconds...")
time.sleep(self.retry_delay)
else:
# Direct connection: Log parameter sources
params = context.get('params', {})
host = params.get('service_ip', self.service_ip)
host_source = 'task params' if 'service_ip' in params else 'operator init'
port_str = params.get('service_port', self.service_port)
port_source = 'task params' if 'service_port' in params else 'operator init'
url = params.get('url', self.url)
url_source = 'task params' if 'url' in params else 'operator init'
logger.info(f"Using service_ip={host} (from {host_source})")
logger.info(f"Using service_port={port_str} (from {port_source})")
logger.info(f"Using url={url} (from {url_source})")
if not host or not port_str:
raise ValueError("Direct connection requires service_ip and service_port")
try:
port = int(port_str)
except ValueError:
raise ValueError(f"Invalid service_port value: {port_str}")
logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")
# Render and validate timeout
timeout_param = context.get('params', {}).get('timeout', self.timeout)
if isinstance(self.timeout, str) and '{{' in self.timeout:
timeout_rendered = self.render_template(self.timeout, context)
logger.info(f"Rendered timeout template: '{self.timeout}' -> '{timeout_rendered}'")
timeout_param = timeout_rendered
try:
timeout = int(timeout_param)
if timeout <= 0: raise ValueError("Timeout must be positive")
logger.info(f"Using timeout: {timeout} seconds")
except (ValueError, TypeError):
logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
timeout = DEFAULT_TIMEOUT
# Create Thrift connection objects
socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET) # Explicitly use AF_INET (IPv4)
socket_conn.setTimeout(timeout * 1000) # Thrift timeout is in milliseconds
transport = TTransport.TFramedTransport(socket_conn)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = YTTokenOpService.Client(protocol)
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
try:
transport.open()
logger.info("Successfully connected to Thrift server.")
# Test connection with ping
try:
client.ping()
logger.info("Server ping successful.")
except Exception as e:
logger.error(f"Server ping failed: {e}")
raise AirflowException(f"Server connection test (ping) failed: {e}")
# Get token from service with specific error handling
try:
url_param = context.get('params', {}).get('url', self.url)
logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
token_data = client.getOrRefreshToken(
accountId=account_id,
updateType=TokenUpdateMode.AUTO,
url=url_param
)
logger.info("Successfully retrieved token data from service.")
except PBServiceException as e:
logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}")
error_code = getattr(e, 'errorCode', None)
error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
# Handle specific known error codes
if error_code in [
"SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT",
"SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT",
"SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
]:
error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
elif error_code == "BOT_DETECTION":
error_msg = f"Bot detection triggered ({error_code}): {e.message}."
suggestions = getattr(e, 'context', {}).get('suggestions', [])
if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
elif error_code == "NODEJS_SCRIPT_ERROR":
error_msg = f"Node.js script error ({error_code}): {e.message}."
elif error_code == "NODEJS_TIMEOUT":
error_msg = f"Node.js timeout ({error_code}): {e.message}."
# Add more specific error handling as needed
raise AirflowException(error_msg)
except TTransportException as e:
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
raise AirflowException(f"Transport error during API call: {e}")
except Exception as e:
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
raise AirflowException(f"Unexpected error during API call: {e}")
except TTransportException as e:
# Handle connection-specific transport errors
if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
logger.error(f"Connection failed to {host}:{port}. Details: {e}")
logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
else:
logger.error(f"Thrift transport error during connection: {str(e)}")
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
except Exception as e:
logger.error(f"Unexpected error during connection or ping: {str(e)}")
raise # Re-raise other unexpected errors
# Log received token data attributes for debugging
logger.debug(f"Token data received. Attributes: {dir(token_data)}")
for attr in dir(token_data):
if not attr.startswith('__') and not callable(getattr(token_data, attr)): # Log non-callable attributes
value = getattr(token_data, attr)
if attr == 'infoJson' and value:
logger.debug(f"infoJson: {value[:50]}...")
else:
logger.debug(f"{attr}: {value}")
info_json_path = None # Initialize info_json_path
save_info_json_param = context['params'].get('save_info_json', self.save_info_json)
# Render if it's a string template
if isinstance(save_info_json_param, str):
save_info_json_rendered = self.render_template(save_info_json_param, context)
# Convert common string representations to boolean
save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
else:
save_info_json = bool(save_info_json_param)
# Save info.json if requested and valid
if save_info_json:
info_json = self._get_info_json(token_data)
if info_json and self._is_valid_json(info_json):
try:
# Use internal _save_info_json method which handles rendering, dir creation, logging
info_json_path = self._save_info_json(context, info_json)
if info_json_path: # Check if saving was successful
context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
else:
# _save_info_json should log errors, push None to indicate failure
context['task_instance'].xcom_push(key='info_json_path', value=None)
logger.warning("info.json saving failed (check logs from _save_info_json), pushing None to XCom for info_json_path.")
except Exception as e:
logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
context['task_instance'].xcom_push(key='info_json_path', value=None) # Push None on error
elif info_json:
logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
else:
logger.info("No infoJson found in token data. Skipping save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
else:
logger.info("save_info_json is False. Skipping info.json save.")
context['task_instance'].xcom_push(key='info_json_path', value=None)
# Extract and potentially store SOCKS proxy
socks_proxy = None
if self.get_socks_proxy: # Use instance attribute
# Check for common attribute names for proxy
proxy_attr = next((attr for attr in ['socks5Proxy', 'socksProxy', 'socks'] if hasattr(token_data, attr)), None)
if proxy_attr:
socks_proxy = getattr(token_data, proxy_attr)
if socks_proxy: # Ensure proxy value is not empty
logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
logger.info(f"Pushed key 'socks_proxy' to XCom with value: {socks_proxy}")
else:
logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")
else:
logger.info(f"Found proxy attribute '{proxy_attr}' but value is empty. No proxy extracted.")
# Push None even if found but empty, if storing is enabled
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as extracted value was empty.")
else:
logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
# Push None if storing is enabled but attribute not found
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as attribute was not found.")
else:
logger.info("get_socks_proxy is False. Skipping proxy extraction.")
# Push None if storing is enabled but extraction was skipped
if self.store_socks_proxy: # Use instance attribute
context['task_instance'].xcom_push(key='socks_proxy', value=None)
logger.info("Pushed None to XCom for 'socks_proxy' as get_socks_proxy=False.")
# Get the original command from the server
ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
if not ytdlp_cmd:
logger.error("No 'ytdlpCommand' attribute found in token data.")
raise AirflowException("Required 'ytdlpCommand' not received from service.")
logger.info(f"Original command received from server: {ytdlp_cmd}")
# Log example usage command (DO NOT MODIFY the original command here)
if info_json_path:
# Use double quotes for paths/proxy in example for robustness
example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
if socks_proxy:
example_cmd += f" --proxy \"{socks_proxy}\""
example_cmd += " --verbose --simulate" # Add useful flags for testing
logger.info(f"\n--- Example usage with saved info.json ---")
logger.info(example_cmd)
logger.info(f"(Note: The actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
logger.info(f"(You can also use 'latest.json': {latest_json_path})")
logger.info(f"-------------------------------------------\n")
else:
logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
if socks_proxy:
logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.")
logger.info("Add --verbose and --simulate flags for testing the command.")
logger.info(f"-------------------------------------------------------\n")
# Push the *original* command to XCom
context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
logger.info(f"Pushed original command to XCom key 'ytdlp_command'.")
# Note: Returning ytdlp_cmd below implicitly pushes the same value
# to XCom under the key 'return_value'. Downstream tasks should
# preferably use the explicitly pushed 'ytdlp_command' key for clarity.
return ytdlp_cmd # Return the original command
except AirflowException as e: # Catch AirflowExceptions raised explicitly in the code above
logger.error(f"Operation failed due to AirflowException: {e}")
raise # Re-raise AirflowExceptions to ensure task failure
except (TTransportException, PBServiceException) as e: # Catch specific Thrift/Service errors not already wrapped
logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True) # Add traceback for context
raise AirflowException(f"Unhandled YTDLP service error: {e}") # Wrap in AirflowException
except Exception as e: # General catch-all for truly unexpected errors
# Log with traceback for unexpected errors
logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True)
# Ensure any unexpected error explicitly fails the task with AirflowException
raise AirflowException(f"Unexpected error caused task failure: {e}")
finally:
if transport and transport.isOpen(): # Check if transport exists and is open before closing
logger.info("Closing Thrift transport.")
transport.close()
# --- Helper Methods ---
def _get_info_json(self, token_data):
"""Safely extracts infoJson from token data."""
info_json = getattr(token_data, 'infoJson', None)
if info_json:
logger.debug("Extracted infoJson from token data.")
else:
logger.debug("No infoJson attribute found in token data.")
return info_json
def _is_valid_json(self, json_str):
"""Checks if a string is valid JSON."""
if not json_str or not isinstance(json_str, str):
logger.debug("Input is not a non-empty string, considered invalid JSON.")
return False
try:
json.loads(json_str)
logger.debug("JSON string validation successful.")
return True
except json.JSONDecodeError as e:
logger.warning(f"JSON validation failed: {e}")
return False
def _save_info_json(self, context, info_json):
"""Saves info_json to a file, handling directory creation and logging. Returns the path on success, None on failure."""
try:
# Get URL from params/context for video ID extraction
url_param = context.get('params', {}).get('url', self.url)
video_id = self._extract_video_id(url_param) # Use internal helper
# Render the info_json_dir template
save_dir_template = self.info_json_dir or "." # Default to current dir if template is None or empty string
save_dir = self.render_template(save_dir_template, context)
if not save_dir: # Handle case where template renders to empty string
logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in an empty path. Defaulting to '.'")
save_dir = "."
logger.info(f"Target directory for info.json (rendered): {save_dir}")
# Ensure directory exists
try:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"Ensured directory exists: {save_dir}")
except OSError as e:
logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
return None # Indicate failure
# Construct filename (using potentially overridden account_id)
account_id_param = context.get('params', {}).get('account_id', self.account_id)
timestamp = int(time.time())
base_filename = f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id else f"info_{account_id_param}_{timestamp}.json"
info_json_path = os.path.join(save_dir, base_filename)
latest_json_path = os.path.join(save_dir, "latest.json") # Path for the latest symlink/copy
# Write to timestamped file
try:
logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
with open(info_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
except IOError as e:
logger.error(f"Failed to write info.json to {info_json_path}: {e}")
return None # Indicate failure
# Write to latest.json (overwrite) - best effort
try:
with open(latest_json_path, 'w', encoding='utf-8') as f:
f.write(info_json)
logger.info(f"Updated latest.json file: {latest_json_path}")
except IOError as e:
# Log warning but don't fail the whole save if only latest.json fails
logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")
return info_json_path # Return path on success (even if latest.json failed)
except Exception as e:
logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
return None # Indicate failure
def _extract_video_id(self, url):
"""Extracts YouTube video ID from URL (internal helper)."""
if not url or not isinstance(url, str):
logger.debug("URL is empty or not a string, cannot extract video ID.")
return None
try:
# Basic extraction logic (can be enhanced for more URL types)
video_id = None
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0]
# Ensure it looks like a video ID (typically 11 chars, but can vary)
if video_id and len(video_id) >= 11:
video_id = video_id[:11] # Take first 11 chars as standard ID length
logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
return video_id
else:
logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
return None
except Exception as e:
logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
return None
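# --- Illustrative usage (not part of the DAG defined below) ---
# A minimal sketch of instantiating the operator for a direct (non-Redis)
# connection; the endpoint and account values are placeholders:
#
#   get_token_direct = YtdlpOpsOperator(
#       task_id='get_token_direct',
#       url='https://www.youtube.com/watch?v=<VIDEO_ID>',
#       redis_enabled=False,
#       service_ip='10.0.0.5',
#       service_port=9090,
#       account_id='my_account',
#       save_info_json=True,
#       info_json_dir='/tmp/info_json',
#   )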
# =============================================================================
# Python Callables for Tasks
# =============================================================================
def display_token_info(**context):
"""Displays token info from XCom, parses info.json, and logs example commands."""
ti = context['task_instance']
logger.info("Starting display_token_info task.")
# Pull data from XCom (provide default values)
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
logger.info("\n=== Pulled Token Information from XCom ===")
logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}")
logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}")
logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}")
result = {
'info_path': info_json_path,
'proxy': socks_proxy,
'ytdlp_command': ytdlp_command,
'video_info': None,
'commands': {},
'error': None
}
if info_json_path and os.path.exists(info_json_path):
logger.info(f"\n=== Processing Video Information from: {info_json_path} ===")
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
info = json.load(f)
# Extract and log basic video info safely
title = info.get('title', 'Unknown Title')
uploader = info.get('uploader', 'Unknown Author')
duration = info.get('duration_string', 'Unknown Length')
upload_date_str = info.get('upload_date') # Format: YYYYMMDD
upload_date_formatted = 'Unknown Date'
if upload_date_str:
try:
# Validate format before parsing
if len(upload_date_str) == 8 and upload_date_str.isdigit():
upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d')
else:
logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.")
except ValueError:
logger.warning(f"Could not parse upload_date '{upload_date_str}'")
result['video_info'] = {
'title': title,
'uploader': uploader,
'upload_date': upload_date_formatted, # Store formatted date
'duration': duration
}
logger.info(f"Title: {title}")
logger.info(f"Author: {uploader}")
logger.info(f"Date: {upload_date_formatted}")
logger.info(f"Length: {duration}")
logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===")
base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
if socks_proxy:
base_cmd += f" --proxy \"{socks_proxy}\""
# Command to list formats
format_cmd = f"{base_cmd} -F"
result['commands']['format'] = format_cmd
logger.info(f"List formats command: {format_cmd}")
# Execute and log the format listing command
logger.info("\n--- Executing Format List Command ---")
try:
# Use os.popen for simplicity, capture output
logger.info(f"Running: {format_cmd}")
format_output = os.popen(format_cmd).read()
logger.info("--- Format List Output ---")
logger.info(format_output)
logger.info("--------------------------")
except Exception as e:
logger.error(f"Error executing format command: {e}")
# Command to simulate download
simulate_cmd = f"{base_cmd} --simulate --verbose" # Add verbose for more info
result['commands']['simulate'] = simulate_cmd
logger.info(f"Simulate download command: {simulate_cmd}")
# Execute and log the simulation command
logger.info("\n--- Executing Simulation Command ---")
try:
logger.info(f"Running: {simulate_cmd}")
simulate_output = os.popen(simulate_cmd).read()
logger.info("--- Simulation Output ---")
logger.info(simulate_output)
logger.info("-------------------------")
except Exception as e:
logger.error(f"Error executing simulation command: {e}")
# Basic download command
download_cmd = base_cmd
result['commands']['download_base'] = download_cmd
logger.info(f"Base download command (add format selection, output path): {download_cmd}")
# Push generated example commands to XCom for potential downstream use
# ti.xcom_push(key='format_cmd', value=format_cmd) # Removed as requested
# ti.xcom_push(key='simulate_cmd', value=simulate_cmd) # Removed as requested
ti.xcom_push(key='download_cmd', value=download_cmd)
logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}")
except json.JSONDecodeError as e:
error_msg = f"Failed to parse info.json file '{info_json_path}': {e}"
logger.error(error_msg)
result['error'] = error_msg
except FileNotFoundError:
error_msg = f"Info.json file not found at path: {info_json_path}"
logger.error(error_msg)
result['error'] = error_msg
except Exception as e:
error_msg = f"Error processing info.json file '{info_json_path}': {str(e)}"
logger.error(error_msg, exc_info=True)
result['error'] = error_msg
elif info_json_path:
error_msg = f"Info.json path provided ('{info_json_path}') but file does not exist."
logger.warning(error_msg)
result['error'] = error_msg
else:
logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.")
result['error'] = "Info.json path not available."
logger.info("Finished display_token_info task.")
# Return the collected information (useful if used as a PythonOperator return value)
return json.dumps(result) # Return as JSON string for XCom compatibility if needed
def store_token_info(**context):
"""Stores retrieved token information (command, proxy, info.json) in Redis."""
ti = context['task_instance']
# Use the redis_conn_id defined in the operator/DAG params if possible, else default
redis_conn_id = context['params'].get('redis_conn_id', 'redis_default')
redis_hook = RedisHook(redis_conn_id=redis_conn_id)
logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.")
try:
# Pull necessary data from XCom and context
url = context['params'].get('url')
if not url:
# Attempt to get URL from DAG run conf as fallback
dag_run = context.get('dag_run')
url = dag_run.conf.get('url') if dag_run and dag_run.conf else None
if not url:
raise ValueError("URL parameter is missing in context['params'] and dag_run.conf")
logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.")
ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or '' # Default to empty string if None
info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
if not ytdlp_command:
logger.warning("ytdlp_command not found in XCom. Storing empty value.")
ytdlp_command = '' # Store empty if not found
# Construct the base command using info.json
ytdlp_command_base = ''
if info_json_path and os.path.exists(info_json_path):
ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\""
logger.info(f"Constructed base command: {ytdlp_command_base}")
else:
logger.warning("Cannot construct base command: info_json_path not valid.")
# Construct the command with tokens and proxy
ytdlp_command_tokens = ytdlp_command # Start with original command from server
if socks_proxy:
ytdlp_command_tokens += f" --proxy \"{socks_proxy}\""
logger.info("Appended proxy to token command.")
data_to_store = {
'url': url,
'ytdlp_command': ytdlp_command_base, # Store the base command
'proxy': socks_proxy,
'info_json_path': info_json_path or '' # Store path even if None/empty
# 'info_json' will be added below
}
# Read info.json content if path exists
info_json_content = None
if info_json_path and os.path.exists(info_json_path):
try:
with open(info_json_path, 'r', encoding='utf-8') as f:
# Read and immediately validate JSON structure before storing
info_json_content = json.load(f)
# Store the validated JSON as a string
data_to_store['info_json'] = json.dumps(info_json_content)
logger.info(f"Read and validated info.json content from: {info_json_path}")
except json.JSONDecodeError as e:
logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.")
data_to_store['info_json'] = '' # Store empty string on parse error
except Exception as e:
logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.")
data_to_store['info_json'] = '' # Store empty string on other read errors
else:
logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.")
data_to_store['info_json'] = '' # Store empty string if no path
# Determine the Redis key from the video ID embedded in the URL.
# Basic extraction is re-implemented here (mirroring YtdlpOpsOperator._extract_video_id)
# to avoid instantiating the operator just for this helper.
video_id = None
try:
if 'youtube.com/watch?v=' in url:
video_id = url.split('v=')[1].split('&')[0][:11]
elif 'youtu.be/' in url:
video_id = url.split('youtu.be/')[1].split('?')[0][:11]
except Exception:
pass # Ignore errors in ID extraction for key generation
redis_key = f"token_info:{video_id or 'unknown'}"
logger.info(f"Determined Redis key: {redis_key}")
# Log presence/absence rather than full content for potentially large fields
logger.info(f"Data to store in Redis key '{redis_key}': "
f"URL='{data_to_store['url']}', "
f"Command={'<present>' if data_to_store['ytdlp_command'] else '<empty>'}, "
f"Proxy='{data_to_store['proxy'] or '<empty>'}', "
f"Path='{data_to_store['info_json_path'] or '<empty>'}', "
f"JSON Content={'<present>' if data_to_store.get('info_json') else '<empty>'}")
with redis_hook.get_conn() as redis_client:
# Store data in Redis hash
# Add video_id, timestamp, and the constructed ytdlp_command_tokens
data_to_store['video_id'] = video_id or 'unknown'
data_to_store['timestamp'] = int(time.time())
data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens # Store the original token command
# Log fields being stored
log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
logger.info(f"Storing in Redis key '{redis_key}': {log_data}")
redis_client.hset(redis_key, mapping=data_to_store)
# Set expiration (e.g., 24 hours = 86400 seconds)
redis_client.expire(redis_key, 86400)
logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.")
# Log the final stored data again for clarity
final_log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
logger.info(f"--- Final Data Stored in Redis Key '{redis_key}' ---")
logger.info(final_log_data)
logger.info("----------------------------------------------------")
except Exception as e:
logger.error(f"Failed to store token info in Redis: {e}", exc_info=True)
# Re-raise as AirflowException to fail the task
raise AirflowException(f"Failed to store token info in Redis: {e}")
logger.info("Finished store_token_info task.")
# =============================================================================
# DAG Definition
# =============================================================================
# Update default_args to match ytdlp_client_dag.py structure
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False, # Match reference DAG
'email_on_retry': False, # Match reference DAG
'retries': 1, # Default task retries
'retry_delay': timedelta(minutes=5), # Standard task retry delay
'start_date': days_ago(1) # Best practice start date
}
# Update DAG definition
with DAG(
dag_id='ytdlp_client_dag_v2.1',
default_args=default_args,
schedule_interval=None, # Manually triggered DAG
catchup=False, # Don't run for past missed schedules
description='DAG for YTDLP operations using Thrift client (V2 - Refactored)', # Updated description
tags=['ytdlp', 'thrift', 'client', 'v2'], # Updated tags for better filtering
params={
# Define DAG parameters with defaults and types for UI clarity
'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"], description="Required: The video URL to process."), # Default URL
'redis_enabled': Param(False, type="boolean", description="Use Redis for service discovery? If False, uses service_ip/port."), # Default to direct connection
'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."), # Default service IP
'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."), # Default service port
'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string", description="Account ID for Redis lookup or direct call."), # Updated default account_id
'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
# Use Airflow Variable for downloads directory, matching reference DAG structure
'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.")
}
) as dag:
# Define Tasks
get_token = YtdlpOpsOperator(
task_id='get_token',
# Pass templated parameters from DAG run config
url="{{ params.url }}",
redis_enabled="{{ params.redis_enabled }}",
service_ip="{{ params.service_ip }}",
service_port="{{ params.service_port }}",
account_id="{{ params.account_id }}",
save_info_json=True,
info_json_dir="{{ params.info_json_dir }}",
get_socks_proxy=True,
store_socks_proxy=True,
timeout="{{ params.timeout }}",
retries=MAX_RETRIES, # Operator-specific retries if needed, else use DAG default
retry_delay=RETRY_DELAY, # Operator-specific delay if needed
# Add callbacks for logging success/failure, similar to reference DAG
on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."),
on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.")
)
# Add task documentation (visible in Airflow UI)
get_token.doc_md = """
### Get Token Task
Connects to the YTDLP Thrift service (either directly or via Redis discovery)
to retrieve an authentication token and video metadata (info.json).
Uses the parameters defined in the DAG run configuration.
**Pushes to XCom:**
- `info_json_path`: Path to the saved info.json file (or None if not saved/failed).
- `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found).
- `ytdlp_command`: The original command string received from the server (contains tokens/cookies).
"""
# Optional: Add a task to explicitly check XComs for debugging (like in reference DAG)
def _check_xcom_callable(**context):
"""Logs XCom values pushed by the get_token task."""
ti = context['task_instance']
logger.info("--- Checking XCom values pushed by get_token ---")
keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command']
xcom_values = {}
for key in keys_to_check:
value = ti.xcom_pull(task_ids='get_token', key=key)
xcom_values[key] = value
# Avoid logging potentially sensitive command details fully in production
if key == 'ytdlp_command' and value:
log_value = f"{value[:50]}..." # Log truncated command
else:
log_value = value
logger.info(f"XCom key='{key}': {log_value}")
logger.info("----------------------------------------------")
return xcom_values # Return values for potential future use
check_xcom_task = PythonOperator(
task_id='check_xcom_after_get_token',
python_callable=_check_xcom_callable,
)
check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes."
display_info = PythonOperator(
task_id='display_token_info',
python_callable=display_token_info,
trigger_rule='all_success'
)
display_info.doc_md = """
### Display Token Info Task
Pulls information from XCom, parses the `info.json` file (if available),
logs video details, and generates example `yt-dlp` commands.
**Pulls from XCom (task_id='get_token'):**
- `info_json_path`
- `socks_proxy`
- `ytdlp_command`
**Pushes to XCom:**
- `download_cmd`: Base command using `--load-info-json` (user needs to add format/output).
"""
store_info = PythonOperator(
task_id='store_token_info', # Use consistent task ID naming
python_callable=store_token_info,
)
store_info.doc_md = """
### Store Token Info Task
Pulls information from XCom and DAG parameters, reads the `info.json` content,
and stores relevant data in a Redis hash.
**Pulls from XCom (task_id='get_token'):**
- `ytdlp_command`
- `socks_proxy`
- `info_json_path`
**Pulls from DAG context:**
- `params['url']` (or `dag_run.conf['url']`)
**Stores in Redis Hash (key: `token_info:<video_id>`):**
- `url`: The video URL.
- `ytdlp_command`: Base command using `--load-info-json`.
- `proxy`: The SOCKS proxy string.
- `info_json_path`: Path to the saved info.json file.
- `info_json`: The full content of the info.json file (as a JSON string).
- `video_id`: Extracted video ID.
- `timestamp`: Unix timestamp of storage.
- `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies).
Sets a 24-hour expiration on the Redis key.
"""
# Define task dependencies matching the reference DAG structure
get_token >> check_xcom_task >> display_info >> store_info
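# --- Illustrative trigger example (assumes the Airflow CLI is available) ---
# The DAG is manually triggered; its parameters can be overridden via --conf.
# Values below are placeholders, not real endpoints:
#   airflow dags trigger ytdlp_client_dag_v2.1 --conf '{
#       "url": "https://www.youtube.com/watch?v=<VIDEO_ID>",
#       "redis_enabled": false,
#       "service_ip": "10.0.0.5",
#       "service_port": 9090,
#       "account_id": "my_account",
#       "timeout": 30
#   }'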