from airflow import DAG
from airflow.models import BaseOperator
from airflow.models.param import Param
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
from airflow.operators.python import PythonOperator
from airflow.providers.redis.hooks.redis import RedisHook
from airflow.utils.dates import days_ago

from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from thrift.transport.TTransport import TTransportException

from datetime import datetime, timedelta
import redis
import logging
import time
import socket
import json
import os

from pangramia.yt.tokens_ops import YTTokenOpService
from pangramia.yt.common.ttypes import TokenUpdateMode
from pangramia.yt.exceptions.ttypes import PBServiceException

# Assuming ytdlp_utils exists in the same directory or PYTHONPATH
# from ytdlp_utils import get_info_json, is_valid_json, extract_video_id

# Configure logging
logger = logging.getLogger(__name__)

# Default settings (similar to ytdlp_client_dag.py)
MAX_RETRIES = 1
RETRY_DELAY = timedelta(seconds=10)
DEFAULT_TIMEOUT = 30

class YtdlpOpsOperator(BaseOperator):
    """
    Custom Airflow operator to interact with the YTDLP Thrift service. Handles direct
    connections and Redis-based discovery, retrieves tokens, saves info.json, and
    manages errors.
    """
    template_fields = ('url', 'service_ip', 'service_port', 'account_id', 'timeout', 'info_json_dir')

    def __init__(self, url, redis_conn_id='redis_default', max_retries=3, retry_delay=10,
                 service_ip=None, service_port=None, redis_enabled=False, account_id=None,
                 save_info_json=True, info_json_dir=None, get_socks_proxy=True,
                 store_socks_proxy=False, timeout=DEFAULT_TIMEOUT, *args, **kwargs):
        super().__init__(*args, **kwargs)

        logger.info(f"Initializing YtdlpOpsOperator with parameters: url={url}, "
                    f"redis_conn_id={redis_conn_id}, max_retries={max_retries}, retry_delay={retry_delay}, "
                    f"service_ip={service_ip}, service_port={service_port}, redis_enabled={redis_enabled}, "
                    f"account_id={account_id}, save_info_json={save_info_json}, info_json_dir={info_json_dir}, "
                    f"get_socks_proxy={get_socks_proxy}, store_socks_proxy={store_socks_proxy}, timeout={timeout}")

        # Validate required parameters
        if not url:
            raise ValueError("url is required")

        # Validate parameters based on the connection mode
        if redis_enabled:
            if not account_id:
                raise ValueError("account_id is required when redis_enabled=True")
            # Use the default Redis connection if none was specified
            if not redis_conn_id:
                redis_conn_id = 'redis_default'
                logger.info(f"Using default Redis connection ID: {redis_conn_id}")
        else:
            if not service_ip or not service_port:
                raise ValueError("Both service_ip and service_port must be specified when redis_enabled=False")
            if not account_id:
                logger.warning("No account_id provided for direct connection mode. Using 'default'")
                account_id = 'default'  # Assign a default if missing in direct mode

        self.url = url
        self.redis_conn_id = redis_conn_id
        self.max_retries = max_retries
        self.retry_delay = int(retry_delay.total_seconds() if isinstance(retry_delay, timedelta) else retry_delay)
        self.service_ip = service_ip
        self.service_port = service_port
        self.redis_enabled = redis_enabled
        self.account_id = account_id
        self.save_info_json = save_info_json
        self.info_json_dir = info_json_dir
        self.get_socks_proxy = get_socks_proxy
        self.store_socks_proxy = store_socks_proxy
        self.timeout = timeout

    def execute(self, context):
        logger.info("Executing YtdlpOpsOperator")
        transport = None
        try:
            logger.info("Getting task parameters")
            params = context.get('params', {})
            redis_enabled = params.get('redis_enabled', self.redis_enabled)
            logger.info(f"Using redis_enabled={redis_enabled} (from {'task params' if 'redis_enabled' in params else 'operator init'})")

            # Determine the account_id to use (from params or operator default)
            account_id = params.get('account_id', self.account_id)
            logger.info(f"Using account_id='{account_id}' (from {'task params' if 'account_id' in params else 'operator init'})")

            if redis_enabled:
                # Get the Redis connection with proper authentication and error handling
                redis_conn = BaseHook.get_connection(self.redis_conn_id)
                redis_client = redis.Redis(
                    host=redis_conn.host,
                    port=redis_conn.port,
                    password=redis_conn.password,
                    db=0,
                    decode_responses=True  # Important for consistent key handling
                )

                # Test the Redis connection
                try:
                    if not redis_client.ping():
                        raise redis.exceptions.ConnectionError("Redis ping failed")
                    logger.info(f"Successfully connected to Redis at {redis_conn.host}:{redis_conn.port}")
                except redis.exceptions.AuthenticationError:
                    logger.error(f"Redis authentication failed for connection '{self.redis_conn_id}'. Check password.")
                    raise AirflowException("Redis authentication failed.")
                except redis.exceptions.ConnectionError as e:
                    logger.error(f"Could not connect to Redis at {redis_conn.host}:{redis_conn.port}. Error: {e}")
                    raise AirflowException(f"Redis connection failed: {e}")
                except Exception as e:
                    logger.error(f"Unexpected Redis error: {e}")
                    raise AirflowException(f"Unexpected Redis error: {e}")
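
                # Illustrative sketch of the discovery contract assumed below: some
                # external process is expected to have registered the service as a
                # Redis hash keyed by account id (field names are matched
                # case-insensitively), e.g.
                #
                #   redis-cli HSET "ytdlp:<account_id>" ip 10.0.0.5 port 9090
                #
                # The legacy layout uses the bare account id as the key. Host and
                # port above are placeholders, not real endpoints.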

                # Get service details from Redis with retries and proper key handling
                service_key = f"ytdlp:{account_id}"
                legacy_key = account_id  # For backward compatibility

                host = None
                port = None
                for attempt in range(self.max_retries):
                    try:
                        logger.info(f"Attempt {attempt + 1}/{self.max_retries}: Fetching service details from Redis for keys: '{service_key}', '{legacy_key}'")
                        service_details = redis_client.hgetall(service_key)
                        if not service_details:
                            logger.warning(f"Key '{service_key}' not found, trying legacy key '{legacy_key}'")
                            service_details = redis_client.hgetall(legacy_key)

                        if not service_details:
                            raise ValueError(f"No service details found in Redis for keys: {service_key} or {legacy_key}")

                        # Find IP and port, tolerating case differences in field names
                        ip_key = next((k for k in service_details if k.lower() == 'ip'), None)
                        port_key = next((k for k in service_details if k.lower() == 'port'), None)

                        if not ip_key:
                            raise ValueError(f"'ip' key not found in Redis hash for {service_key}/{legacy_key}")
                        if not port_key:
                            raise ValueError(f"'port' key not found in Redis hash for {service_key}/{legacy_key}")

                        host = service_details[ip_key]  # Already decoded due to decode_responses=True
                        port_str = service_details[port_key]

                        try:
                            port = int(port_str)
                        except ValueError:
                            raise ValueError(f"Invalid port value '{port_str}' found in Redis for {service_key}/{legacy_key}")

                        logger.info(f"Extracted from Redis - Service IP: {host}, Service Port: {port}")
                        break  # Success

                    except Exception as e:
                        logger.warning(f"Attempt {attempt + 1} failed to get Redis details: {e}")
                        if attempt == self.max_retries - 1:
                            logger.error("Max retries reached for fetching Redis details.")
                            raise AirflowException(f"Failed to get service details from Redis after {self.max_retries} attempts: {e}")
                        logger.info(f"Retrying in {self.retry_delay} seconds...")
                        time.sleep(self.retry_delay)
            else:
                # Direct connection: log where each parameter came from
                host = params.get('service_ip', self.service_ip)
                host_source = 'task params' if 'service_ip' in params else 'operator init'
                port_str = params.get('service_port', self.service_port)
                port_source = 'task params' if 'service_port' in params else 'operator init'
                url = params.get('url', self.url)
                url_source = 'task params' if 'url' in params else 'operator init'

                logger.info(f"Using service_ip={host} (from {host_source})")
                logger.info(f"Using service_port={port_str} (from {port_source})")
                logger.info(f"Using url={url} (from {url_source})")

                if not host or not port_str:
                    raise ValueError("Direct connection requires service_ip and service_port")
                try:
                    port = int(port_str)
                except ValueError:
                    raise ValueError(f"Invalid service_port value: {port_str}")

                logger.info(f"Connecting directly to Thrift service at {host}:{port} (Redis bypassed)")

            # Resolve and validate the timeout ('timeout' is a template field,
            # so self.timeout is already rendered at this point)
            timeout_param = params.get('timeout', self.timeout)
            try:
                timeout = int(timeout_param)
                if timeout <= 0:
                    raise ValueError("Timeout must be positive")
                logger.info(f"Using timeout: {timeout} seconds")
            except (ValueError, TypeError):
                logger.warning(f"Invalid timeout value: '{timeout_param}'. Using default: {DEFAULT_TIMEOUT}")
                timeout = DEFAULT_TIMEOUT

            # Create Thrift connection objects
            socket_conn = TSocket.TSocket(host, port, socket_family=socket.AF_INET)  # Explicitly use IPv4
            socket_conn.setTimeout(timeout * 1000)  # Thrift timeout is in milliseconds
            transport = TTransport.TFramedTransport(socket_conn)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = YTTokenOpService.Client(protocol)
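
            # Note (an assumption based on typical Thrift deployments, not verified
            # against this service): TFramedTransport must match the server's
            # transport. A buffered-transport server tends to fail with errors like
            # "read 0 bytes", which the handler below treats as a connection failure.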
logger.info(f"Attempting to connect to Thrift server at {host}:{port}...")
|
|
try:
|
|
transport.open()
|
|
logger.info("Successfully connected to Thrift server.")
|
|
|
|
# Test connection with ping
|
|
try:
|
|
client.ping()
|
|
logger.info("Server ping successful.")
|
|
except Exception as e:
|
|
logger.error(f"Server ping failed: {e}")
|
|
raise AirflowException(f"Server connection test (ping) failed: {e}")
|
|
|
|
# Get token from service with specific error handling
|
|
try:
|
|
url_param = context.get('params', {}).get('url', self.url)
|
|
logger.info(f"Requesting token for accountId='{account_id}', url='{url_param}'")
|
|
token_data = client.getOrRefreshToken(
|
|
accountId=account_id,
|
|
updateType=TokenUpdateMode.AUTO,
|
|
url=url_param
|
|
)
|
|
logger.info("Successfully retrieved token data from service.")
|
|
except PBServiceException as e:
|
|
logger.error(f"PBServiceException occurred: Code={getattr(e, 'errorCode', 'N/A')}, Message={getattr(e, 'message', 'N/A')}")
|
|
error_code = getattr(e, 'errorCode', None)
|
|
error_msg = f"YTDLP service error: {getattr(e, 'message', str(e))}"
|
|
# Handle specific known error codes
|
|
if error_code in [
|
|
"SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT",
|
|
"SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT",
|
|
"SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"
|
|
]:
|
|
error_msg = f"SOCKS5 proxy error ({error_code}): {e.message}. Check proxy settings."
|
|
elif error_code == "BOT_DETECTION":
|
|
error_msg = f"Bot detection triggered ({error_code}): {e.message}."
|
|
suggestions = getattr(e, 'context', {}).get('suggestions', [])
|
|
if suggestions: error_msg += "\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
|
|
elif error_code == "NODEJS_SCRIPT_ERROR":
|
|
error_msg = f"Node.js script error ({error_code}): {e.message}."
|
|
elif error_code == "NODEJS_TIMEOUT":
|
|
error_msg = f"Node.js timeout ({error_code}): {e.message}."
|
|
# Add more specific error handling as needed
|
|
raise AirflowException(error_msg)
|
|
except TTransportException as e:
|
|
logger.error(f"Thrift transport error during getOrRefreshToken: {e}")
|
|
raise AirflowException(f"Transport error during API call: {e}")
|
|
except Exception as e:
|
|
logger.error(f"Unexpected error during getOrRefreshToken: {e}")
|
|
raise AirflowException(f"Unexpected error during API call: {e}")
|
|
|
|
except TTransportException as e:
|
|
# Handle connection-specific transport errors
|
|
if "read 0 bytes" in str(e) or "Could not connect to" in str(e) or "Connection refused" in str(e):
|
|
logger.error(f"Connection failed to {host}:{port}. Details: {e}")
|
|
logger.error("Possible causes: Server down, firewall block, incorrect IP/port.")
|
|
raise AirflowException(f"Failed to connect to YTDLP service at {host}:{port}: {e}")
|
|
else:
|
|
logger.error(f"Thrift transport error during connection: {str(e)}")
|
|
raise AirflowException(f"Transport error connecting to YTDLP service: {str(e)}")
|
|
except Exception as e:
|
|
logger.error(f"Unexpected error during connection or ping: {str(e)}")
|
|
raise # Re-raise other unexpected errors
|
|
|
|

            # Log received token data attributes for debugging
            logger.debug(f"Token data received. Attributes: {dir(token_data)}")
            for attr in dir(token_data):
                if not attr.startswith('__') and not callable(getattr(token_data, attr)):  # Non-callable attributes only
                    value = getattr(token_data, attr)
                    if attr == 'infoJson' and value:
                        logger.debug(f"infoJson: {value[:50]}...")
                    else:
                        logger.debug(f"{attr}: {value}")

            info_json_path = None  # Initialize info_json_path

            save_info_json_param = params.get('save_info_json', self.save_info_json)
            # Render if it's a string template
            if isinstance(save_info_json_param, str):
                save_info_json_rendered = self.render_template(save_info_json_param, context)
                # Convert common string representations to a boolean
                save_info_json = str(save_info_json_rendered).lower() in ['true', '1', 't', 'y', 'yes']
            else:
                save_info_json = bool(save_info_json_param)

            # Save info.json if requested and valid
            if save_info_json:  # Use the resolved value, not the raw operator attribute
                info_json = self._get_info_json(token_data)
                if info_json and self._is_valid_json(info_json):
                    try:
                        # _save_info_json handles rendering, directory creation, and logging
                        info_json_path = self._save_info_json(context, info_json)
                        if info_json_path:  # Saving succeeded
                            context['task_instance'].xcom_push(key='info_json_path', value=info_json_path)
                            logger.info(f"Successfully saved info.json and pushed path to XCom: {info_json_path}")
                        else:
                            # _save_info_json logs its own errors; push None to signal failure
                            context['task_instance'].xcom_push(key='info_json_path', value=None)
                            logger.warning("info.json saving failed (see _save_info_json logs); pushed None to XCom for info_json_path.")
                    except Exception as e:
                        logger.error(f"Unexpected error during info.json saving process: {e}", exc_info=True)
                        context['task_instance'].xcom_push(key='info_json_path', value=None)  # Push None on error
                elif info_json:
                    logger.warning("Retrieved infoJson is not valid JSON. Skipping save.")
                    context['task_instance'].xcom_push(key='info_json_path', value=None)
                else:
                    logger.info("No infoJson found in token data. Skipping save.")
                    context['task_instance'].xcom_push(key='info_json_path', value=None)
            else:
                logger.info("save_info_json is False. Skipping info.json save.")
                context['task_instance'].xcom_push(key='info_json_path', value=None)

            # Extract and optionally store the SOCKS proxy
            socks_proxy = None
            if self.get_socks_proxy:
                # Check common attribute names for the proxy
                proxy_attr = next((attr for attr in ('socks5Proxy', 'socksProxy', 'socks') if hasattr(token_data, attr)), None)
                if proxy_attr:
                    socks_proxy = getattr(token_data, proxy_attr) or None
                    if socks_proxy:
                        logger.info(f"Extracted SOCKS proxy ({proxy_attr}): {socks_proxy}")
                    else:
                        logger.info(f"Found proxy attribute '{proxy_attr}' but its value is empty. No proxy extracted.")
                else:
                    logger.info("get_socks_proxy is True, but no SOCKS proxy attribute found in token data.")
            else:
                logger.info("get_socks_proxy is False. Skipping proxy extraction.")

            if self.store_socks_proxy:
                # Always push the key (even when None) so downstream tasks can rely on it
                context['task_instance'].xcom_push(key='socks_proxy', value=socks_proxy)
                logger.info(f"Pushed 'socks_proxy' to XCom: {socks_proxy}")
            elif socks_proxy:
                logger.info("SOCKS proxy extracted but not pushed to XCom (store_socks_proxy=False).")

            # Get the original command from the server
            ytdlp_cmd = getattr(token_data, 'ytdlpCommand', None)
            if not ytdlp_cmd:
                logger.error("No 'ytdlpCommand' attribute found in token data.")
                raise AirflowException("Required 'ytdlpCommand' not received from service.")

            logger.info(f"Original command received from server: {ytdlp_cmd}")

            # Log an example usage command (do NOT modify the original command here)
            if info_json_path:
                # Use double quotes around paths/proxy in the example for robustness
                example_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
                if socks_proxy:
                    example_cmd += f" --proxy \"{socks_proxy}\""
                example_cmd += " --verbose --simulate"  # Useful flags for testing
                logger.info("\n--- Example usage with saved info.json ---")
                logger.info(example_cmd)
                logger.info("(Note: the actual command with tokens/cookies is pushed to XCom as 'ytdlp_command')")
                latest_json_path = os.path.join(os.path.dirname(info_json_path), 'latest.json')
                logger.info(f"(You can also use 'latest.json': {latest_json_path})")
                logger.info("-------------------------------------------\n")
            else:
                logger.info("\n--- Original command pushed to XCom ('ytdlp_command') ---")
                if socks_proxy:
                    logger.info(f"Use the extracted proxy '{socks_proxy}' (pushed to XCom if store_socks_proxy=True) with the --proxy flag.")
                logger.info("Add --verbose and --simulate flags for testing the command.")
                logger.info("-------------------------------------------------------\n")

            # Push the *original* command to XCom
            context['task_instance'].xcom_push(key='ytdlp_command', value=ytdlp_cmd)
            logger.info("Pushed original command to XCom key 'ytdlp_command'.")

            # Note: returning ytdlp_cmd below also pushes the same value to XCom
            # under 'return_value'. Downstream tasks should prefer the explicit
            # 'ytdlp_command' key for clarity.
            return ytdlp_cmd  # Return the original command

        except AirflowException as e:  # AirflowExceptions raised explicitly above
            logger.error(f"Operation failed due to AirflowException: {e}")
            raise  # Re-raise to ensure task failure
        except (TTransportException, PBServiceException) as e:  # Thrift/service errors not already wrapped
            logger.error(f"Unhandled Thrift/Service error: {e}", exc_info=True)
            raise AirflowException(f"Unhandled YTDLP service error: {e}")
        except Exception as e:  # Catch-all for truly unexpected errors
            logger.error(f"Caught unexpected error in YtdlpOpsOperator: {e}", exc_info=True)
            raise AirflowException(f"Unexpected error caused task failure: {e}")
        finally:
            if transport and transport.isOpen():  # Close the transport if it is still open
                logger.info("Closing Thrift transport.")
                transport.close()

    # --- Helper Methods ---

    def _get_info_json(self, token_data):
        """Safely extracts infoJson from token data."""
        info_json = getattr(token_data, 'infoJson', None)
        if info_json:
            logger.debug("Extracted infoJson from token data.")
        else:
            logger.debug("No infoJson attribute found in token data.")
        return info_json

    def _is_valid_json(self, json_str):
        """Checks whether a string is valid JSON."""
        if not json_str or not isinstance(json_str, str):
            logger.debug("Input is not a non-empty string; considered invalid JSON.")
            return False
        try:
            json.loads(json_str)
            logger.debug("JSON string validation successful.")
            return True
        except json.JSONDecodeError as e:
            logger.warning(f"JSON validation failed: {e}")
            return False

    def _save_info_json(self, context, info_json):
        """Saves info_json to a file, handling directory creation and logging.
        Returns the path on success, None on failure."""
        try:
            # Get the URL from params/context for video ID extraction
            url_param = context.get('params', {}).get('url', self.url)
            video_id = self._extract_video_id(url_param)

            # Render the info_json_dir template
            save_dir_template = self.info_json_dir or "."  # Default to the current dir if unset
            save_dir = self.render_template(save_dir_template, context)
            if not save_dir:  # Template rendered to an empty string
                logger.warning(f"Rendered info_json_dir template '{save_dir_template}' resulted in an empty path. Defaulting to '.'")
                save_dir = "."
            logger.info(f"Target directory for info.json (rendered): {save_dir}")

            # Ensure the directory exists
            try:
                os.makedirs(save_dir, exist_ok=True)
                logger.info(f"Ensured directory exists: {save_dir}")
            except OSError as e:
                logger.error(f"Could not create directory {save_dir}: {e}. Cannot save info.json.")
                return None  # Indicate failure

            # Construct the filename (using the potentially overridden account_id)
            account_id_param = context.get('params', {}).get('account_id', self.account_id)
            timestamp = int(time.time())
            base_filename = f"info_{video_id}_{account_id_param}_{timestamp}.json" if video_id else f"info_{account_id_param}_{timestamp}.json"
            info_json_path = os.path.join(save_dir, base_filename)
            latest_json_path = os.path.join(save_dir, "latest.json")  # Path for the 'latest' copy

            # Write the timestamped file
            try:
                logger.info(f"Writing info.json content (received from service) to {info_json_path}...")
                with open(info_json_path, 'w', encoding='utf-8') as f:
                    f.write(info_json)
                logger.info(f"Successfully saved info.json to timestamped file: {info_json_path}")
            except IOError as e:
                logger.error(f"Failed to write info.json to {info_json_path}: {e}")
                return None  # Indicate failure

            # Overwrite latest.json - best effort
            try:
                with open(latest_json_path, 'w', encoding='utf-8') as f:
                    f.write(info_json)
                logger.info(f"Updated latest.json file: {latest_json_path}")
            except IOError as e:
                # Log a warning but don't fail the save if only latest.json fails
                logger.warning(f"Failed to update latest.json at {latest_json_path}: {e}")

            return info_json_path  # Return the path even if latest.json failed

        except Exception as e:
            logger.error(f"Unexpected error in _save_info_json: {e}", exc_info=True)
            return None  # Indicate failure
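
    # Resulting on-disk layout (illustrative names; timestamp and ids vary per run):
    #   <info_json_dir>/info_<video_id>_<account_id>_<unix_ts>.json
    #   <info_json_dir>/latest.json   (overwritten on every run)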

    def _extract_video_id(self, url):
        """Extracts a YouTube video ID from a URL (internal helper)."""
        if not url or not isinstance(url, str):
            logger.debug("URL is empty or not a string; cannot extract video ID.")
            return None
        try:
            # Basic extraction logic (can be extended for more URL types)
            video_id = None
            if 'youtube.com/watch?v=' in url:
                video_id = url.split('v=')[1].split('&')[0]
            elif 'youtu.be/' in url:
                video_id = url.split('youtu.be/')[1].split('?')[0]

            # Standard video IDs are 11 characters
            if video_id and len(video_id) >= 11:
                video_id = video_id[:11]  # Truncate to the standard ID length
                logger.debug(f"Extracted video ID '{video_id}' from URL: {url}")
                return video_id
            else:
                logger.debug(f"Could not extract a standard video ID pattern from URL: {url}")
                return None
        except Exception as e:
            logger.error(f"Failed to extract video ID from URL '{url}'. Error: {e}")
            return None
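
    # Illustrative behaviour of _extract_video_id (example URL patterns only):
    #   https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=10s  -> 'dQw4w9WgXcQ'
    #   https://youtu.be/dQw4w9WgXcQ?feature=shared        -> 'dQw4w9WgXcQ'
    #   https://example.com/not-youtube                    -> None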


# =============================================================================
# Python Callables for Tasks
# =============================================================================

def display_token_info(**context):
    """Displays token info from XCom, parses info.json, and logs example commands."""
    ti = context['task_instance']
    logger.info("Starting display_token_info task.")

    # Pull data from XCom
    info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')
    socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy')
    ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')

    logger.info("\n=== Pulled Token Information from XCom ===")
    logger.info(f"Info.json path: {info_json_path or 'Not found/Not saved'}")
    logger.info(f"SOCKS Proxy: {socks_proxy or 'Not found/Not extracted'}")
    logger.info(f"Original yt-dlp command (with tokens): {ytdlp_command or 'Not found'}")

    result = {
        'info_path': info_json_path,
        'proxy': socks_proxy,
        'ytdlp_command': ytdlp_command,
        'video_info': None,
        'commands': {},
        'error': None
    }

    if info_json_path and os.path.exists(info_json_path):
        logger.info(f"\n=== Processing Video Information from: {info_json_path} ===")
        try:
            with open(info_json_path, 'r', encoding='utf-8') as f:
                info = json.load(f)

            # Extract and log basic video info safely
            title = info.get('title', 'Unknown Title')
            uploader = info.get('uploader', 'Unknown Author')
            duration = info.get('duration_string', 'Unknown Length')
            upload_date_str = info.get('upload_date')  # Format: YYYYMMDD
            upload_date_formatted = 'Unknown Date'
            if upload_date_str:
                try:
                    # Validate the format before parsing
                    if len(upload_date_str) == 8 and upload_date_str.isdigit():
                        upload_date_formatted = datetime.strptime(upload_date_str, '%Y%m%d').strftime('%Y-%m-%d')
                    else:
                        logger.warning(f"Upload date '{upload_date_str}' is not in YYYYMMDD format.")
                except ValueError:
                    logger.warning(f"Could not parse upload_date '{upload_date_str}'")

            result['video_info'] = {
                'title': title,
                'uploader': uploader,
                'upload_date': upload_date_formatted,
                'duration': duration
            }

            logger.info(f"Title: {title}")
            logger.info(f"Author: {uploader}")
            logger.info(f"Date: {upload_date_formatted}")
            logger.info(f"Length: {duration}")

            logger.info("\n=== Example yt-dlp Commands (using saved info.json) ===")
            base_cmd = f"yt-dlp --load-info-json \"{info_json_path}\""
            if socks_proxy:
                base_cmd += f" --proxy \"{socks_proxy}\""

            # Command to list formats
            format_cmd = f"{base_cmd} -F"
            result['commands']['format'] = format_cmd
            logger.info(f"List formats command: {format_cmd}")

            # Execute and log the format listing command
            # (os.popen keeps this simple; a subprocess-based alternative is
            # sketched after this function)
            logger.info("\n--- Executing Format List Command ---")
            try:
                logger.info(f"Running: {format_cmd}")
                format_output = os.popen(format_cmd).read()
                logger.info("--- Format List Output ---")
                logger.info(format_output)
                logger.info("--------------------------")
            except Exception as e:
                logger.error(f"Error executing format command: {e}")

            # Command to simulate a download
            simulate_cmd = f"{base_cmd} --simulate --verbose"  # verbose for more info
            result['commands']['simulate'] = simulate_cmd
            logger.info(f"Simulate download command: {simulate_cmd}")

            # Execute and log the simulation command
            logger.info("\n--- Executing Simulation Command ---")
            try:
                logger.info(f"Running: {simulate_cmd}")
                simulate_output = os.popen(simulate_cmd).read()
                logger.info("--- Simulation Output ---")
                logger.info(simulate_output)
                logger.info("-------------------------")
            except Exception as e:
                logger.error(f"Error executing simulation command: {e}")

            # Basic download command
            download_cmd = base_cmd
            result['commands']['download_base'] = download_cmd
            logger.info(f"Base download command (add format selection, output path): {download_cmd}")

            # Push the base download command to XCom for downstream use
            ti.xcom_push(key='download_cmd', value=download_cmd)
            logger.info(f"Pushed key 'download_cmd' to XCom with value: {download_cmd}")

        except json.JSONDecodeError as e:
            error_msg = f"Failed to parse info.json file '{info_json_path}': {e}"
            logger.error(error_msg)
            result['error'] = error_msg
        except FileNotFoundError:
            error_msg = f"Info.json file not found at path: {info_json_path}"
            logger.error(error_msg)
            result['error'] = error_msg
        except Exception as e:
            error_msg = f"Error processing info.json file '{info_json_path}': {e}"
            logger.error(error_msg, exc_info=True)
            result['error'] = error_msg
    elif info_json_path:
        error_msg = f"Info.json path provided ('{info_json_path}') but the file does not exist."
        logger.warning(error_msg)
        result['error'] = error_msg
    else:
        logger.warning("No info.json path found in XCom. Cannot display video details or generate example commands.")
        result['error'] = "Info.json path not available."

    logger.info("Finished display_token_info task.")
    # Return the collected information (pushed to XCom as 'return_value')
    return json.dumps(result)  # JSON string for XCom compatibility
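
# A safer alternative to os.popen for the command execution above — a sketch,
# assuming yt-dlp is available on the worker's PATH:
#
#   import subprocess
#   proc = subprocess.run(format_cmd, shell=True, capture_output=True,
#                         text=True, timeout=300)
#   logger.info(proc.stdout)
#   if proc.returncode != 0:
#       logger.warning(proc.stderr)
#
# subprocess.run captures stderr and the exit code, which os.popen discards.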


def store_token_info(**context):
    """Stores retrieved token information (command, proxy, info.json) in Redis."""
    ti = context['task_instance']
    # Use the redis_conn_id from DAG params if present, else the default
    redis_conn_id = context['params'].get('redis_conn_id', 'redis_default')
    redis_hook = RedisHook(redis_conn_id=redis_conn_id)
    logger.info(f"Starting store_token_info task using Redis connection '{redis_conn_id}'.")

    try:
        # Pull necessary data from XCom and context
        url = context['params'].get('url')
        if not url:
            # Fall back to the DAG run conf
            dag_run = context.get('dag_run')
            url = dag_run.conf.get('url') if dag_run and dag_run.conf else None
            if not url:
                raise ValueError("URL parameter is missing in context['params'] and dag_run.conf")
            logger.warning("URL parameter missing in context['params'], using URL from dag_run.conf.")

        ytdlp_command = ti.xcom_pull(task_ids='get_token', key='ytdlp_command')
        socks_proxy = ti.xcom_pull(task_ids='get_token', key='socks_proxy') or ''  # Default to empty string
        info_json_path = ti.xcom_pull(task_ids='get_token', key='info_json_path')

        if not ytdlp_command:
            logger.warning("ytdlp_command not found in XCom. Storing empty value.")
            ytdlp_command = ''

        # Construct the base command using info.json
        ytdlp_command_base = ''
        if info_json_path and os.path.exists(info_json_path):
            ytdlp_command_base = f"yt-dlp --load-info-json \"{info_json_path}\""
            logger.info(f"Constructed base command: {ytdlp_command_base}")
        else:
            logger.warning("Cannot construct base command: info_json_path not valid.")

        # Construct the command with tokens and proxy
        ytdlp_command_tokens = ytdlp_command  # Start with the original command from the server
        if socks_proxy:
            ytdlp_command_tokens += f" --proxy \"{socks_proxy}\""
            logger.info("Appended proxy to token command.")

        data_to_store = {
            'url': url,
            'ytdlp_command': ytdlp_command_base,  # The base command
            'proxy': socks_proxy,
            'info_json_path': info_json_path or ''
        }

        # Read the info.json content if the path exists
        if info_json_path and os.path.exists(info_json_path):
            try:
                with open(info_json_path, 'r', encoding='utf-8') as f:
                    # Validate the JSON structure before storing
                    info_json_content = json.load(f)
                data_to_store['info_json'] = json.dumps(info_json_content)
                logger.info(f"Read and validated info.json content from: {info_json_path}")
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse info.json file '{info_json_path}' as JSON: {e}. Storing empty content.")
                data_to_store['info_json'] = ''
            except Exception as e:
                logger.error(f"Failed to read info.json file '{info_json_path}': {e}. Storing empty content.")
                data_to_store['info_json'] = ''
        else:
            logger.warning(f"info_json_path ('{info_json_path}') not found or invalid. Storing without info_json content.")
            data_to_store['info_json'] = ''

        # Determine the Redis key from the video ID (same basic logic as the
        # operator's _extract_video_id helper, re-implemented here for simplicity)
        video_id = None
        try:
            if 'youtube.com/watch?v=' in url:
                video_id = url.split('v=')[1].split('&')[0][:11]
            elif 'youtu.be/' in url:
                video_id = url.split('youtu.be/')[1].split('?')[0][:11]
        except Exception:
            pass  # Ignore errors in ID extraction for key generation
        redis_key = f"token_info:{video_id or 'unknown'}"
        logger.info(f"Determined Redis key: {redis_key}")

        # Add video_id, timestamp, and the token command to the stored data
        data_to_store['video_id'] = video_id or 'unknown'
        data_to_store['timestamp'] = int(time.time())
        data_to_store['ytdlp_command_tokens'] = ytdlp_command_tokens

        # Log the fields being stored (summarize large values)
        log_data = {k: (f"<{len(v)} bytes>" if isinstance(v, str) and len(v) > 100 else v) for k, v in data_to_store.items()}
        logger.info(f"Storing in Redis key '{redis_key}': {log_data}")

        # Store the data in a Redis hash with a 24-hour expiration
        redis_client = redis_hook.get_conn()
        redis_client.hset(redis_key, mapping=data_to_store)
        redis_client.expire(redis_key, 86400)
        logger.info(f"Successfully stored token info in Redis key '{redis_key}' with 24h expiration.")

    except Exception as e:
        logger.error(f"Failed to store token info in Redis: {e}", exc_info=True)
        # Re-raise as AirflowException to fail the task
        raise AirflowException(f"Failed to store token info in Redis: {e}")

    logger.info("Finished store_token_info task.")
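
# Consumer-side sketch (assumes the same Redis connection and key layout as above):
#
#   client = RedisHook(redis_conn_id='redis_default').get_conn()
#   stored = client.hgetall('token_info:<video_id>')  # keys/values are bytes by default
#   command = stored.get(b'ytdlp_command_tokens', b'').decode()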


# =============================================================================
# DAG Definition
# =============================================================================

# default_args match the structure of ytdlp_client_dag.py
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,  # Match the reference DAG
    'email_on_retry': False,    # Match the reference DAG
    'retries': 1,  # Default task retries
    'retry_delay': timedelta(minutes=5),  # Standard task retry delay
    'start_date': days_ago(1)  # Best-practice start date
}

with DAG(
    dag_id='ytdlp_client_dag_v2.1',
    default_args=default_args,
    schedule_interval=None,  # Manually triggered DAG
    catchup=False,  # Don't backfill past schedules
    description='DAG for YTDLP operations using Thrift client (V2 - Refactored)',
    tags=['ytdlp', 'thrift', 'client', 'v2'],
    params={
        # DAG parameters with defaults and types for UI clarity
        'url': Param('https://www.youtube.com/watch?v=sOlTX9uxUtM', type=["null", "string"], description="Required: The video URL to process."),
        'redis_enabled': Param(False, type="boolean", description="Use Redis for service discovery? If False, uses service_ip/port."),
        'service_ip': Param('85.192.30.55', type="string", description="Service IP if redis_enabled=False."),
        'service_port': Param(9090, type="integer", description="Service port if redis_enabled=False."),
        'account_id': Param('account_fr_2025-04-03T1220_anonomyous_2ssdfsf2342afga09', type="string", description="Account ID for Redis lookup or direct call."),
        'timeout': Param(DEFAULT_TIMEOUT, type="integer", description="Timeout in seconds for the Thrift connection."),
        # Use an Airflow Variable for the downloads directory, matching the reference DAG
        'info_json_dir': Param("{{ var.value.get('DOWNLOADS_TEMP', '/opt/airflow/downloadfiles') }}", type="string", description="Directory to save info.json. Uses Airflow Variable 'DOWNLOADS_TEMP' or default.")
    }
) as dag:

    # Define Tasks

    get_token = YtdlpOpsOperator(
        task_id='get_token',
        # Templated parameters resolved from the DAG run config
        url="{{ params.url }}",
        # redis_enabled is not a template field, so a Jinja string would never be
        # rendered; execute() resolves the actual value from params at runtime.
        redis_enabled=False,
        service_ip="{{ params.service_ip }}",
        service_port="{{ params.service_port }}",
        account_id="{{ params.account_id }}",
        save_info_json=True,
        info_json_dir="{{ params.info_json_dir }}",
        get_socks_proxy=True,
        store_socks_proxy=True,
        timeout="{{ params.timeout }}",
        retries=MAX_RETRIES,  # Operator-specific retries; otherwise the DAG default applies
        retry_delay=RETRY_DELAY,  # Operator-specific delay
        # Callbacks for logging success/failure, as in the reference DAG
        on_failure_callback=lambda context: logger.error(f"Task {context['task_instance_key_str']} failed."),
        on_success_callback=lambda context: logger.info(f"Task {context['task_instance_key_str']} succeeded.")
    )

    # Task documentation (visible in the Airflow UI)
    get_token.doc_md = """
    ### Get Token Task
    Connects to the YTDLP Thrift service (either directly or via Redis discovery)
    to retrieve an authentication token and video metadata (info.json).

    **Pushes to XCom:**
    - `info_json_path`: Path to the saved info.json file (or None if not saved/failed).
    - `socks_proxy`: The extracted SOCKS proxy string (or None if not requested/found).
    - `ytdlp_command`: The original command string received from the server (contains tokens/cookies).

    Uses parameters defined in the DAG run configuration.
    """
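
    # Sketch of a possible downstream consumer (not wired into this DAG): a
    # BashOperator could run the retrieved command directly, e.g.
    #
    #   from airflow.operators.bash import BashOperator
    #   run_download = BashOperator(
    #       task_id='run_download',
    #       bash_command="{{ ti.xcom_pull(task_ids='get_token', key='ytdlp_command') }} --simulate",
    #   )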

    # Optional: explicitly check XComs for debugging (as in the reference DAG)
    def _check_xcom_callable(**context):
        """Logs the XCom values pushed by the get_token task."""
        ti = context['task_instance']
        logger.info("--- Checking XCom values pushed by get_token ---")
        keys_to_check = ['info_json_path', 'socks_proxy', 'ytdlp_command']
        xcom_values = {}
        for key in keys_to_check:
            value = ti.xcom_pull(task_ids='get_token', key=key)
            xcom_values[key] = value
            # Avoid logging potentially sensitive command details in full
            if key == 'ytdlp_command' and value:
                log_value = f"{value[:50]}..."  # Truncate the command
            else:
                log_value = value
            logger.info(f"XCom key='{key}': {log_value}")
        logger.info("----------------------------------------------")
        return xcom_values  # Returned for potential future use

    check_xcom_task = PythonOperator(
        task_id='check_xcom_after_get_token',
        python_callable=_check_xcom_callable,
    )
    check_xcom_task.doc_md = "Logs the values pushed to XCom by the 'get_token' task for debugging purposes."

    display_info = PythonOperator(
        task_id='display_token_info',
        python_callable=display_token_info,
        trigger_rule='all_success'
    )
    display_info.doc_md = """
    ### Display Token Info Task
    Pulls information from XCom, parses the `info.json` file (if available),
    logs video details, and generates example `yt-dlp` commands.

    **Pulls from XCom (task_id='get_token'):**
    - `info_json_path`
    - `socks_proxy`
    - `ytdlp_command`

    **Pushes to XCom:**
    - `download_cmd`: Base command using `--load-info-json` (add format/output options as needed).
    """

    store_info = PythonOperator(
        task_id='store_token_info',  # Consistent task ID naming
        python_callable=store_token_info,
    )
    store_info.doc_md = """
    ### Store Token Info Task
    Pulls information from XCom and DAG parameters, reads the `info.json` content,
    and stores the relevant data in a Redis hash.

    **Pulls from XCom (task_id='get_token'):**
    - `ytdlp_command`
    - `socks_proxy`
    - `info_json_path`

    **Pulls from DAG context:**
    - `params['url']` (or `dag_run.conf['url']`)

    **Stores in Redis Hash (key: `token_info:<video_id>`):**
    - `url`: The video URL.
    - `ytdlp_command`: Base command using `--load-info-json`.
    - `proxy`: The SOCKS proxy string.
    - `info_json_path`: Path to the saved info.json file.
    - `info_json`: The full content of the info.json file (as a JSON string).
    - `video_id`: Extracted video ID.
    - `timestamp`: Unix timestamp of storage.
    - `ytdlp_command_tokens`: The original command string from the server (contains tokens/cookies).

    Sets a 24-hour expiration on the Redis key.
    """

    # Define task dependencies, matching the reference DAG structure
    get_token >> check_xcom_task >> display_info >> store_info
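
# Example manual trigger with a config override (values are placeholders):
#
#   airflow dags trigger ytdlp_client_dag_v2.1 \
#       --conf '{"url": "https://www.youtube.com/watch?v=<id>", "redis_enabled": true, "account_id": "<account>"}'
#
# With the default core.dag_run_conf_overrides_params=True, these conf keys
# override the matching Params declared above for that run.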