#!/usr/bin/env python3
"""YtdlpOpsService client: module header, signal handling, and Thrift compatibility patch.

Imports, graceful-shutdown signal handlers, and the optional
thrift_exceptions_patch are set up here before the rest of the module loads.
"""
from typing import Dict, List, Optional, Any
import argparse
import csv
import datetime
import json
import os
import re
import subprocess
import sys
import time
import uuid
import traceback
import logging
import signal
from pathlib import Path
from tabulate import tabulate
import yt_dlp


def signal_handler(sig: int, frame) -> None:
    """Handle shutdown signals gracefully."""
    # Resolve the logger locally instead of using the module-level `logger`:
    # that global is configured further down the file, so a signal delivered
    # during module import would otherwise raise NameError inside the handler.
    logging.getLogger(__name__).info(f"Received signal {sig}, shutting down...")
    # Clean up any resources here
    sys.exit(0)


# Register signal handlers so Ctrl-C / SIGTERM exit cleanly.
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

# Import the patch for Thrift exceptions (optional; `os` is already imported
# at the top of the file, so no local re-import is needed here).
try:
    from thrift_exceptions_patch import patch_thrift_exceptions
    # Explicitly call the patch function to ensure it's applied
    patch_thrift_exceptions()
    print("Applied Thrift exceptions patch for compatibility")
    if 'AIRFLOW_HOME' in os.environ:
        print("Running in Airflow environment - patch is essential")
    else:
        print("Not running in Airflow environment, but patch applied anyway for consistency")
except ImportError:
    print("Could not import thrift_exceptions_patch, compatibility may be affected")
    print("If running in Airflow, this may cause 'immutable instance' errors")
except Exception as e:
    print(f"Error applying Thrift exceptions patch: {e}")

# --- Python Path Setup ---
# Ensure the script can find necessary modules, especially Thrift-generated code.
# Assumes the script is run from the project root or the path is adjusted accordingly.
# Resolve project paths so Thrift-generated code can be imported.
project_root = Path(__file__).parent.absolute()
gen_py_dir = project_root / "thrift_model" / "gen_py"

# Add project root to sys.path (needed for the 'pangramia' symlink)
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Verify paths for debugging
# print("Project Root:", project_root)
# print("Project Root:", project_root)
# print("Gen Py Dir:", gen_py_dir)
# print("Sys Path:", sys.path)
# --- End Python Path Setup ---

from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol

try:
    from pangramia.yt.tokens_ops import YTTokenOpService
    from pangramia.yt.common.ttypes import JobTokenData, TokenUpdateMode, JobState
    from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException
except ImportError as e:
    # Thrift stubs must be generated first; bail out with guidance.
    print(f"Error importing Thrift-generated modules: {e}")
    print("Please ensure you have run './generate-thrift.py' successfully from the project root.")
    print(f"Current sys.path includes: {gen_py_dir}")
    sys.exit(1)

# Configure logging: everything goes to both stderr and a local log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('ytdlp_ops_client.log')
    ]
)
logger = logging.getLogger(__name__)


def get_info_json(token_data) -> str:
    """Return the non-empty infoJson string carried by *token_data*.

    Raises:
        ValueError: if the attribute is missing, empty, or the empty object "{}".
    """
    if not hasattr(token_data, 'infoJson'):
        logger.error("infoJson attribute missing in token_data")
        raise ValueError("Server response missing infoJson")
    if not token_data.infoJson or token_data.infoJson == "{}":
        logger.error("Empty infoJson received from server")
        raise ValueError("Empty infoJson received from server")
    logger.info(f"Using infoJson from server response ({len(token_data.infoJson)} bytes)")
    return token_data.infoJson


def is_valid_json(json_str: str) -> bool:
    """Check if a string is valid JSON and not empty.

    Returns True for any parseable, non-empty JSON object — including one that
    carries an 'error'/'errorCode' field (the caller inspects errors itself);
    False for empty input, "{}", or unparseable text.
    """
    if not json_str or json_str == "{}" or json_str == "":
        logger.warning("Empty JSON string received")
        return False
    try:
        data = json.loads(json_str)
        # Check if it's an empty object
        if isinstance(data, dict) and not data:
            logger.warning("Empty JSON object received")
            return False
        # Check if it has an error field
        if isinstance(data, dict) and ('error' in data or 'errorCode' in data):
            # It's valid JSON but contains an error
            logger.warning(f"JSON contains error: {data.get('error', 'Unknown error')} (code: {data.get('errorCode', 'none')})")
            return True
        # Check if it has at least some basic fields
        if isinstance(data, dict) and ('id' in data or 'title' in data):
            logger.info(f"Valid JSON with video data: {data.get('title', 'Unknown title')}")
            return True
        # Check if it has token_data which is important
        if isinstance(data, dict) and 'token_data' in data and data['token_data']:
            logger.info("Valid JSON with token_data")
            return True
        logger.warning("JSON is valid but missing expected fields")
        return True
    except json.JSONDecodeError as e:
        logger.warning(f"Invalid JSON: {e}")
        return False
    except Exception as e:
        logger.warning(f"Unexpected error validating JSON: {e}")
        return False


def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character video ID from a YouTube URL (or pass a bare ID through)."""
    # If it's already a video ID
    if re.match(r'^[a-zA-Z0-9_-]{11}$', url):
        return url
    # Handle youtu.be URLs
    youtu_be_match = re.search(r'youtu\.be/([a-zA-Z0-9_-]{11})', url)
    if youtu_be_match:
        return youtu_be_match.group(1)
    # Handle youtube.com URLs (watch, embed, /v/, and ?v= query forms)
    youtube_match = re.search(r'(?:youtube\.com/(?:watch\?v=|embed/|v/)|youtube\.com/.*[?&]v=)([a-zA-Z0-9_-]{11})', url)
    if youtube_match:
        return youtube_match.group(1)
    # Handle shorts URLs
    shorts_match = re.search(r'youtube\.com/shorts/([a-zA-Z0-9_-]{11})', url)
    if shorts_match:
        return shorts_match.group(1)
    return None


def list_available_formats(url: str, args: argparse.Namespace) -> Optional[List[Dict[str, Any]]]:
    """List available formats for a YouTube video via a local yt-dlp probe.

    Returns a list of per-format summary dicts, or None on any failure.
    """
    ydl_opts = {
        # Precedence note: this parses as `(not args.no_quiet) if hasattr(...) else True`.
        'quiet': not args.no_quiet if hasattr(args, 'no_quiet') else True,
        'no_warnings': True,
        'skip_download': True,
        'extract_flat': True,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            if not info:
                logger.error("Could not retrieve video information")
                return None
            formats = info.get('formats', [])
            if not formats:
                logger.warning("No formats available for this video")
                return None
            # Create a table of available formats (missing fields become 'unknown')
            format_table = []
            for f in formats:
                format_table.append({
                    'format_id': f.get('format_id', 'unknown'),
                    'ext': f.get('ext', 'unknown'),
                    'resolution': f.get('resolution', 'unknown'),
                    'fps': f.get('fps', 'unknown'),
                    'vcodec': f.get('vcodec', 'unknown'),
                    'acodec': f.get('acodec', 'unknown'),
                    'filesize': f.get('filesize', 'unknown'),
                    'format_note': f.get('format_note', '')
                })
            return format_table
    except Exception as e:
        logger.error(f"Error listing formats: {e}")
        return None


def suggest_best_formats(formats: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Suggest best formats based on resolution and codec (at most 3, one per resolution)."""
    best = []
    seen_resolutions = set()
    # Prioritize higher resolutions and certain codecs
    preferred_codecs = ["vp9", "avc1", "av01"]  # In order of preference
    # NOTE(review): the dicts built by list_available_formats carry no 'height'
    # key, so the first sort component is always 0 there; and 'filesize' may be
    # the string 'unknown', which could raise TypeError when compared against an
    # int in the tie-break — TODO confirm intended input shape.
    for f in sorted(formats, key=lambda x: (
        -int(x.get('height', 0) or 0),  # Higher resolution first
        preferred_codecs.index(x.get('vcodec', '').split('.')[0]) if x.get('vcodec', '').split('.')[0] in preferred_codecs else float('inf'),  # Preferred codecs
        x.get('filesize', 0) or 0  # Smaller filesize
    )):
        resolution = f.get('resolution')
        if resolution and resolution not in seen_resolutions:
            best.append(f)
            seen_resolutions.add(resolution)
            if len(best) >= 3:  # Suggest up to 3 formats
                break
    return best


def load_info_json(path: str) -> Optional[Dict[str, Any]]:
    """Load and validate info.json file. Returns the parsed dict or None on failure."""
    try:
        path = Path(path).resolve()
        if not path.exists():
            logger.error(f"Info.json file not found: {path}")
            return None
        with open(path, 'r') as f:
            data = json.load(f)
        # Basic validation
        if not isinstance(data, dict):
            logger.error("Invalid info.json format: not a JSON object")
            return None
        if 'id' not in data:
            logger.warning("Info.json missing video ID")
        return data
    except Exception as e:
        logger.error(f"Error loading info.json: {e}")
        return None


def save_info_json(info_json: str, video_id: str, context_dir: str) -> Optional[str]:
    """Save info.json to disk and return the saved path.

    Writes a timestamped file, a stable `info_json_<id>.json` alias (symlink
    where possible, copy otherwise), and a `latest.json` copy.

    NOTE(review): on failure this returns False, not None as the annotation
    suggests; callers only truth-test the result, so behavior is unaffected.
    """
    try:
        # Ensure context directory exists
        Path(context_dir).mkdir(parents=True, exist_ok=True)
        # Create filename with video ID and timestamp
        timestamp = int(time.time())
        output_path = Path(context_dir) / f"info_json_{video_id}_{timestamp}.json"
        # Write the file
        with open(output_path, 'w') as f:
            f.write(info_json)
        # Also create a symlink or copy to the standard name for compatibility
        standard_path = Path(context_dir) / f"info_json_{video_id}.json"
        try:
            # Try to create a symlink first (more efficient)
            if os.path.exists(standard_path):
                os.remove(standard_path)
            os.symlink(output_path, standard_path)
        except (OSError, AttributeError):
            # If symlink fails (e.g., on Windows), make a copy
            with open(standard_path, 'w') as f:
                f.write(info_json)
        # Save latest.json
        latest_path = Path(context_dir) / "latest.json"
        with open(latest_path, 'w') as f:
            f.write(info_json)
        logger.info(f"Successfully saved info.json to {output_path} and latest.json to {latest_path}")
        return str(output_path)
    except Exception as e:
        logger.error(f"Failed to save info.json: {e}")
        logger.error(traceback.format_exc())
        return False


def main():
    """Entry point: parse CLI args, connect to the Thrift server, fetch token
    data for a YouTube URL, save the returned info.json, and print ready-to-run
    yt-dlp commands. Exit codes: 1 generic error, 2 proxy failure, 3 timeout."""
    # Create main parser
    parser = argparse.ArgumentParser(description='''YtdlpOpsService Client

This client connects to the YTDLP Operations Server to generate tokens for YouTube videos.
The server performs SOCKS5 proxy connection testing with a 9-second timeout for early detection of proxy issues.
If a proxy connection fails, the server will immediately stop token generation and return an error instead of trying other clients.''')
    # Add global options
    parser.add_argument('--host', default=os.getenv('YTDLP_HOST', 'localhost'), help='Server host (default: localhost or YTDLP_HOST env)')
    parser.add_argument('--port', type=int, default=int(os.getenv('YTDLP_PORT', '9090')), help='Server port (default: 9090 or YTDLP_PORT env)')
    parser.add_argument('--timeout', type=int, default=30000, help='Timeout in milliseconds (default: 30000)')
    parser.add_argument('--timeout-sec', type=int, default=30, help='Timeout in seconds (default: 30, overrides --timeout if provided)')
    parser.add_argument('--context-dir', default='.', help='Context directory to save info.json (default: .)')
    parser.add_argument('--load-info-json', help='Path to existing info.json file to load')
    parser.add_argument('--framed-transport', action='store_true', help='Use TFramedTransport instead of TBufferedTransport for handling very large messages')
    parser.add_argument('--force-framed-transport', action='store_true', help='Force the use of TFramedTransport (recommended for large messages)')

    # Create subparsers for commands
    subparsers = parser.add_subparsers(dest='command', required=True, help='Commands')

    # getToken command
    get_token_parser = subparsers.add_parser('getToken', help='Get token for a YouTube URL', description='''Get token for a YouTube URL

This command connects to the server to generate tokens for a YouTube video.
The server will test any configured SOCKS5 proxy with a 9-second timeout.
If the proxy connection fails, token generation will stop immediately with an error.''')
    get_token_parser.add_argument('--url', required=True, help='YouTube URL to process')
    # --format removed, format/quality is determined by the server or embedded in the command
    get_token_parser.add_argument('--account_id', default='default', help='Account ID (default: default)')
    get_token_parser.add_argument('--list-formats', action='store_true', help='List available formats for the video')

    args = parser.parse_args()

    # Handle info.json loading: this mode never contacts the server; we return
    # here whether or not the load succeeded (load_info_json logs failures).
    if args.load_info_json:
        info_json = load_info_json(args.load_info_json)
        if info_json:
            print("Loaded info.json:")
            print(json.dumps(info_json, indent=2))
        return

    transport = None
    try:
        # Ensure context directory exists and is writable (probe with a temp file).
        try:
            Path(args.context_dir).mkdir(parents=True, exist_ok=True)
            test_file = Path(args.context_dir) / "test.txt"
            test_file.touch()
            test_file.unlink()
        except Exception as e:
            logger.error(f"Could not access context directory {args.context_dir}: {e}")
            print(f"Error: Could not access context directory {args.context_dir}")
            sys.exit(1)
        try:
            # Check if we should use framed transport for very large messages
            use_framed_transport = args.framed_transport or args.force_framed_transport or os.environ.get('USE_FRAMED_TRANSPORT', '').lower() in ('1', 'true', 'yes')
            logger.debug(f"Using framed transport: {use_framed_transport}")  # Changed to DEBUG
            # Create socket with configurable timeout, force IPv4
            socket = TSocket.TSocket(args.host, args.port, socket_family=2)  # AF_INET = 2 for IPv4
            # Use timeout-sec if provided, otherwise use timeout in milliseconds
            # (note: --timeout-sec has a default of 30, so this branch is always taken).
            if args.timeout_sec is not None:
                socket.setTimeout(args.timeout_sec * 1000)  # Convert seconds to milliseconds
                logger.debug(f"Using timeout of {args.timeout_sec} seconds")  # Changed to DEBUG
            else:
                socket.setTimeout(args.timeout)  # Use timeout from CLI in milliseconds
                logger.debug(f"Using timeout of {args.timeout} milliseconds")  # Changed to DEBUG
            # Always use TFramedTransport to match the server
            transport = TTransport.TFramedTransport(socket)
            logger.debug("Using TFramedTransport for large messages")  # Changed to DEBUG
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = YTTokenOpService.Client(protocol)
            logger.info(f"Attempting to connect to server at {args.host}:{args.port}...")
            try:
                transport.open()
                logger.info("Successfully connected to server")
            except TTransport.TTransportException as e:
                logger.error(f"Connection failed: {str(e)}")
                print(f"Error: Could not connect to server at {args.host}:{args.port}")
                print(f"Reason: {str(e)}")
                sys.exit(1)
            # Add connection test
            # NOTE(review): assumes the service exposes a ping() method — confirm against the Thrift IDL.
            try:
                client.ping()
                logger.info("Server connection test successful")
            except Exception as e:
                logger.error(f"Server connection test failed: {e}")
                raise
        except TTransport.TTransportException as e:
            logger.error(f"Connection failed: {str(e)}")
            logger.error(f"Could not connect to {args.host}:{args.port}")
            sys.exit(1)
        except Exception as e:
            logger.error(f"Connection failed: {str(e)}")
            logger.error(traceback.format_exc())
            sys.exit(1)

        if args.command == 'getToken':
            url = args.url
            # format_codes removed
            # Handle format listing
            if args.list_formats:
                formats = list_available_formats(url, args)
                if formats:
                    print("\nAvailable formats:")
                    print(tabulate(formats, headers="keys", showindex=True))  # Show index for format selection
                    # Suggest best formats based on resolution
                    best_formats = suggest_best_formats(formats)
                    if best_formats:
                        print("\nSuggested formats:")
                        print(tabulate(best_formats, headers="keys"))
                else:
                    print("No formats available or could not retrieve format information")
                return
        # NOTE(review): legacy branch — no --youtube-url/--format options are
        # defined, so reaching here would raise AttributeError; it appears
        # unreachable because the 'getToken' subcommand is required. TODO confirm.
        elif args.youtube_url:
            url = args.youtube_url
            format_code = args.format
            print("Warning: --youtube-url is deprecated, use 'getToken --url' instead")
        else:
            print("Please provide a YouTube URL using 'getToken --url' command")
            return

        # Get token for URL
        try:
            # Get token for URL
            logger.info(f"Requesting token for URL: {url}")
            token_data = client.getOrRefreshToken(
                accountId=args.account_id,
                updateType=TokenUpdateMode.AUTO,
                url=url
            )
            if not token_data:
                logger.error("Received empty token data from server")
                print("Error: Received empty token data from server")
                sys.exit(1)
            # Validate token data
            if not hasattr(token_data, 'ytdlpCommand') or not token_data.ytdlpCommand:
                logger.error("Token data missing required ytdlpCommand")
                print("Error: Token data missing required ytdlpCommand")
                sys.exit(1)
            logger.info("Successfully received token data from server")
            # Log all attributes of token_data for debugging
            token_attrs = [attr for attr in dir(token_data) if not attr.startswith('__') and not callable(getattr(token_data, attr))]
            logger.debug(f"Received token_data attributes: {token_attrs}")
            # Handle case where token_data is a dict-like object.
            # NOTE(review): received_info_json computed below is only used for
            # debug logging; the authoritative infoJson is re-read from
            # token_data further down. TODO confirm this duplication is intended.
            if hasattr(token_data, 'items'):
                # Convert to dict if needed
                token_dict = dict(token_data.items())
                logger.debug(f"Token data as dict: {token_dict}")
                # If we have JSON data directly in the response
                if isinstance(token_dict.get('infoJson', None), str):
                    received_info_json = token_dict['infoJson']
                elif isinstance(token_dict.get('data', None), (dict, str)):
                    # Try to use the data field if it exists
                    data = token_dict['data']
                    if isinstance(data, str):
                        received_info_json = data
                    else:
                        received_info_json = json.dumps(data)
                else:
                    # Create info.json from available fields
                    info_data = {
                        "id": token_dict.get('id', extract_video_id(url)),
                        "title": token_dict.get('title', ''),
                        "formats": token_dict.get('formats', []),
                        "timestamp": int(time.time()),
                        "ytdlp_command": token_dict.get('ytdlpCommand', '')
                    }
                    received_info_json = json.dumps(info_data)
            else:
                # Handle case where token_data is a regular object
                received_info_json = getattr(token_data, 'infoJson', None)
            if received_info_json:
                logger.debug(f"Received info.json data ({len(received_info_json)} bytes)")
                if len(received_info_json) > 100:
                    logger.debug(f"Preview: {received_info_json[:100]}...")
            else:
                logger.warning("No valid info.json data found in response")
        except PBServiceException as e:
            logger.error(f"Service exception: {e.message}")
            if hasattr(e, 'errorCode'):
                if e.errorCode == "BOT_DETECTED":
                    print(f"Error: {e.message}")
                    print("\nYouTube has detected bot activity. Authentication is required.")
                    # Print suggestions if available
                    if hasattr(e, 'context') and e.context and 'suggestions' in e.context:
                        print("\nSuggestions:")
                        for i, suggestion in enumerate(e.context['suggestions'], 1):
                            print(f" {i}. {suggestion}")
                    else:
                        print("\nTry:")
                        print(" 1. Use --cookies-from-browser to pass authentication cookies")
                        print(" 2. Export cookies from a logged-in browser session")
                        print(" 3. Try a different client type (ios, android, mweb)")
                        print(" 4. Use a different proxy or IP address")
                        print(" 5. Try again later")
                    sys.exit(1)
                elif e.errorCode in ["SOCKS5_CONNECTION_FAILED", "SOCKS5_TIMEOUT", "SOCKS5_CONNECTION_REFUSED", "SOCKS5_CONNECTION_TIMEOUT", "SOCKS5_HOST_NOT_FOUND", "SOCKS5_NETWORK_UNREACHABLE"]:
                    print(f"Error: {e.message}")
                    print("\nSOCKS5 proxy connection failed. Please check your proxy settings.")
                    # Provide more specific guidance based on error code
                    if e.errorCode == "SOCKS5_TIMEOUT" or e.errorCode == "SOCKS5_CONNECTION_TIMEOUT":
                        print("The proxy server did not respond within the timeout period (9 seconds).")
                        print("This could indicate network congestion or a proxy server that's overloaded.")
                    elif e.errorCode == "SOCKS5_CONNECTION_REFUSED":
                        print("The proxy server actively refused the connection.")
                        print("This usually means the proxy server is not running or is not accepting connections on the specified port.")
                    elif e.errorCode == "SOCKS5_HOST_NOT_FOUND":
                        print("The proxy host could not be resolved.")
                        print("Please check that the hostname is correct and your DNS is working properly.")
                    elif e.errorCode == "SOCKS5_NETWORK_UNREACHABLE":
                        print("The network containing the proxy server is unreachable.")
                        print("This could indicate network routing issues or firewall restrictions.")
                    print("\nPossible solutions:")
                    print("1. Try using a different proxy server")
                    print("2. Check if the proxy server is running and accessible")
                    print("3. Verify your network connection and firewall settings")
                    print("4. If using a remote proxy, check if it's accessible from your location")
                    # Exit with a specific error code for proxy failures
                    sys.exit(2)
                elif e.errorCode == "GLOBAL_TIMEOUT":
                    print(f"Error: {e.message}")
                    print("\nThe server timed out while processing your request.")
                    print("This could be due to:")
                    print("1. Slow network connection")
                    print("2. Server overload")
                    print("3. Complex video that takes too long to process")
                    print("\nTry again later or with a different video.")
                    sys.exit(3)
                elif e.errorCode == "CLIENT_TIMEOUT":
                    print(f"Error: {e.message}")
                    print("\nA client-specific timeout occurred while processing your request.")
                    print("The server has stopped processing to avoid wasting resources.")
                    print("\nPossible solutions:")
                    print("1. Try again later when network conditions improve")
                    print("2. Try a different video")
                    print("3. Check your internet connection")
                    sys.exit(3)
                else:
                    print(f"Error: {e.message}")
            else:
                print(f"Error: {e.message}")
            return
        except PBUserException as e:
            logger.error(f"User exception: {e.message}")
            print(f"Error: {e.message}")
            return
        except Exception as e:
            logger.error(f"Unexpected error: {str(e)}")
            logger.error(traceback.format_exc())
            print(f"Unexpected error: {str(e)}")
            sys.exit(1)

        # Log the entire token_data object for debugging AFTER potential exceptions
        logger.debug(f"Processing received token_data: {token_data}")
        # Check if valid infoJson was received from the server
        info_json = None
        if hasattr(token_data, 'infoJson') and token_data.infoJson and token_data.infoJson != "{}":
            if is_valid_json(token_data.infoJson):
                logger.debug("Valid info.json received from server.")  # Changed to DEBUG
                info_json = token_data.infoJson
            else:
                logger.warning("Received infoJson from server, but it is not valid JSON or is empty.")
        else:
            logger.warning("Valid info.json was NOT received from the server.")
        # Proceed only if we have valid info_json
        if info_json:
            # Save info.json if present in the server response
            video_id = extract_video_id(url)
            if not video_id:
                logger.warning(f"Could not extract video ID from URL: {url}")  # Keep as WARNING
                video_id = f"unknown_{int(time.time())}"
            try:
                info_data = json.loads(info_json)
                # Check if it contains an error
                if isinstance(info_data, dict) and ('error' in info_data or 'errorCode' in info_data):
                    error_msg = info_data.get('error', 'Unknown error')
                    error_code = info_data.get('errorCode', 'UNKNOWN_ERROR')
                    logger.warning(f"infoJson contains error: {error_msg} (code: {error_code})")
                    # If it's a bot detection error, raise appropriate exception.
                    # NOTE(review): this raise propagates out of main (only
                    # json.JSONDecodeError is caught here) — confirm intended.
                    if error_code == 'BOT_DETECTED' or 'bot' in error_msg.lower() or 'sign in' in error_msg.lower():
                        raise PBUserException(
                            message=f"Bot detection triggered: {error_msg}",
                            errorCode="BOT_DETECTION",
                            context={
                                "video_id": extract_video_id(url),
                                "url": url,
                                "suggestions": info_data.get('suggestions', ["Try different client", "Use proxy", "Wait and retry later"])
                            }
                        )
            except json.JSONDecodeError as e:
                # This case should ideally not happen due to is_valid_json check, but handle defensively
                logger.error(f"Invalid JSON received despite initial check: {e}")
                print(f"Error: Received invalid JSON data from server.")
                info_json = None  # Ensure we don't proceed
        # If info_json is still None after checks, handle the failure case
        if not info_json:
            logger.error("Failed to obtain valid info.json from the server.")
            print("Error: No valid video information (info.json) was received from the server.")
            # Optionally, print the raw ytdlp command if available
            if hasattr(token_data, 'ytdlpCommand') and token_data.ytdlpCommand:
                print("\nRaw command from server (may be incomplete or require info.json):")
                print(token_data.ytdlpCommand)
            sys.exit(1)  # Exit with error
        # --- We have valid info_json, proceed with saving and command generation ---
        try:
            info_data = json.loads(info_json)  # We know this is valid now
            # Check if it's an error response embedded in the JSON
            if isinstance(info_data, dict) and "error" in info_data:
                logger.error(f"Received error report from server: {info_json}")
                # Check if this is a bot detection error
                if (info_data.get('errorCode') == "BOT_DETECTED" or
                        "bot" in info_data.get('message', '').lower() or
                        "sign in to confirm" in info_data.get('message', '').lower() or
                        "sign in to confirm" in info_data.get('error', '').lower() or
                        "unusual traffic" in info_data.get('message', '').lower() or
                        "captcha" in info_data.get('message', '').lower() or
                        info_data.get('requires_auth') == True):
                    logger.error("Bot detection error detected in info.json")
                    # Raise PBServiceException for bot detection
                    raise PBServiceException(
                        message=f"Bot detection triggered: {info_data.get('message', 'Authentication required')}",
                        errorCode="BOT_DETECTED",
                        context={
                            "video_id": video_id,
                            "url": url,
                            "requires_auth": True,
                            "info_data": info_data,
                            "suggestions": info_data.get('suggestions', [
                                "Use --cookies-from-browser to pass authentication cookies",
                                "Export cookies from a logged-in browser session",
                                "Try a different client type (ios, android, mweb)",
                                "Use a different proxy or IP address"
                            ])
                        }
                    )
                else:
                    # Raise PBServiceException for other errors
                    raise PBServiceException(
                        message=f"Error extracting video info: {info_data.get('error', 'Unknown error')}",
                        errorCode=info_data.get('errorCode', "EXTRACTION_FAILED"),
                        context={"video_id": video_id, "url": url, "info_data": info_data}
                    )
            # If it's a valid response, process it
            if 'title' in info_data or 'id' in info_data:
                print(f"Video info retrieved: {info_data.get('title', 'Unknown title')}")
                saved_path = save_info_json(info_json, video_id, args.context_dir)
                if saved_path:
                    print(f"info.json saved to: {saved_path}")
                    # Create simpler base command using only the saved info.json and proxy
                    base_cmd = f"yt-dlp --load-info-json \"{saved_path}\""  # Quote the path
                    if hasattr(token_data, 'socks') and token_data.socks:
                        if token_data.socks.startswith(('socks5://', 'ss://')):
                            # Quote the proxy URL as well
                            base_cmd += f" --proxy \"{token_data.socks}\""
                    # Show format listing command
                    print("\nTo list available formats:")
                    format_cmd = f"{base_cmd} -F"
                    print(format_cmd)
                    # Show download command (format is usually embedded in info.json or determined by yt-dlp)
                    simplified_cmd = f"{base_cmd} --simulate"  # Removed format codes
                    print("\nTo download (with --simulate to preview):")
                    print(simplified_cmd)
                    print("\nRemove --simulate to actually download")
                else:
                    logger.error("Failed to save info.json file")
                    print("Failed to save info.json file")
            else:
                logger.warning("info.json appears to be valid JSON but missing expected video fields")
                print("Error: Received incomplete or invalid video data")
                print("This usually indicates an authentication or access issue")
                sys.exit(1)
        except Exception as e:
            # Catch errors during saving or command generation
            logger.error(f"Error processing valid info.json: {str(e)}")
            # Re-raise the exception to be handled by the main error handler
            raise
    finally:
        # Always close the Thrift transport, even on sys.exit / raised errors.
        if transport:
            transport.close()


if __name__ == "__main__":
    main()