#!/usr/bin/env python3 """ Tool to get info.json from the Thrift service. """ import argparse import json import os import re import sys import logging import codecs from datetime import datetime from typing import Dict, Any, Optional # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) # Note: The CLI entrypoint will configure the root logger. # We get our own logger here for namespacing. logger = logging.getLogger('get_info_tool') # Import Thrift modules # Add project's thrift gen_py path to allow importing 'pangramia' script_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.abspath(os.path.join(script_dir, '..')) sys.path.insert(0, os.path.join(project_root, 'thrift_model', 'gen_py')) from thrift.transport import TTransport from pangramia.yt.common.ttypes import TokenUpdateMode from pangramia.yt.exceptions.ttypes import PBServiceException, PBUserException from yt_ops_services.client_utils import get_thrift_client from ytops_client.request_params_help import REQUEST_PARAMS_HELP_STRING def get_video_id(url: str) -> str: """Extracts a YouTube video ID from a URL.""" # For URLs like https://www.youtube.com/watch?v=VIDEO_ID match = re.search(r"v=([0-9A-Za-z_-]{11})", url) if match: return match.group(1) # For URLs like https://youtu.be/VIDEO_ID match = re.search(r"youtu\.be\/([0-9A-Za-z_-]{11})", url) if match: return match.group(1) # For plain video IDs if re.fullmatch(r'[0-9A-Za-z_-]{11}', url): return url return "unknown_video_id" def parse_key_value_params(params_str: str) -> Dict[str, Any]: """Parses a comma-separated string of key=value pairs into a nested dict.""" params = {} if not params_str: return params for pair in params_str.split(','): if '=' not in pair: logger.warning(f"Skipping malformed parameter pair: {pair}") continue key, value_str = pair.split('=', 1) keys = key.strip().split('.') # Try to parse value as JSON primitive, otherwise treat as string try: # Don't parse if it's quoted, treat as string if (value_str.startswith('"') and value_str.endswith('"')) or \ (value_str.startswith("'") and value_str.endswith("'")): value = value_str[1:-1] else: value = json.loads(value_str) except json.JSONDecodeError: value = value_str d = params for k in keys[:-1]: if k not in d or not isinstance(d[k], dict): d[k] = {} d = d[k] d[keys[-1]] = value return params def add_get_info_parser(subparsers): """Add the parser for the 'get-info' command.""" parser = subparsers.add_parser( 'get-info', description='Get info.json from Thrift service', formatter_class=argparse.RawTextHelpFormatter, help='Get info.json from the Thrift service.' ) parser.add_argument('url', help='YouTube URL or video ID') parser.add_argument('--host', default='127.0.0.1', help="Thrift server host. Using 127.0.0.1 avoids harmless connection errors when the local Envoy proxy only listens on IPv4.") parser.add_argument('--port', type=int, default=9080, help='Thrift server port') parser.add_argument('--auth-host', help='Thrift server host (overrides --host).') parser.add_argument('--auth-port', type=int, help='Thrift server port (overrides --port).') parser.add_argument('--profile', default='default_profile', help='The profile name (accountId) to use for the request.') parser.add_argument('--client', help='''Specific client to use. Overrides server default. Available clients: web, web_safari, web_embedded, web_music, web_creator, mweb android, android_music, android_creator, android_vr ios, ios_music, ios_creator tv, tv_simply, tv_embedded Append "_camoufox" to any client name (e.g., "web_camoufox") to force the browser-based generation strategy.''') parser.add_argument('--output', help='Output file path for the info.json. If not provided, prints to stdout.') parser.add_argument('--output-auto', action='store_true', help='Automatically generate output filename for info.json and invocation data. Format: DATETIME-CLIENT-VIDEOID-info.json') parser.add_argument('--output-auto-url-only', action='store_true', help='Automatically generate output filename for info.json (format: VIDEOID-info.json) and also save a copy to latest-info.json.') parser.add_argument('--output-auto-suffix', help='Suffix to add to the filename before "-info.json" when using --output-auto or --output-auto-url-only. E.g., "-cycle1".') parser.add_argument('--log-file-auto', action='store_true', help='Automatically generate a log filename and save all script logs to it. Format: VIDEOID-DATETIME.log') parser.add_argument('--machine-id', help='Identifier for the client machine. Defaults to hostname.') parser.add_argument('--worker-id', help='Identifier for a worker process. Used for naming files with --save-latest.') parser.add_argument('--save-latest', action='store_true', help='Save a copy of the info.json to latest-info.json or [worker-id]-latest-info.json. This is implied by --output-auto-url-only.') parser.add_argument('--assigned-proxy-url', help='A specific proxy URL to use for the request, overriding the server\'s proxy pool logic.') parser.add_argument('--proxy-rename', help='Apply sed-style regex substitution to the assigned proxy URL. Format: s/pattern/replacement/') parser.add_argument('--print-proxy', action='store_true', help='Print the proxy used for the request to stderr.') parser.add_argument('--verbose', action='store_true', help='Enable verbose output') parser.add_argument('--log-return', action='store_true', help='Log the full summary of the thrift response to stderr, including detailed logs.\nThis is a convenience flag that implies --show-prefetch-log, --show-nodejs-log, and --show-ytdlp-log.') parser.add_argument('--show-prefetch-log', action='store_true', help='Print the curl pre-fetch log from the server response.') parser.add_argument('--show-nodejs-log', action='store_true', help='Print the Node.js debug log from the server response.') parser.add_argument('--show-ytdlp-log', action='store_true', help='Print the yt-dlp debug log from the server response.') parser.add_argument('--direct', action='store_true', help='Use the direct yt-dlp info.json generation method, bypassing Node.js token generation.') parser.add_argument('--print-info-out', action='store_true', help='Print the final info.json to stdout. By default, output is suppressed unless writing to a file.') parser.add_argument('--request-params-json', help=REQUEST_PARAMS_HELP_STRING + '\nCan also be a comma-separated string of key=value pairs (e.g., "caching_policy.mode=force_refresh").') parser.add_argument('--force-renew', help='Comma-separated list of items to force-renew: cookies, visitor_id, po_token, nsig_cache, info_json, all.') parser.add_argument('--lang', help='Language code for the request (e.g., "fr", "ja"). Affects metadata language.') parser.add_argument('--timezone', help='Timezone for the request (e.g., "UTC", "America/New_York"). Note: experimental, may not be fully supported.') return parser def main_get_info(args): """Main logic for the 'get-info' command.""" exit_code = 0 # Set log level if args.verbose: logging.getLogger().setLevel(logging.DEBUG) if args.log_file_auto: video_id = get_video_id(args.url) timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') log_filename = f"{video_id}-{timestamp}.log" # Get root logger to add file handler root_logger = logging.getLogger() file_handler = logging.FileHandler(log_filename) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') file_handler.setFormatter(formatter) root_logger.addHandler(file_handler) logger.info(f"Logging to file: {log_filename}") transport = None try: # Determine host and port, giving precedence to --auth-* args host = args.auth_host or args.host port = args.auth_port or args.port # Create Thrift client client, transport = get_thrift_client(host, port) # Get token data, which includes the info.json if args.direct: logger.info(f"Requesting info.json for URL '{args.url}' using DIRECT method.") if args.client: logger.info(f"Requesting to use specific client(s): {args.client}") else: logger.info("No specific client requested, server will let yt-dlp decide.") token_data = client.getInfoJsonDirect(url=args.url, clients=args.client) else: logger.info(f"Requesting info.json for URL '{args.url}' using profile '{args.profile}'") # Prepare arguments for the Thrift call machine_id = args.machine_id if not machine_id: import socket machine_id = socket.gethostname() logger.info(f"No machine ID provided, using hostname: {machine_id}") request_params = {} if args.request_params_json: try: request_params = json.loads(args.request_params_json) except json.JSONDecodeError: logger.info("Could not parse --request-params-json as JSON, trying as key-value string.") request_params = parse_key_value_params(args.request_params_json) if args.force_renew: items_to_renew = [item.strip() for item in args.force_renew.split(',')] request_params['force_renew'] = items_to_renew logger.info(f"Requesting force renew for: {items_to_renew}") if args.lang: session_params = request_params.setdefault('session_params', {}) session_params['lang'] = args.lang logger.info(f"Requesting language: {args.lang}") if args.timezone: session_params = request_params.setdefault('session_params', {}) session_params['timeZone'] = args.timezone logger.info(f"Requesting timezone: {args.timezone}") if args.verbose: # Add verbose flag for yt-dlp on the server ytdlp_params = request_params.setdefault('ytdlp_params', {}) ytdlp_params['verbose'] = True logger.info("Verbose mode enabled, requesting verbose yt-dlp logs from server.") thrift_args = { 'accountId': args.profile, 'updateType': TokenUpdateMode.AUTO, 'url': args.url, 'clients': args.client, 'machineId': machine_id, 'airflowLogContext': None, 'requestParamsJson': json.dumps(request_params) if request_params else None, 'assignedProxyUrl': args.assigned_proxy_url } # Handle proxy renaming assigned_proxy = args.assigned_proxy_url if assigned_proxy and args.proxy_rename: rename_rule = args.proxy_rename.strip("'\"") if rename_rule.startswith('s/') and rename_rule.count('/') >= 2: try: parts = rename_rule.split('/') pattern = parts[1] replacement = parts[2] original_proxy = assigned_proxy assigned_proxy = re.sub(pattern, replacement, assigned_proxy) logger.info(f"Renamed proxy URL from '{original_proxy}' to '{assigned_proxy}' using rule '{rename_rule}'") except re.error as e: logger.error(f"Invalid regex in --proxy-rename: {e}") return 1 except IndexError: logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/") return 1 else: logger.error("Invalid --proxy-rename format. Expected: s/pattern/replacement/") return 1 thrift_args['assignedProxyUrl'] = assigned_proxy if args.client: logger.info(f"Requesting to use specific client: {args.client}") else: logger.info("No specific client requested, server will use its default.") token_data = client.getOrRefreshToken(**thrift_args) if args.print_proxy: if hasattr(token_data, 'socks') and token_data.socks: print(f"Proxy used: {token_data.socks}", file=sys.stderr) else: print("Proxy information not available in response.", file=sys.stderr) if not token_data or not hasattr(token_data, 'infoJson') or not token_data.infoJson: logger.error("Server did not return valid info.json data.") if args.verbose: logger.debug(f"Received token_data from server: {token_data!r}") if not token_data: logger.error("Reason: The entire token_data object received from the server is null.") elif not hasattr(token_data, 'infoJson'): logger.error("Reason: The received token_data object does not have an 'infoJson' attribute.") elif not token_data.infoJson: logger.error("Reason: The 'infoJson' attribute in the received token_data object is empty or null.") print("Error: Server did not return valid info.json data.", file=sys.stderr) return 1 info_json_str = token_data.infoJson # On success, print summary info to stderr for visibility. # This provides immediate feedback without interfering with piped stdout. if hasattr(token_data, 'serverVersionInfo') and token_data.serverVersionInfo: # Filter out the default params line as requested filtered_info = '\n'.join( line for line in token_data.serverVersionInfo.split('\n') if 'Default yt-dlp CLI params:' not in line ) print(f"\n--- Server Version Info ---\n{filtered_info}", file=sys.stderr) info_json_str = token_data.infoJson info_data_for_analysis: Optional[Dict[str, Any]] = None try: info_data_for_analysis = json.loads(info_json_str) except (json.JSONDecodeError, TypeError): pass # Will be handled later if info_json is invalid if hasattr(token_data, 'requestSummary') and token_data.requestSummary: try: summary_data = json.loads(token_data.requestSummary) summary_text = summary_data.get('summary', token_data.requestSummary) # --- Client-side summary correction and enhancement --- gvs_pot_used = False if isinstance(info_data_for_analysis, dict): for f in info_data_for_analysis.get('formats', []): if 'pot=' in f.get('url', ''): gvs_pot_used = True break if gvs_pot_used and 'PO Token (GVS): not_fetched' in summary_text: summary_text = summary_text.replace( 'PO Token (GVS): not_fetched', 'PO Token (GVS): bgutil:http (verified from format URL)' ) if 'Visitor ID Source: omitted_for_tv_client' in summary_text: summary_text = summary_text.replace( 'Visitor ID Source: omitted_for_tv_client', 'Visitor ID Source: omitted_for_tv_client (handled internally by yt-dlp)' ) # Add a note that we cannot display it. summary_text += "\n - Visitor ID Value: Not exposed by server for TV clients to avoid detection." print(f"\n--- Request Summary ---\n{summary_text}", file=sys.stderr) except json.JSONDecodeError: # Fallback for old format or non-JSON summary print(f"\n--- Request Summary ---\n{token_data.requestSummary}", file=sys.stderr) # Print detailed logs only if explicitly requested if hasattr(token_data, 'requestSummary') and token_data.requestSummary: try: summary_data = json.loads(token_data.requestSummary) if args.show_prefetch_log or args.log_return: print("\n--- Prefetch Log ---", file=sys.stderr) print(summary_data.get('prefetch_log', 'Not available.'), file=sys.stderr) if args.show_nodejs_log or args.log_return: print("\n--- Node.js Log ---", file=sys.stderr) print(summary_data.get('nodejs_log', 'Not available.'), file=sys.stderr) if args.show_ytdlp_log or args.log_return: print("\n--- yt-dlp Log ---", file=sys.stderr) print(summary_data.get('ytdlp_log', 'Not available.'), file=sys.stderr) except json.JSONDecodeError: pass # Fallback already handled above if hasattr(token_data, 'communicationLogPaths') and token_data.communicationLogPaths: logger.info("--- Communication Log Paths ---") for log_path in token_data.communicationLogPaths: logger.info(f" - {log_path}") # Check if the returned info.json is an error report try: info_data = json.loads(info_json_str) if hasattr(token_data, 'socks') and token_data.socks: info_data['_proxy_url'] = token_data.socks if isinstance(info_data, dict) and 'error' in info_data: error_code = info_data.get('errorCode', 'N/A') error_message = info_data.get('message', info_data.get('error', 'Unknown error')) logger.error(f"Server returned an error in info.json (Code: {error_code}): {error_message}") print(f"Error from server (Code: {error_code}): {error_message}", file=sys.stderr) # Optionally print the full error JSON if args.verbose: print(json.dumps(info_data, indent=2), file=sys.stderr) exit_code = 1 except json.JSONDecodeError: logger.error(f"Failed to parse info.json from server: {info_json_str[:200]}...") print("Error: Failed to parse the info.json response from the server.", file=sys.stderr) return 1 logger.info(f"Successfully retrieved info.json ({len(info_json_str)} bytes)") # Save to latest-info.json if requested, or if using --output-auto-url-only for convenience if args.save_latest or args.output_auto_url_only: base_latest_filename = f"{args.worker_id}-latest" if args.worker_id else "latest" latest_info_filename = f"{base_latest_filename}-info.json" latest_proxy_filename = f"{base_latest_filename}-proxy.txt" try: with open(latest_info_filename, 'w', encoding='utf-8') as f: json.dump(info_data, f, indent=2) logger.info(f"Wrote info.json to {latest_info_filename}") print(f"Successfully saved info.json to {latest_info_filename}", file=sys.stderr) except IOError as e: logger.error(f"Failed to write to {latest_info_filename}: {e}") print(f"Error: Failed to write to {latest_info_filename}: {e}", file=sys.stderr) if hasattr(token_data, 'socks') and token_data.socks: try: with open(latest_proxy_filename, 'w', encoding='utf-8') as f: f.write(token_data.socks + '\n') logger.info(f"Wrote proxy to {latest_proxy_filename}") print(f"Successfully saved proxy to {latest_proxy_filename}", file=sys.stderr) except IOError as e: logger.error(f"Failed to write to {latest_proxy_filename}: {e}") print(f"Error: Failed to write to {latest_proxy_filename}: {e}", file=sys.stderr) # Determine output file path if auto-naming is used output_file = args.output if args.output_auto or args.output_auto_url_only: video_id = get_video_id(args.url) suffix = args.output_auto_suffix or "" if args.output_auto: timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') client_id = args.client or args.profile base_filename = f"{timestamp}-{client_id}-{video_id}{suffix}" output_file = f"{base_filename}-info.json" # Save invocation data invocation_filename = f"{base_filename}-invocation.json" invocation_data = {} for attr in ['ytdlpCommand', 'socks', 'jobId', 'url', 'requestSummary', 'communicationLogPaths']: if hasattr(token_data, attr): value = getattr(token_data, attr) if value: invocation_data[attr] = value if hasattr(token_data, 'cookiesBlob') and token_data.cookiesBlob: invocation_data['cookiesBlob'] = f"present, {len(token_data.cookiesBlob)} bytes" else: invocation_data['cookiesBlob'] = "not present" try: with open(invocation_filename, 'w', encoding='utf-8') as f: json.dump(invocation_data, f, indent=2) logger.info(f"Wrote invocation data to {invocation_filename}") except IOError as e: logger.error(f"Failed to write invocation data to {invocation_filename}: {e}") else: # args.output_auto_url_only output_file = f"{video_id}{suffix}-info.json" # Write to output file if specified if output_file: try: # Ensure the output directory exists before writing the file output_dir = os.path.dirname(output_file) if output_dir: os.makedirs(output_dir, exist_ok=True) with open(output_file, 'w', encoding='utf-8') as f: # Pretty-print the JSON to the file json.dump(info_data, f, indent=2) logger.info(f"Wrote info.json to {output_file}") # Print success message to stderr to not interfere with stdout piping print(f"Successfully saved info.json to {output_file}", file=sys.stderr) # If --output-auto, save invocation data if args.output_auto: pass # The latest-info.json logic is now handled by --save-latest except IOError as e: logger.error(f"Failed to write to output file {output_file}: {e}") print(f"Error: Failed to write to output file {output_file}: {e}", file=sys.stderr) return 1 # Print the JSON to stdout if requested, to allow for piping. if args.print_info_out: print(json.dumps(info_data, indent=2)) return exit_code except (PBServiceException, PBUserException) as e: # Check for non-fatal age-gate errors. These are expected for certain videos # and should not cause the entire stress test to fail. is_age_gate_error = hasattr(e, 'errorCode') and e.errorCode == 'AGE_GATED_SIGN_IN' if is_age_gate_error: logger.warning(f"Age-gated content detected for URL '{args.url}'. Treating as a non-fatal warning.") print(f"Warning: Age-gated content detected for '{args.url}'.", file=sys.stderr) # To avoid breaking downstream parsers, output a valid JSON error object. # This allows stress testers to see a 'success' (exit 0) but still know it was an age gate issue. error_json = { "error": "Age-gated content", "errorCode": "AGE_GATE", "message": "Sign in to confirm your age." } print(json.dumps(error_json, indent=2)) # We return success because this is not a system failure. return 0 # Format message for better readability, ensuring newlines are handled. message = str(e.message or '') try: # Attempt to decode as if it has escaped newlines (e.g., '\\n' -> '\n') message = codecs.decode(message, 'unicode_escape') except Exception: # Fallback for safety, though unicode_escape is robust message = message.replace('\\n', '\n') # For known user-facing errors, suppress the full traceback unless verbose is explicitly on. # The goal is to provide a clean error message for common issues. user_facing_errors = [ "BOT_DETECTED", "BOT_DETECTION_SIGN_IN_REQUIRED", "VIDEO_UNAVAILABLE", "PRIVATE_VIDEO", "VIDEO_REMOVED", "AGE_GATED_SIGN_IN", "MEMBERS_ONLY", "VIDEO_PROCESSING", "GEO_RESTRICTED" ] is_user_facing_error = hasattr(e, 'errorCode') and e.errorCode in user_facing_errors # Only show full traceback in verbose mode AND if it's NOT a common user-facing error. show_exc_info = args.verbose and not is_user_facing_error logger.error(f"A Thrift error occurred: {message}", exc_info=show_exc_info) print(f"\n--- ERROR ---", file=sys.stderr) print(f"{message}", file=sys.stderr) if hasattr(e, 'context') and e.context and (args.verbose or not is_user_facing_error): print(f"\n--- CONTEXT ---", file=sys.stderr) # The context is a dict from thrift. Pretty print it, handling newlines in values. if isinstance(e.context, dict): # Process each value to un-escape newlines for clean printing processed_context = {} for key, value in e.context.items(): try: processed_context[key] = codecs.decode(str(value), 'unicode_escape') except Exception: processed_context[key] = str(value).replace('\\n', '\n') print(json.dumps(processed_context, indent=2), file=sys.stderr) else: # Fallback for non-dict context print(str(e.context), file=sys.stderr) print("\n", file=sys.stderr) return 1 except TTransport.TTransportException as e: logger.error(f"Connection to server failed: {e}", exc_info=args.verbose) print(f"Error: Connection to server at {args.host}:{args.port} failed.", file=sys.stderr) return 1 except Exception as e: logger.exception(f"An unexpected error occurred: {e}") print(f"An unexpected error occurred: {e}", file=sys.stderr) return 1 finally: if transport and transport.isOpen(): transport.close() logger.info("Thrift connection closed.")